Commit
fix logging
markus583 committed Jan 2, 2024
1 parent 69497e1 commit 78e2a43
Showing 23 changed files with 86 additions and 43 deletions.
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_100k.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_1e-5.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
43 changes: 43 additions & 0 deletions configs/xlmr_stratify_0.1_3layers_400k_3e-4.json
@@ -0,0 +1,43 @@
+{
+  "model_name_or_path": "xlm-roberta-base",
+  "output_dir": "xlmr-normal-400k-3e-4",
+  "train_text_path": "data/sentence/train.parquet",
+  "valid_text_path": "data/sentence/valid.parquet",
+  "block_size": 512,
+  "use_bert": true,
+  "do_train": true,
+  "do_eval": true,
+  "evaluation_strategy": "steps",
+  "per_device_train_batch_size": 32,
+  "per_device_eval_batch_size": 32,
+  "gradient_accumulation_steps": 2,
+  "eval_accumulation_steps": 8,
+  "dataloader_num_workers": 4,
+  "preprocessing_num_workers": 32,
+  "learning_rate": 3e-4,
+  "save_strategy": "steps",
+  "fp16": false,
+  "max_steps": 400000,
+  "save_steps": 100000,
+  "eval_steps": 5000,
+  "logging_steps": 50,
+  "report_to": "wandb",
+  "is_decoder": false,
+  "remove_unused_columns": false,
+  "lookahead": null,
+  "one_sample_per_line": false,
+  "do_sentence_training": true,
+  "do_auxiliary_training": true,
+  "warmup_steps": 5000,
+  "adapter_warmup_steps": 0,
+  "adapter_lr_multiplier": 1,
+  "ngram_order": 1,
+  "non_punctuation_sample_ratio": 0.1,
+  "prediction_loss_only": true,
+  "use_auxiliary": true,
+  "ddp_timeout": 3600,
+  "use_subwords": true,
+  "num_hidden_layers": 3,
+  "custom_punctuation_file": "punctuation_xlmr_unk.txt",
+  "log_level": "warning"
+}
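This new config reuses the schema of the existing ones (400k steps at learning rate 3e-4). As a hedged sketch of how such a JSON config is typically consumed, assuming transformers' HfArgumentParser and the Args dataclass visible in the train.py diff below; the repo's actual entry point may differ:

import sys

from transformers import HfArgumentParser, TrainingArguments

from wtpsplit.train.train import Args  # dataclass declared in the train.py diff below

# Sketch: split one JSON config into the custom Args and HF TrainingArguments.
# parse_json_file() fills each dataclass from the matching top-level keys.
parser = HfArgumentParser([Args, TrainingArguments])
args, training_args = parser.parse_json_file(sys.argv[1])

print(args.model_name_or_path)  # "xlm-roberta-base"
print(training_args.log_level)  # "warning" after this commit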
4 changes: 2 additions & 2 deletions configs/xlmr_stratify_0.1_3layers_bs128.json
@@ -18,7 +18,7 @@
   "learning_rate": 1e-4,
   "save_strategy": "steps",
   "fp16": false,
-  "max_steps": 2000000,
+  "max_steps": 200000,
   "save_steps": 100000,
   "eval_steps": 5000,
   "logging_steps": 50,
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_bs128_no_aux.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_bs256.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_bs256_no_aux.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_bs64.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_bs64_no_aux_400k.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_highlr.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_no_aux.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_nounks.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_shorter.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_stride128_400k.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_3layers_stride32_400k.json
@@ -40,5 +40,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmr_stratify_0.1_6layers.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 6,
   "custom_punctuation_file": "punctuation_xlmr_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
2 changes: 1 addition & 1 deletion configs/xlmv_stratify_0.1_3layers.json
@@ -39,5 +39,5 @@
   "use_subwords": true,
   "num_hidden_layers": 3,
   "custom_punctuation_file": "punctuation_xlmv_unk.txt",
-  "log_level": "info"
+  "log_level": "warning"
 }
4 changes: 2 additions & 2 deletions wtpsplit/extract.py
@@ -9,7 +9,7 @@
 
 from wtpsplit.utils import Constants, hash_encode
 
-# logger = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class ORTWrapper:
@@ -224,7 +224,7 @@ def extract(
         )["logits"]
         if use_subwords:
             logits = logits[:, 1:-1, :]  # remove CLS and SEP tokens
-        # logger.debug(np.max(logits[0, :, 0]))
+        logger.debug(np.max(logits[0, :, 0]))
 
         for i in range(start, end):
             original_idx, start_char_idx, end_char_idx = locs[i]
3 changes: 3 additions & 0 deletions wtpsplit/train/evaluate.py
@@ -1,10 +1,13 @@
 import numpy as np
 import pysbd
 import sklearn.metrics
+import logging
 
 from wtpsplit.extract import extract, PyTorchWrapper
 from wtpsplit.utils import Constants
 
+logger = logging.getLogger(__name__)
+
 
 def compute_iou(a, b):
     return len(set(a) & set(b)) / len(set(a) | set(b))
24 changes: 11 additions & 13 deletions wtpsplit/train/train.py
@@ -65,8 +65,6 @@ def setup_logging(training_args: transformers.TrainingArguments) -> None:
     # logger.info(f"Training/evaluation parameters {training_args}")
 
 
-
-
 @dataclass
 class Args:
     model_name_or_path: str
@@ -303,7 +301,7 @@ def main():
         )
 
         if training_args.local_rank == 0:
-            logger.info(summary(model, depth=4))
+            logger.warning(summary(model, depth=4))
             # backbone.push_to_hub("markus583/xlm-token-untrained", private=True)
 
     def prepare_dataset(
@@ -315,7 +313,7 @@ def prepare_dataset(
         with training_args.main_process_first():
             dlconf = DownloadConfig(cache_dir="/home/Markus/.cache/huggingface/datasets")
             dataset = load_dataset("markus583/mC4-TEST", split=split, download_config=dlconf)
-            logger.info(f"Loaded {split} dataset.")
+            logger.warning(f"Loaded {split} dataset.")
             # optional: delete downloaded dataset, it is stored in cache_dir now (but we delete it later)
             # ~40GB on disk
             # os.system("rm -rf /home/Markus/.cache/huggingface/datasets")
@@ -327,11 +325,11 @@
                 lambda example: example["lang"] in include_languages,
                 num_proc=args.preprocessing_num_workers,
             )
-            logger.info(f"Filtered to {len(dataset)} examples.")
+            logger.warning(f"Filtered to {len(dataset)} examples.")
 
         if shuffle:
             dataset = dataset.shuffle(seed=42)
-            logger.info("Shuffled dataset.")
+            logger.warning("Shuffled dataset.")
 
         # very likely not relevant / used only for the compound part
        if args.ignore_non_hyphen:
@@ -523,14 +521,14 @@ def maybe_pad(text):
             # this is no longer used and would cause an error otherwise
             with training_args.main_process_first():
                 dataset = dataset.remove_columns([args.text_column])
-            logger.info(f"Tokenized {split} dataset.")
+            logger.warning(f"Tokenized {split} dataset.")
 
         if split == "train":
             with training_args.main_process_first():
                 for root, dirs, files in os.walk(os.environ.get("HF_DATASETS_CACHE")):
                     for file in files:
                         if file.startswith("m_c4-test-train"):
-                            logger.info(f"Removing {os.path.join(root, file)}")
+                            logger.warning(f"Removing {os.path.join(root, file)}")
                             os.remove(os.path.join(root, file))
 
         if not args.one_sample_per_line:
@@ -542,7 +540,7 @@ def maybe_pad(text):
                 # a bit hacky but oh well, only drop if sentence
                 remove_columns=["ends_with_punctuation"] if args.text_column == "text" else [],
             )
-            logger.info(f"Grouped {split} dataset.")
+            logger.warning(f"Grouped {split} dataset.")
 
         return dataset
 
@@ -552,15 +550,15 @@ def maybe_pad(text):
         shuffle=False,
         split="valid",
     )
-    logger.info(f"Valid dataset has {len(valid_dataset)} examples.")
+    logger.warning(f"Valid dataset has {len(valid_dataset)} examples.")
 
     train_dataset = prepare_dataset(
         num_workers=args.preprocessing_num_workers,
         include_languages=args.include_languages,
        shuffle=args.shuffle,
         split="train",
     )
-    logger.info(f"Train dataset has {len(train_dataset)} examples.")
+    logger.warning(f"Train dataset has {len(train_dataset)} examples.")
 
     # print some samples from the dataset
     count = 0
@@ -569,9 +567,9 @@ def maybe_pad(text):
         sample = train_dataset[index]
 
         if sample.get("lang") == "de":
-            logger.info(f"Sample {index} of the training set: {sample}.")
+            logger.warning(f"Sample {index} of the training set: {sample}.")
             if tokenizer:
-                logger.info(tokenizer.decode(sample["input_ids"]))
+                logger.warning(tokenizer.decode(sample["input_ids"]))
             count += 1
 
     eval_data = torch.load(
14 changes: 7 additions & 7 deletions wtpsplit/train/trainer.py
@@ -245,12 +245,12 @@ def evaluation_loop(
 
         batch_size = self.args.eval_batch_size
 
-        logger.info(f"***** Running {description} *****")
+        logger.warning(f"***** Running {description} *****")
         if has_length(dataloader):
-            logger.info(f"  Num examples = {self.num_examples(dataloader)}")
+            logger.warning(f"  Num examples = {self.num_examples(dataloader)}")
         else:
-            logger.info("  Num examples: Unknown")
-        logger.info(f"  Batch size = {batch_size}")
+            logger.warning("  Num examples: Unknown")
+        logger.warning(f"  Batch size = {batch_size}")
 
         model.eval()
 
@@ -415,10 +415,10 @@ def evaluation_loop(
             metrics=metrics,
             num_samples=num_samples,
         )
-
+
    def _save_tpu(self, output_dir: Optional[str] = None):
         output_dir = output_dir if output_dir is not None else self.args.output_dir
-        logger.info(f"Saving model checkpoint to {output_dir}")
+        logger.warning(f"Saving model checkpoint to {output_dir}")
 
         if xm.is_master_ordinal():
             os.makedirs(output_dir, exist_ok=True)
@@ -440,7 +440,7 @@ def _save_tpu(self, output_dir: Optional[str] = None):
                 save_function=xm.save,
             )
         else:
-            logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
+            logger.warning("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
             state_dict = actual_model.state_dict()
             xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
     else:
5 changes: 2 additions & 3 deletions wtpsplit/utils.py
@@ -102,7 +102,7 @@ def get_subword_label_dict(label_args, tokenizer):
 
     n_unks = 0
     # Map auxiliary characters to token IDs with labels
-    logger.info(f"Using {Constants.PUNCTUATION_CHARS} auxiliary characters.")
+    logger.warning(f"Using {Constants.PUNCTUATION_CHARS} auxiliary characters.")
     for i, c in enumerate(Constants.PUNCTUATION_CHARS):
         token_id = tokenizer.convert_tokens_to_ids(c)
         label_dict[token_id] = 1 + Constants.AUX_OFFSET + i
@@ -216,8 +216,7 @@ def corrupt(
             while (
                 last_index_in_block + 1 == len(block_ids)
                 or last_index_in_block < len(block_ids)
-                and block_ids[last_index_in_block + 1]
-                == block_ids[last_index_in_block]
+                and block_ids[last_index_in_block + 1] == block_ids[last_index_in_block]
             ):
                 last_index_in_block += 1
                 input_ids.insert(last_index_in_block, 0)
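Across these files the pattern is uniform: the configs raise log_level to warning, and messages that must remain visible move from logger.info to logger.warning. A minimal sketch of the standard-library behavior this relies on (plain Python logging; the logger name and messages below are illustrative only):

import logging

# A logger only emits records at or above its effective level, so at
# WARNING, info() and debug() calls are silently dropped.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("wtpsplit.train.train")  # illustrative name

logger.info("Tokenized train dataset.")     # suppressed at level WARNING
logger.warning("Tokenized train dataset.")  # still printed
logger.debug(3.14)                          # suppressed as well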
