-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_wura_tokenizer.py
129 lines (102 loc) · 3.24 KB
/
train_wura_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Script for training language-dedicated tokenizers on
language subsets of WURA.
Written by Andrzej Szablewski, 2024 as a part of the UCL Final Year Project
"Language Model Adaptation for Low-Resource African Languages".
"""
import argparse
import multiprocessing
import os
import logging
from transformers import AutoTokenizer
from datasets import concatenate_datasets
from tokenization import (
prepare_wura_dataset,
batch_sample_generator,
train_new_tokenizer,
)
def parse_args(argv=None) -> argparse.Namespace:
    """Parse and validate the command-line arguments of this script.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        Namespace with the attributes ``langs``, ``old_tokenizer``,
        ``output_dir``, ``sample``, ``sample_size`` and ``token_count``.

    Raises:
        SystemExit: If the arguments cannot be parsed, if ``--langs``
            contains an empty entry, if ``--sample`` is given without
            ``--sample_size``, or if ``--sample_size`` / ``--token_count``
            is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description="Train a tokenizer for a chosen language(s) from wura dataset."
    )
    parser.add_argument(
        "--langs",
        type=str,
        required=True,
        help="A comma-separated list of languages from wura dataset to train a tokenizer in.",
    )
    parser.add_argument(
        "--old_tokenizer",
        type=str,
        required=True,
        help="Name of a tokenizer to use a base.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Path to a directory where to store the trained tokenizer.",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        help="Train on a sample of the dataset instead of the full data. Requires --sample_size.",
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        help="Sample size.",
    )
    parser.add_argument(
        "--token_count",
        type=int,
        default=None,
        help="Number of tokens in a new tokenizer. Do not provide to keep the size of the original GPT2 tokenizer.",
    )
    args = parser.parse_args(argv)

    # Sanity checks. parser.error() prints the usage message and exits with
    # status 2, the same way argparse reports its own parsing failures.
    if not all(args.langs.split(",")):
        parser.error("--langs must not contain empty entries.")
    # Without a size the output directory would be named "None-...".
    if args.sample and args.sample_size is None:
        parser.error("--sample requires --sample_size to be provided.")
    for option in ("sample_size", "token_count"):
        value = getattr(args, option)
        if value is not None and value <= 0:
            parser.error(f"--{option} must be a positive integer, got {value}.")
    return args
def _print_tokenization(header, tokenizer, prompt, end="\n"):
    """Print ``header``, the decoded tokens of ``prompt`` and their ids."""
    # Encode once and reuse the ids for both printed lines.
    token_ids = tokenizer(prompt)["input_ids"]
    pieces = [tokenizer.decode([token_id]) for token_id in token_ids]
    print(header, pieces, token_ids, sep="\n", end=end)


def main():
    """Train a tokenizer on the selected WURA languages and save it.

    Steps, in order:
      1. Parse the command-line arguments.
      2. Call ``prepare_wura_dataset`` on ``./data/wura`` for the requested
         languages, then concatenate the ``"train"`` split of every returned
         entry and shuffle the result with ``shuffle(42)``.
      3. Load the base tokenizer and train a new one from it on batches
         produced by ``batch_sample_generator``.
      4. Print how the old and the new tokenizer split an example prompt.
      5. Save the new tokenizer under
         ``<output_dir>/<sample_size|'full'>-<langs>-<old_tokenizer>``.
    """
    args = parse_args()
    lang_list = args.langs.split(",")

    ds_dict = prepare_wura_dataset(
        "./data/wura",
        lang_list,
        sample=args.sample,
        lang_sample_size=args.sample_size,
        combine_headline_content=True,
        num_proc=multiprocessing.cpu_count(),
    )
    train_splits = [lang_ds["train"] for lang_ds in ds_dict.values()]
    ds = concatenate_datasets(train_splits).shuffle(42)

    old_tokenizer = AutoTokenizer.from_pretrained(args.old_tokenizer)
    logging.info(
        "Using %s and split of wura dataset in %s.",
        old_tokenizer.__class__.__name__,
        args.langs,
    )

    training_ds = batch_sample_generator(ds)
    new_tokenizer = train_new_tokenizer(
        old_tokenizer, training_ds, new_token_count=args.token_count
    )

    prompt = "Hello World! This is an example of tokenization."
    print("--- Example Tokenization ---")
    _print_tokenization(
        f"Old tokenizer {args.old_tokenizer}:", old_tokenizer, prompt, end="\n\n"
    )
    _print_tokenization(f"New tokenizer for {args.langs}:", new_tokenizer, prompt)

    # NOTE(review): an --old_tokenizer value containing "/" (e.g. "org/name")
    # makes os.path.join produce a nested directory - confirm this is intended.
    size_tag = args.sample_size if args.sample else "full"
    new_tokenizer.save_pretrained(
        os.path.join(
            args.output_dir,
            f"{size_tag}-{args.langs}-{args.old_tokenizer}",
        )
    )
if __name__ == "__main__":
    # Run only when executed as a script. Configure the root logger first so
    # that INFO-level messages emitted during the run are actually shown.
    logging.basicConfig(level=logging.INFO)
    main()