-
Notifications
You must be signed in to change notification settings - Fork 0
/
tatoeba_prepare.py
49 lines (43 loc) · 1.37 KB
/
tatoeba_prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import argparse
import re
import string

from tqdm.auto import tqdm
# ASCII letters and digits; a sentence must contain at least one of these.
alphabets = string.ascii_letters + string.digits

# Compiled once at import time so the per-line filter does not rebuild them.
_PUNCTUATION_RE = re.compile(r"[,?.]")
_ALPHANUMERIC_RE = re.compile(f"[{alphabets}]")

# Characters a kept sentence is allowed to end with.
# NOTE(review): "|" is accepted as a terminator; presumably it stands in for a
# danda-style full stop in some corpora -- confirm against the data.
_SENTENCE_ENDINGS = (".", "?", "|", "!")


def filter_line(line: str) -> bool:
    """Return True when *line* should be discarded, False when it is kept.

    A line is discarded if any of the following holds:

    * it is empty;
    * it contains an ellipsis (``...``);
    * it is entirely upper-case or entirely lower-case (this also covers
      text with no cased characters at all);
    * it contains none of ``,``, ``?`` or ``.``;
    * it contains no ASCII letter or digit;
    * it does not end with one of ``.``, ``?``, ``|``, ``!``;
    * its first character is not upper-case.

    NOTE(review): because of the ``,?.`` rule, a sentence ending in ``!`` or
    ``|`` is only kept when it also contains a comma, question mark or period
    somewhere. That matches the original rule set; confirm it is intended.

    Args:
        line: A single, already-stripped sentence.

    Returns:
        True to drop the line, False to keep it.
    """
    # Handle the empty string first: indexing line[0] / line[-1] below would
    # raise IndexError, which used to be swallowed and turned into "keep".
    if not line:
        return True

    # Every operand is a plain bool, so the short-circuit chain yields a bool.
    return (
        "..." in line
        or line.upper() == line
        or line.lower() == line
        or _PUNCTUATION_RE.search(line) is None
        or _ALPHANUMERIC_RE.search(line) is None
        or not line.endswith(_SENTENCE_ENDINGS)
        or not line[0].isupper()
    )
def parse_arguments():
    """Parse the command-line options of the Tatoeba preparation script.

    Options:
        --tsv_fn: Path of the input TSV file. Required: without it there is
            nothing to read, and omitting it previously surfaced only later
            as a ``TypeError`` from ``open(None)``.
        --op_txt_fn: Path of the output text file. Defaults to the literal
            file name ``"none"``.

    Returns:
        argparse.Namespace: with string attributes ``tsv_fn`` and
        ``op_txt_fn``.

    Raises:
        SystemExit: raised by argparse on invalid or missing arguments.
    """
    parser = argparse.ArgumentParser(description='Prepare Tatoeba')
    parser.add_argument('--tsv_fn', type=str, required=True, help='TSV Data')
    parser.add_argument('--op_txt_fn', default="none", type=str, help='Output file name')
    return parser.parse_args()
# Script body: read the TSV, keep the sentences that pass filter_line,
# drop duplicates, and write one sentence per line to the output file.
args = parse_arguments()
tsv_fn = args.tsv_fn

sent = []
# Pin the encoding instead of relying on the platform default, which is not
# UTF-8 everywhere (assumes the TSV is UTF-8 encoded -- confirm for your data).
with open(tsv_fn, 'r', encoding="utf-8") as f:
    data = f.read().split("\n")

for line in tqdm(data, desc="reading data"):
    # The sentence text is taken from the last tab-separated column.
    line = line.split("\t")[-1].strip()
    if not filter_line(line):
        sent.append(line)

# Number of sentences that survived filtering, before de-duplication.
orig_len = len(sent)
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# output is identical from run to run (a set's order varies with string
# hash randomisation).
sent = list(dict.fromkeys(sent))
final_len = len(sent)
print(f"Original data length: {orig_len:,} - After removing duplicates: {final_len:,}")

with open(args.op_txt_fn, 'w', encoding="utf-8") as f:
    f.write("\n".join(sent))