forked from bfsujason/bertalign

process_en_text.py
import nltk  # sent_tokenize requires the 'punkt' tokenizer data (nltk.download('punkt'))
from difflib import SequenceMatcher

from helper import cat_by_lineno
from helper import match_lineno_seg
from helper import PAGINATION_TOKEN

# GLOBAL CONSTANTS
# INDEX_TOKEN = '...'

def score_by_nltk(prevline: str, nextline: str) -> int:
    # Bring in NLTK's verdict. Very long inputs hurt performance badly, so
    # cap the previous line at its last 100 characters.
    score = 0
    nextline2Bjoined = nextline[:100]
    joined = prevline[-100:] + ' ' + nextline2Bjoined
    tokenized_by_nltk = nltk.sent_tokenize(joined)
    if len(tokenized_by_nltk) == 1:
        score += 200
    elif len(tokenized_by_nltk) >= 2:
        # Scan the tokenized sentences for the one most similar to the head
        # of the next line.
        maxratio = 0
        for token in reversed(tokenized_by_nltk):
            sm = SequenceMatcher(lambda x: x == ' ', token, nextline2Bjoined, autojunk=True)
            # Cheap upper bounds first: skip tokens that cannot beat maxratio.
            if sm.real_quick_ratio() < maxratio or sm.quick_ratio() < maxratio:
                continue
            maxratio = max(maxratio, sm.ratio())
        # Map the ratio to a penalty: 0.6 -> 0, 0.9 -> 200 (i.e. * 200 / 0.3).
        score -= (maxratio - 0.6) * 666.7
    # s1, s2 = tokenized_by_nltk
    # if s1 == prevline and s2 == nextline:
    #     score -= 200
    # if is_likely(s1, outputs[-1]) and is_likely(s2, nextline):
    #     score -= 200
    return score
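
# Example of the rule above: sent_tokenize("Everything seemed to be alright.")
# returns one sentence (+200, favouring a join), while "night. Cause you gave"
# splits into two, so the SequenceMatcher penalty applies (here "Cause you gave"
# matches the next line exactly, ratio 1.0 -> roughly -267) and the break is kept.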

def score_simple(prevline: str, nextline: str) -> int:
    score = 0  # positive: drop the line break; negative: keep it
    if prevline[-1] in ('.', '?', '!', ';'):  # sentence-final punctuation
        score -= 44
    if prevline[-1] == ',':
        score += 81
    score += min(60, len(prevline)) - 32  # line length
    if nextline[0].islower():  # next line starts lowercase
        score += 83
    return score
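
# A worked example of the heuristic above (before the NLTK term is added):
# score_simple("It was a dark and stormy night,", "so we stayed in.")
#   trailing ','                 -> +81
#   length 31: min(60, 31) - 32  -> -1
#   lowercase "so"               -> +83
#   total +163 > 0, i.e. the line break would be removed.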

def score_special(prevline: str, nextline: str) -> int:
    INF = 998244353
    if (not nextline) or (not prevline):  # if either line is empty, join them
        return INF
    if match_lineno_seg(nextline):  # avoid conflicting with the cat_by_lineno rule
        return -INF
    return 0

def extract_sentences_from_single_file(filetext: list[str]) -> str:
    """
    Restore sentences that were accidentally broken across line breaks within
    a single file, and filter out redundant information introduced by pagination.

    The returned string is the whole file's text with pagination noise removed.
    To keep the rules accurate, every line of the input should be stripped of
    leading/trailing whitespace beforehand.

    Args:
        filetext (list[str]): the file's text, split into per-page strings

    Returns:
        str: the text cleaned up as described above

    Example:
        >>> extract_sentences_from_single_file(["Everything seemed to be\nalright.", "Cause you gave\nme whispers of\nlove all night."])
        "Everything seemed to be alright.\nCause you gave me whispers of love all night."
    """
    # Empirically, at least three factors decide whether the newline at the
    # end of a line can be removed:
    # 1. whether the next line starts with a lowercase letter
    # 2. whether this line ends with a full stop
    # 3. whether this line is roughly 50 characters long
    flatten: list[str] = cat_by_lineno(filetext)
    outputs = [flatten[0]]
    for lineid, nextline in enumerate(flatten[1:]):
        # prevline = outputs[-1]
        prevline = flatten[lineid]
        # The special cases must run before the general rules (otherwise
        # score_simple would index into an empty line and raise at runtime).
        score = score_special(prevline, nextline)
        if score == 0:
            score += score_simple(prevline, nextline)  # TODO: unit tests
            score += score_by_nltk(prevline, nextline)
        if score > 0:
            outputs[-1] += ' ' + nextline
        else:
            outputs.append(nextline)
    output = '\n'.join(outputs)
    return output

def start(text):
    return extract_sentences_from_single_file(text.split(PAGINATION_TOKEN))
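
if __name__ == '__main__':
    # Minimal usage sketch; the sample text is made up. Requires the helper
    # module from this repo and the NLTK 'punkt' data to be installed.
    raw = ("Everything seemed to be\nalright."
           + PAGINATION_TOKEN
           + "Cause you gave\nme whispers of\nlove all night.")
    print(start(raw))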