-
Notifications
You must be signed in to change notification settings - Fork 5
/
tools.py
87 lines (72 loc) · 2.86 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
import pywikibot
from pywikibot.tools.chars import url2string
# Pattern that matches an entire string from its very start (\A) to its very
# end (\Z); [\s\S] is used so that newlines are matched as well.
FULL_ARTICLE_REGEX = r'\A[\s\S]*\Z'
class FileRegexHolder:
    """Build and cache a regex recognising image-link option keywords.

    The regex is assembled from the site's ``img_*`` magic words, so that a
    parameter of a file link can be tested (with ``fullmatch``) for being
    an option such as an alignment keyword or a width, rather than free
    text.
    """

    # Most recently compiled regex. Kept so that code reading this
    # attribute directly keeps working; lookups go through ``_cache``.
    replaceR = None
    # Substituted for ``$1`` in magic words that take a decimal number.
    FLOAT_PATTERN = r'\d+(?:\.\d+)?'
    # Compiled regexes keyed by site: magic words are site-specific, so a
    # single shared regex would be wrong for every site but the first.
    # NOTE(review): assumes site objects are hashable - confirm for any
    # non-pywikibot site stand-ins.
    _cache = {}

    @staticmethod
    def _parametrized(magicword, value_pattern):
        """Return a regex for *magicword* with ``$1`` as *value_pattern*.

        The text surrounding the ``$1`` placeholder is escaped so that it
        is matched literally.
        """
        literal_parts = map(re.escape, magicword.split('$1'))
        return value_pattern.join(literal_parts)

    @classmethod
    def get_regex(cls, site):
        """Return the compiled option regex for *site*, building it once.

        :param site: object providing ``getmagicwords(name)``, which
            returns the synonyms of the given magic word
        :return: compiled regular expression; an alternation of all
            keyword options and of the parametrized options
            (``img_manualthumb``, ``img_upright``, ``img_width``)
        """
        regex = cls._cache.get(site)
        if regex is not None:
            return regex

        keyword_options = (
            'img_baseline', 'img_border', 'img_bottom', 'img_center',
            'img_class', 'img_framed', 'img_frameless', 'img_left',
            'img_middle', 'img_none', 'img_right', 'img_sub',
            'img_super', 'img_text_bottom', 'img_text_top',
            'img_thumbnail', 'img_top',
        )
        # (magic word name, pattern standing in for its ``$1`` value)
        valued_options = (
            ('img_manualthumb', cls.FLOAT_PATTERN),
            ('img_upright', cls.FLOAT_PATTERN),
            ('img_width', r'\d+'),
        )

        alternatives = []
        for name in keyword_options:
            alternatives.extend(map(re.escape, site.getmagicwords(name)))
        for name, value_pattern in valued_options:
            for magicword in site.getmagicwords(name):
                alternatives.append(
                    cls._parametrized(magicword, value_pattern))

        regex = re.compile('|'.join(alternatives))
        cls._cache[site] = regex
        cls.replaceR = regex
        return regex
def deduplicate(arg):
    """Remove repeated members from the list *arg* in place.

    The first occurrence of every member is kept and the order of the
    remaining members is unchanged. Members are compared by equality, so
    they do not have to be hashable. Nothing is returned.
    """
    # todo: merge with filter_unique?
    for search_from, current in enumerate(arg, start=1):
        # Delete every later occurrence of the current member; the list
        # only shrinks behind the cursor, so the enumeration stays valid.
        while True:
            try:
                duplicate_at = arg.index(current, search_from)
            except ValueError:
                break
            del arg[duplicate_at]
def parse_image(text, site):
    """Split a wikitext file link into the file name and its caption.

    :param text: wikitext expected to consist of a single file link, e.g.
        ``[[File:Example.jpg|thumb|Some caption]]``; the last two
        non-whitespace characters are assumed to be the closing ``]]``
    :param site: site object providing ``namespaces`` (index 6 is used
        for the names of the file namespace) and the magic words needed
        by ``FileRegexHolder.get_regex``
    :return: tuple ``(image, caption)``. Both are ``None`` when *text*
        does not start with a link into the file namespace. *caption* is
        ``None`` when every parameter of the link is an image option.
    """
    # TODO: merge with .migrate_infobox.InfoboxMigratingBot.handle_image
    image = caption = None
    # Namespace names are matched literally, hence the escaping.
    namespace_names = '|'.join(map(re.escape, site.namespaces[6]))
    imgR = re.compile(r'\[\[\s*(?:%s) *:' % namespace_names, flags=re.I)
    if not imgR.match(text):
        return image, caption

    # Drop the closing "]]" and separate the target from its parameters.
    split = text.rstrip()[:-2].split('|')
    matchR = FileRegexHolder.get_regex(site)
    # Walk the parameters from the end; the last one that is not an image
    # option is taken as the caption. split[0] (the target) is never popped.
    while split[1:]:
        tmp = split.pop().strip()
        if not matchR.fullmatch(tmp):
            caption = tmp
            break

    if caption:
        # A caption containing piped links was cut apart by the split on
        # "|"; glue preceding pieces back until the brackets balance.
        # Stop before consuming split[0] so that an unbalanced caption
        # cannot swallow the link target and raise IndexError.
        while split[1:] and caption.count('[') != caption.count(']'):
            caption = split.pop() + '|' + caption
        caption = caption.rstrip('.').strip()

    # Everything after the first colon of the target is the file name.
    image = split[0].partition(':')[2].rstrip(']')
    image = url2string(image)
    # Treat underscores as spaces and collapse runs of them.
    image = re.sub('[ _]+', ' ', image).strip()
    return image, caption
def get_best_statements(statements):
    """Return the statements of the best rank present in *statements*.

    :param statements: iterable of objects with a ``rank`` attribute
    :return: new list with all statements ranked ``'preferred'`` if there
        is at least one, otherwise all statements ranked ``'normal'``;
        the original order is kept and other ranks are ignored
    """
    preferred = []
    normal = []
    # Single pass, so one-shot iterables are supported as well.
    for statement in statements:
        rank = statement.rank
        if rank == 'preferred':
            preferred.append(statement)
        elif rank == 'normal':
            normal.append(statement)
    return preferred or normal
def iter_all_snaks(data):
    """Iterate over every claim in *data* together with its nested snaks.

    :param data: mapping whose values are iterables of claims; each claim
        provides ``qualifiers`` (a mapping to iterables of snaks) and
        ``sources`` (an iterable of such mappings)
    :return: generator yielding, for each claim in turn, the claim
        itself, then its qualifier snaks, then the snaks of its sources
    """
    for claim_group in data.values():
        for claim in claim_group:
            yield claim
            for qualifier_snaks in claim.qualifiers.values():
                for snak in qualifier_snaks:
                    yield snak
            for reference in claim.sources:
                for reference_snaks in reference.values():
                    for snak in reference_snaks:
                        yield snak