import_descriptions.py
#!/usr/bin/python
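"""Import descriptions to Wikidata items from list entries on a wiki.

Illustrative invocations (assumed examples; -lang/-family/-cat are standard
pywikibot and page-generator arguments, -min_words and -allpages are handled
in main() below):

    python import_descriptions.py -lang:cs -family:wikipedia -min_words:3
    python import_descriptions.py -lang:cs -family:wikipedia -cat:"Example" -allpages
"""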
import re
import pywikibot
from pywikibot import textlib
from pywikibot.pagegenerators import (
GeneratorFactory,
PreloadingEntityGenerator,
PreloadingGenerator,
SearchPageGenerator,
WikidataSPARQLPageGenerator,
)
from query_store import QueryStore
from wikidata import WikidataEntityBot

# Wikilink pattern used by handle_link() below (assumed definition: the named
# group 'title' holds the link target, group 2 the optional "|label" part).
LINK_REGEX = re.compile(r'\[\[(?P<title>[^\]|[<>{}]*)(\|.*?)?\]\]')
class BaseDescriptionBot(WikidataEntityBot):
def __init__(self, **kwargs):
self.available_options.update({
'min_words': 2,
})
super().__init__(**kwargs)
self.FORMATTING_REGEX = re.compile("'{5}|'{2,3}")
self.REF_REGEX = re.compile(r'<ref.*?(>.*?</ref|/)>')
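    # Build a regex for one bulleted list entry, e.g. "* [[Title|label]] (note) - text":
    # group 1 captures the link target, group 2 the description text after the
    # comma or dash.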
def get_regex_for_title(self, escaped_title):
pattern = fr'^\*+ *\[\[({escaped_title})(?:\|[^][]+)?\]\]'
pattern += r' *(?:\([^)]+\))?'
pattern += '(?:,| [-–]) *(.*)$'
return re.compile(pattern, re.M)
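    # Substitution callback for LINK_REGEX: keep a link's visible label, or
    # fall back to the link target when no "|label" part is present.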
@staticmethod
def handle_link(match):
text = match[2]
if text:
return text.lstrip('|').strip()
else:
return match['title'].strip()
def validate_description(self, desc):
return (bool(desc) and len(desc.split()) >= self.opt['min_words'])
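    # Reduce raw wikitext to a plain-text description: strip comments, files,
    # templates, refs and quote formatting, resolve links to their labels, then
    # trim trailing parentheticals, leading articles and dangling punctuation.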
def parse_description(self, text):
desc = textlib.removeDisabledParts(
text,
['comment', 'file', 'nowiki', 'template', self.FORMATTING_REGEX,
self.REF_REGEX])
desc = LINK_REGEX.sub(self.handle_link, desc)
        desc = desc.replace('\xa0', ' ').strip()  # normalize non-breaking spaces
desc = re.sub(r' *\([^)]+\)$', '', desc)
desc = desc.partition(';')[0]
desc = re.sub(r'^.*\) [-–] +', '', desc)
desc = re.sub(r'^\([^)]+\) +', '', desc)
while ' ' * 2 in desc:
desc = desc.replace(' ' * 2, ' ')
if re.search(r'[^IVX]\.$', desc) or desc.endswith(tuple(',:')):
desc = desc[:-1].rstrip()
if desc.startswith(('a ', 'an ')):
desc = desc.partition(' ')[2]
return desc
def get_summary(self, page, desc):
link = page.title(as_link=True, insite=self.repo)
return f'importing [{page.site.lang}] description "{desc}" from {link}'
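# Fills in descriptions for items that have a sitelink to this wiki but no
# description in its language (selected by the 'missing_descriptions' SPARQL
# query), harvesting text from list entries on pages linking to the article.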
class MissingDescriptionBot(BaseDescriptionBot):
use_from_page = False
def __init__(self, **kwargs):
self.available_options.update({
'allpages': False,
})
super().__init__(**kwargs)
self.store = QueryStore()
@property
def generator(self):
query = self.store.build_query(
'missing_descriptions',
hostname=self.site.hostname(),
lang=self.site.lang)
return PreloadingEntityGenerator(
WikidataSPARQLPageGenerator(query, site=self.repo))
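    # Search pages that link to the item's article for a matching list entry;
    # unless the 'allpages' option is set, only disambiguation pages are used
    # as a source. A successfully saved description ends the search.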
def treat_page_and_item(self, page, item):
if self.site.lang in item.descriptions:
return
title = item.getSitelink(self.site)
link_start = re.escape('[[' + title)
search_query = fr'linksto:"{title}" insource:/\* *{link_start}/'
regex = self.get_regex_for_title(re.escape(title))
for ref_page in PreloadingGenerator(
SearchPageGenerator(search_query, namespaces=[0])):
# todo: first polish text
match = regex.search(ref_page.text)
if not match:
continue
if not self.opt['allpages'] and not ref_page.isDisambig():
continue
desc = self.parse_description(match[2])
if not self.validate_description(desc):
continue
summary = self.get_summary(ref_page, desc)
item.descriptions[self.site.lang] = desc.strip()
if self.user_edit_entity(item, summary=summary):
break
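# Works from an ordinary page generator: each supplied page is scanned for
# "* [[Title]] - description" entries, and every linked item still lacking a
# description in this wiki's language gets the parsed text.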
class MappingDescriptionBot(BaseDescriptionBot):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.regex = self.get_regex_for_title(r'[^\[\|\]]+')
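    # Map each page linked from a list entry in the text to the description
    # parsed from that entry.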
def get_pages_with_descriptions(self, text):
data = {}
for match in self.regex.finditer(text):
title, desc = match.groups()
page = pywikibot.Page(self.site, title)
data[page] = self.parse_description(desc)
return data
def treat_page(self):
page = self.current_page
descriptions = self.get_pages_with_descriptions(page.text)
for item in PreloadingEntityGenerator(descriptions.keys()):
if self.site.lang in item.descriptions:
continue
target = pywikibot.Page(item.sitelinks[self.site])
desc = descriptions.get(target)
if not self.validate_description(desc):
continue
summary = self.get_summary(page, desc)
item.descriptions[self.site.lang] = desc.strip()
self.current_page = item
self.user_edit_entity(item, summary=summary)
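# Command-line entry point: arguments not consumed by pywikibot or the page
# generator factory become bot options ("-name:value" or a boolean "-name").
# With a page generator, MappingDescriptionBot processes those pages;
# otherwise MissingDescriptionBot queries for items to fix.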
def main(*args):
options = {}
local_args = pywikibot.handle_args(args)
site = pywikibot.Site()
genFactory = GeneratorFactory(site=site)
for arg in genFactory.handle_args(local_args):
if arg.startswith('-'):
arg, sep, value = arg.partition(':')
if value != '':
options[arg[1:]] = int(value) if value.isdigit() else value
else:
options[arg[1:]] = True
generator = genFactory.getCombinedGenerator(preload=True)
if generator:
bot = MappingDescriptionBot(generator=generator, site=site, **options)
else:
bot = MissingDescriptionBot(site=site, **options)
bot.run()
if __name__ == '__main__':
main()