#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

import pywikibot

MAX_USERNAME_LENGTH = 31
RE_LINKS = re.compile(r'(\[\[[Uu]ser *: *(.{1,%i}?) *(?:\| *(.+?) *)?\]\])' % MAX_USERNAME_LENGTH)
RE_HTMLCOMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
# anything within a [[User:x|y]] style link
re_userlink = r'\[\[[Uu]ser\s*:\s*(.{1,%i}?)\s*(?:\|\s*(?:.+?)\s*)?\]\]' % MAX_USERNAME_LENGTH
re_userstring = r'([^,&\n\-\[\|\(\)]{1,%i})' % MAX_USERNAME_LENGTH
# the same, or just an arbitrary string
re_maybelink = '(?:' + re_userlink + '|' + re_userstring + ')'
# any enumerator
re_enumerator = '(?:,| and | with |&)'
# a sequence of re_maybelinks, separated by re_enumerators
re_maybelist = re_maybelink + '(?:' + re_enumerator + re_maybelink + ')*'
# a sequence of re_userlinks, separated by re_enumerators
re_strictlylist = '(?:' + re_userlink + '(?:' + re_enumerator + re_userlink + ')+)'
# either a re_userlink, or a re_strictlylist
re_linkorlist = '(?:' + re_strictlylist + '|' + re_userlink + ')'
# an option to a wiki template
re_option = r'\s*([^=]+?)(?:\}|\|\s*\w+\s*=)'
RE_USERLINK = re.compile(re_userlink)
RE_LISTED = re.compile(r'\s*\*\s*(' + re_maybelist + r')[^\n]*')
RE_LISTEDLINK = re.compile(r'\s*\*.*?(' + re_linkorlist + r')[^\n]*')
RE_RIBBONBEARER = re.compile(r'\{\{.*?\|\s*name\s*=' + re_option, re.DOTALL)
RE_CARDRECIPIENT = re.compile('recipient ?=' + re_option)
RE_ENTITLED = re.compile(r'==+\s*(' + re_linkorlist + r')\s*=+=')
RE_MEETUP = re.compile(r'\{\{\s*[Mm]eet-up.*?\|\s*name\s*=' + re_option, re.DOTALL)
RE_FIRST = re.compile('^.*?' + re_userlink, re.DOTALL)
RE_COMMONPLACES = re.compile(r'(?:reached by)\s+(' + re_maybelist + r')\s*\.')
RE_BOLDED = re.compile(r"'{3}(" + re_maybelist + ')')  # does not work!
RE_PARALIST = re.compile(r'\n\n(' + re_maybelink + ').*?(?=\n\n)', re.MULTILINE | re.DOTALL)
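
# Illustrative example (the markup is an assumption about typical wiki pages,
# not taken from this file): RE_LINKS captures the full link, the username,
# and any display alias:
#   RE_LINKS.findall("[[User:Alice|Ali]] was there")
#   -> [("[[User:Alice|Ali]]", "Alice", "Ali")]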
improbablenames = ["", " ", "a", "and", "i", "i'll", "we", "the", "one", "two", "three", "all of us", "all attendees", "everyone", "his", "her", "probably", "drag", "drag-along", "1", "2", "3", "4", "5", "wife", "family", "friends", "probably."]
debug_fuzz = None
debug_links = None
def getDebugFuzz():
    return debug_fuzz

def getDebugLinks():
    return debug_links
def normalize(dic):
    """Drop empty keys and scale all scores so the maximum becomes 1.0."""
    maxfuzz = 0
    for p, v in list(dic.items()):  # iterate over a copy; we delete from dic
        if len(p) == 0:
            del dic[p]
            continue
        if v > maxfuzz:
            maxfuzz = v
    if maxfuzz > 0:
        for p, v in dic.items():
            dic[p] = v / float(maxfuzz)
    return dic
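
# For instance (hypothetical values, assuming the float-division behaviour
# above): normalize({"alice": 4, "bob": 1}) -> {"alice": 1.0, "bob": 0.25}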
def unscorify(word):
    """Replace underscores with spaces."""
    return word.replace("_", " ")

def splitgrouped(word):
    """Split a string of names on enumerators (commas, 'and', ...)."""
    fail = re.findall(r"\[User:[^]]+" + re_enumerator + r".*?\]", word)
    if fail:  # TODO: be smarter when splitting this
        return [word]
    return re.split(re_enumerator, word)
def flatten(l, ltypes=(list, tuple)):
    """flatten an array or list"""
    ltype = type(l)
    l = list(l)
    i = 0
    while i < len(l):
        while isinstance(l[i], ltypes):
            if not l[i]:
                l.pop(i)
                i -= 1
                break
            else:
                l[i:i + 1] = l[i]
        i += 1
    return ltype(l)
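
# For instance (illustrative values, not from the original file):
#   flatten([1, [2, [3]], ()]) -> [1, 2, 3]
# Nesting is removed and empty sub-sequences are dropped.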
def identifyParticipants(origtext, page, getLinks=False, getSections=True):
    global debug_fuzz
    global debug_links
    # pywikibot.output("===" + page + "===")
    fuzzy = {}  # user id -> probability of being a participant
    text = RE_HTMLCOMMENT.sub("", origtext)
    text = unscorify(text)
    pseudonyms = {}
    userlinks = {}
    usernames = {}
    if "[[Category:Not reached - Did not attempt]]" in text:
        pywikibot.output("Ignoring participants because expedition wasn't attempted")
        return []
    if "[[Category:Tagged for deletion]]" in text:
        pywikibot.output("Ignoring participants because expedition page is marked for deletion")
        return []
    if re.search(r"\{\{\s*delete", text):
        pywikibot.output("Ignoring participants because expedition page is marked for deletion")
        return []
    scoring = [
        (RE_USERLINK, 1),
        (RE_RIBBONBEARER, 10),
        (RE_CARDRECIPIENT, -5),
        (RE_ENTITLED, 20),
        (RE_MEETUP, 10),
        (RE_COMMONPLACES, 1),
        # (RE_BOLDED, 1),
    ]
    if getSections:
        sections = getSectionRegex(text, r"(participants?|(the\s)?people|(?<!intended )attend[esanc]+|adventurers?|geohashers?|reached)\??", True)
        if sections:
            scoring.append((RE_LISTED, 3))
            scoring.append((RE_LISTEDLINK, 4))
            scoring.append((RE_PARALIST, 1))
            text = sections
    else:
        scoring.append((RE_FIRST, 5))
    # identify pseudonyms, and user links
    links = RE_LINKS.findall(text)
    for part in links:
        pseudonyms[part[0].lower()] = part[1].lower()
        userlinks[part[1].lower()] = part[0]
        usernames[part[0].lower()] = part[1]
        if part[2].lower() not in improbablenames:
            pseudonyms[part[2].lower()] = part[1].lower()
            usernames[part[2].lower()] = part[1]
            userlinks[part[2].lower()] = part[0]
    for rex, score in scoring:
        match = flatten(rex.findall(text))
        for group in match:
            parts = splitgrouped(group)
            for part in parts:
                partls = part.lower().strip()
                if partls not in improbablenames:
                    if partls in pseudonyms:
                        fuzzy[pseudonyms[partls]] = fuzzy.get(pseudonyms[partls], 0) + score
                    else:
                        fuzzy[partls] = fuzzy.get(partls, 0) + score
                        usernames[partls] = part.strip()
                        if partls not in userlinks:
                            userlinks[partls] = part.strip()
    # increase the score of a potential participant by the number of mentions vs total mentions
    mentions = {}
    mcount = 0.0
    for p in fuzzy.keys():
        mentions[p] = len(re.findall(re.escape(p), origtext, re.IGNORECASE))
        mcount += mentions[p]
    for p in pseudonyms.keys():
        if p not in fuzzy:
            pseudo_mentions = len(re.findall(re.escape(p), origtext, re.IGNORECASE)) + len(re.findall(re.escape(p), pseudonyms[p], re.IGNORECASE))
            if RE_USERLINK.match(p):
                mentions_per_link = len(re.findall(re.escape(pseudonyms[p]), p, re.IGNORECASE))
                pseudo_mentions -= pseudo_mentions * mentions_per_link
            mentions[pseudonyms[p]] = mentions.get(pseudonyms[p], 0) + pseudo_mentions
            mcount += pseudo_mentions
    if mcount > 0:
        for p, v in mentions.items():
            fuzzy[p] = fuzzy.get(p, 0) + (v / mcount) * 2
    if len(fuzzy) == 0:  # only if we still don't have fuzz
        if getSections:
            return identifyParticipants(origtext, page, getLinks, getSections=False)
    if len(fuzzy) == 0:  # only if we still don't have fuzz
        # compare the edit history with the page content
        history = page.revisions()
        editors = [change.user for change in history]
        for editor in editors:
            if editor.lower() in text.lower():
                fuzzy[editor] = 0.5
    if len(fuzzy) == 0:  # only if we still don't have fuzz
        # get user pages from the reference counter
        wlh = list(page.getReferences())
        for l in wlh:
            if "User:" in l.title():
                fuzzy[l.title()[5:]] = 0.5
        if len(fuzzy) > 1:  # but not too much, I say
            fuzzy = {}
    fuzzy = normalize(fuzzy)
    participants = []
    for p, v in fuzzy.items():
        if p in improbablenames:
            continue
        if v >= 0.30:
            participants.append(p)
    debug_fuzz = fuzzy
    debug_links = userlinks
    if getLinks:
        # return userlinks[p] where a link is known, else p itself
        return [userlinks.get(p, p) for p in participants]
    else:
        return [usernames.get(p, p) for p in participants]
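
# A minimal usage sketch, assuming a configured pywikibot installation;
# the page title is a made-up placeholder:
#
#   site = pywikibot.Site()
#   page = pywikibot.Page(site, "2009-05-23 52 13")
#   names = identifyParticipants(page.get(), page)
#   links = identifyParticipants(page.get(), page, getLinks=True)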
def getUsers(page):
    """
    Return a list of expedition participants found in the text of a
    geohashing expedition page.
    Ingredients: one pywikibot.Page object.
    """
    text = page.get()
    title = page.title()
    pywikibot.output(u'Parsing %s...' % title)
    if text.startswith(u"="):  # a hack?
        text = u"\n" + text
    if len(text) > 1 and text[1] == u"=":
        text = u"\n" + text
    # Generate the list of people.
    # First look in appropriately named "who" sections.
    peopleSecText = getSectionRegex(text, r"(participants?|people)\??")
    if peopleSecText is not None:
        peopleText = getPeopleText(text, peopleSecText)
    # If that fails, look for all unique [[User:*]] tags in the expedition page.
    if peopleSecText is None or len(peopleText) == 0:
        peopleText = getUserList(text)
    return peopleText
def getSections(text, subSects=None):
    text = "\n" + text
    if subSects is None:
        # find the shallowest heading depth used on the page
        split_text = re.split("\n", text)
        minlen = 99
        for line in split_text:
            match = re.match(r"\s*=+", line)
            if match is not None and len(match.group(0).strip()) < minlen:
                minlen = len(match.group(0).strip())
        equal_str = u"=" * minlen
        regex_text = r"\n\s*" + equal_str + "([^=]*?)" + equal_str
    else:
        # split at headings of any depth, so subsections are included
        regex_text = r"\n\s*=+([^=]*?)=+"
    text_arr = re.split(regex_text, text)
    for i in range(0, len(text_arr)):
        text_arr[i] = text_arr[i].strip()
    section_hash = {}
    section_hash[""] = text_arr[0]
    for i in range(1, len(text_arr), 2):
        title = text_arr[i].lower()
        section_hash[title] = section_hash.get(title, "") + text_arr[i + 1]
    # for i in section_hash.keys():
    #     pywikibot.output(str(i) + ":")
    #     pywikibot.output(":" + section_hash[i])
    return section_hash
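
# For instance (illustrative input, not from the original file):
#   getSections("== Participants ==\nAlice")
#   -> {"": "", "participants": "Alice"}
# Section titles are lowercased keys; text before the first heading lands
# under the empty key.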
def getSection(text, name_arr, subSects=None):
    """
    Look for a section with one of the names in name_arr.
    The search is case insensitive and checks the names in order, from
    name_arr[0] through name_arr[-1].
    Returns the body of the matching section(s), or None if no section
    name matched.
    If subSects is not None, matching subsections are collected as well.
    """
    sections = getSections(text, subSects)
    code = ""
    for header in name_arr:
        if header in sections:
            code += sections[header] + "\n"
    if len(name_arr) == 0 and "" in sections:
        return sections[""]
    if len(code) > 0:
        return code
    return None
def getSectionRegex(text, regex_text, subSects=None):
    """
    Look for a section whose name matches regex_text.
    Returns the body of the first matching section, or None if no
    section name matched.
    If subSects is not None, matching subsections are collected as well.
    """
    sections = getSections(text, subSects)
    if regex_text is None and "" in sections:
        return sections[""]
    else:
        for key in sections.keys():
            if re.search(regex_text, key):
                return sections[key]
    return None
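
# For instance (illustrative input):
#   getSectionRegex("== People ==\nAlice", r"(participants?|people)\??")
#   -> "Alice"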
def getUserList(text):
    """Look for all unique user tags on a page, and make a comma-separated list out of them."""
    regex_res = re.findall(r"\[\[User:.*?\]\]", text, re.I)
    regex_lower = []
    for i in range(0, len(regex_res)):
        # normalise for comparison: lower case, spaces for underscores,
        # no spaces around the pipe, possessive 's stripped
        regex_lower.append(re.sub("_", " ", regex_res[i].lower()))
        regex_lower[i] = re.sub(r" ?\| ?", "|", regex_lower[i])
        regex_lower[i] = re.sub("'s", "", regex_lower[i])
    result_arr = []
    for i in range(0, len(regex_lower)):
        for j in range(i + 1, len(regex_lower)):
            if regex_lower[i] == regex_lower[j]:
                break
        else:
            result_arr.append(regex_res[i])
    return u", ".join(result_arr)
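
# For instance (illustrative input): duplicate tags are collapsed
# case-insensitively, keeping the last spelling seen:
#   getUserList("[[User:Alice]] and [[user:alice]]") -> "[[user:alice]]"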
def getPeopleText(text, people_text):
    """Parse a list of users, and return them in a comma-separated list."""
    people_text = re.sub("<!--.*?(-->|$)", "", people_text)
    people_text = re.sub(r"^\[[^][]*?\]", "", people_text).strip()
    people_text_arr = re.split("\n", people_text)
    people_text = u""
    # pick a line pattern that matches the list style of the first line
    if len(people_text_arr[0]) == 0:
        people_regex_str = re.compile(r"^(\[\[.*?\]\]|[^ ]*)")
    elif people_text_arr[0][0] == "*":
        people_regex_str = re.compile(r"^\*\s*(\[\[.*?\]\]|[^ ]*)")
    elif people_text_arr[0][0] == ":":
        people_regex_str = re.compile(r"^:\s*(\[\[.*?\]\]|[^ ]*)")
    else:
        people_regex_str = re.compile(r"^(\[\[.*?\]\]|[^ ]*)")
    match_obj = people_regex_str.match(people_text_arr[0])
    people_text += match_obj.group(1)
    if re.match("=", people_text_arr[0]):
        people_text = getUserList(text)
    else:
        for i in range(1, len(people_text_arr)):
            match_obj = people_regex_str.match(people_text_arr[i])
            if match_obj is not None and len(match_obj.group(1)) != 0:
                if re.search("Category", people_text_arr[i]):
                    pass
                elif re.match("=", people_text_arr[i]):
                    pass
                else:
                    people_text += u", "
                    people_text += match_obj.group(1)
    return people_text
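
# A minimal sketch of driving this module from the command line, assuming a
# configured pywikibot installation; the page title is a placeholder:
if __name__ == "__main__":
    site = pywikibot.Site()
    page = pywikibot.Page(site, "2009-05-23 52 13")  # hypothetical expedition page
    pywikibot.output(getUsers(page))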