-
Notifications
You must be signed in to change notification settings - Fork 0
/
readData.py
87 lines (74 loc) · 2.12 KB
/
readData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: UTF-8 -*-
import urllib2
import re
# Previous host, kept for reference:
#   http://genealogy.math.ndsu.nodak.edu/id.php?id=%d
def urlname(idx):
    """Return the URL of the genealogy page for the numeric record id *idx*."""
    return "http://www.genealogy.ams.org/id.php?id=%d" % idx
# Search -> replacement table consumed by unescape(): each key is an HTML
# character reference as it appears in the raw page text, each value is the
# Unicode character it stands for.  Keys are plain ASCII so they can be
# matched against decoded (unicode) page text without any implicit
# byte-string decoding.
# NOTE(review): the ń entry is listed in both its named and numeric form
# because it is not clear from this file which one the site emits -- confirm
# against a live page and drop the unused one.
html_escaped = {
    "&Auml;": u'Ä',
    "&Ouml;": u'Ö',
    "&Uuml;": u'Ü',
    "&auml;": u'ä',
    "&ouml;": u'ö',
    "&uuml;": u'ü',
    "&szlig;": u'ß',
    "&Agrave;": u'À',
    "&Egrave;": u'È',
    "&Ograve;": u'Ò',
    "&Ugrave;": u'Ù',
    "&agrave;": u'à',
    "&egrave;": u'è',
    "&ograve;": u'ò',
    "&ugrave;": u'ù',
    "&Aacute;": u'Á',
    "&Eacute;": u'É',
    "&Oacute;": u'Ó',
    "&Uacute;": u'Ú',
    "&aacute;": u'á',
    "&eacute;": u'é',
    "&oacute;": u'ó',
    "&uacute;": u'ú',
    "&amp;": u'&',
    "&AElig;": u'Æ',
    "&aelig;": u'æ',
    "&nacute;": u'ń',
    "&#324;": u'ń',
}
# Marker that opens a degree entry in the page text: readThesis() starts
# parsing right after it, and readDegreeTuples() keeps looping while it is
# still present in the remaining text.
degree_str = '<span style="margin-right: 0.5em">'
# Opening tag of the paragraph that readDegree() scans for advisor links;
# readDegreeTuples() also discards everything up to and including it to move
# on to the next entry.
advisor_str = '<p style="text-align: center; line-height: 2.75ex">'
def unescape(s):
    """Apply the ``html_escaped`` replacements to *s* and collapse whitespace.

    Every occurrence of a key of ``html_escaped`` is replaced by its value,
    then each run of whitespace is reduced to a single space.

    All keys are substituted in one pass over the text, so the output of one
    replacement is never re-scanned by another (chained ``str.replace`` calls
    in dict order could otherwise turn already-replaced text into a second,
    unintended match).
    """
    if html_escaped:
        # Longest keys first so that a key which is a prefix of another key
        # cannot shadow it in the alternation.
        keys = sorted(html_escaped, key=len, reverse=True)
        pattern = "|".join(re.escape(key) for key in keys)
        s = re.sub(pattern, lambda mo: html_escaped[mo.group(0)], s)
    return re.sub(r"\s+", " ", s)
def tagpart(text, delim):
    """Split *text* at the first occurrence of *delim*.

    Returns a pair ``(head, tail)``: ``head`` is everything before *delim*
    with surrounding whitespace stripped, ``tail`` is everything after it,
    untouched.  When *delim* does not occur, ``head`` is the stripped *text*
    and ``tail`` is the empty string.
    """
    pieces = text.partition(delim)
    head = pieces[0].strip()
    tail = pieces[2]
    return head, tail
def fetchPage(idx):
    """Download the page for record *idx* and return its cleaned-up text.

    The URL comes from ``urlname(idx)``; the response body is decoded as
    UTF-8 and passed through ``unescape()`` before being returned.  If the
    opener hands back a false value, the empty string is returned instead.

    Errors raised by ``urllib2.urlopen``, by reading the response, or by the
    UTF-8 decode propagate to the caller.
    """
    f = urllib2.urlopen(urlname(idx))
    if not f:
        return ""
    try:
        text = f.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        f.close()
    return unescape(text)
def readPhD(idx, text):
    """Return ``(idx, heading)`` for the first ``<h2>`` element in *text*.

    ``heading`` is the element's content with surrounding whitespace
    stripped.  The pattern does not cross line breaks, and an
    ``AttributeError`` is raised when *text* contains no matching ``<h2>``.
    """
    heading = re.search("<h2.*?>(.*?)</h2>", text)
    name = heading.group(1).strip()
    return (idx, name)
def readThesis(idx, text):
    """Extract the first degree entry found in *text*.

    Parsing starts right after the first ``degree_str`` marker and walks
    through the following ``<span>`` elements with ``tagpart()``.  Returns
    the tuple ``(degree, year, title, school)``; a field whose marker is
    missing comes back as an empty string.  *idx* is accepted for symmetry
    with the other readers and is not used.
    """
    school_open = "<span style=\"color: #006633; margin-left: 0.5em\">"
    title_open = "<span style=\"font-style:italic\" id=\"thesisTitle\">"
    span_close = "</span>"

    remainder = text.partition(degree_str)[2]
    degree, remainder = tagpart(remainder, school_open)
    school, remainder = tagpart(remainder, span_close)
    year, remainder = tagpart(remainder, span_close)
    after_title_tag = remainder.partition(title_open)[2]
    title, remainder = tagpart(after_title_tag, span_close)
    return (degree, year, title, school)
def readDegree(idx, text):
    """Collect the advisor links of the first advisor paragraph in *text*.

    The paragraph is the span from ``advisor_str`` up to the next ``</p>``.
    Each ``<a ... id=N">`` link inside it yields one ``(idx, int(N))`` pair.
    Returns an empty list when no such paragraph is found; ``int()`` raises
    ``ValueError`` if a captured id is not a valid integer.
    """
    paragraph = re.search('%s(.*?)</p>' % advisor_str, text)
    if paragraph is None:
        return []
    pairs = []
    for advisor_id in re.findall(r'<a.*?id=(.*?)">', paragraph.group(1)):
        pairs.append((idx, int(advisor_id)))
    return pairs
def readDegreeTuples(idx, text):
    """Return one ``(thesis, advisors)`` pair per degree entry in *text*.

    While a ``degree_str`` marker is still present, the entry is parsed with
    ``readThesis()`` and ``readDegree()``, then everything up to and
    including the next ``advisor_str`` is dropped.  If ``advisor_str`` does
    not occur, the remaining text becomes empty and the loop ends.
    """
    entries = []
    remaining = text
    while True:
        if degree_str not in remaining:
            break
        thesis = readThesis(idx, remaining)
        advisors = readDegree(idx, remaining)
        entries.append((thesis, advisors))
        remaining = remaining.partition(advisor_str)[2]
    return entries
def readData(idx):
    """Fetch the page for record *idx* and parse it.

    Returns a pair: the ``readPhD()`` result for the page, and the list
    produced by ``readDegreeTuples()`` for the same page text.
    """
    page = fetchPage(idx)
    person = readPhD(idx, page)
    degrees = readDegreeTuples(idx, page)
    return person, degrees