-
Notifications
You must be signed in to change notification settings - Fork 0
/
runners.py
263 lines (227 loc) · 9.4 KB
/
runners.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
"""Classes to run the other modules (input reader, EL clients and output writers)"""
import codecs
import time
import analysis as al
import clients
import model as md
import utils as ut
class RunnerManager(object):
"""
To create the runner based on config.
@ivar cfg: config values, e.g. L{config}
"""
def __init__(self, cfg):
self.cfg = cfg
def create_runner(self, linker, reader, writer):
""" Create runner based on config
@param linker: the service to create the runner for
@param reader: a L{readers} object to feed inputs to runner
@param writer: a L{writers} object to write results
"""
try:
assert linker in self.cfg.activate
except AssertionError:
print "Allowed services: {}".format(
", ".join(self.cfg.activate.keys()))
if linker == self.cfg.TNames.TM:
client = clients.TagmeClient(self.cfg)
runner = TagmeRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.SP:
client = clients.SpotlightClient(self.cfg)
runner = SpotlightRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.PS:
client = clients.SpotstatClient(self.cfg)
runner = SpotstatRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.WD:
client = clients.WikipediaMinerClientDexter(self.cfg)
runner = WikipediaMinerRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.WI:
client = clients.WikipediaMinerClientRemote(self.cfg)
runner = WikipediaMinerRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.AI:
client = clients.AidaClient(self.cfg)
runner = AidaRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.RA:
client = clients.AidaRemoteClient(self.cfg)
runner = AidaRemoteRunner(self.cfg, client, reader, writer)
elif linker == self.cfg.TNames.BF:
client = clients.BabelfyClient(self.cfg)
runner = BabelfyRunner(self.cfg, client, reader, writer)
else:
return False
return runner
class DefRunner(object):
"""
Using config in cfg, assigns client to cl, an input reader to rd,
and a module to postprocess results with to wr.
@ivar cfg: Config infos or L{config}
@ivar cl: A client (L{clients}) to obtain responses
@ivar rd: A reader (L{readers}) to give input to client
@ivar wr: A writer (L{writers}) to post-process responses
@ivar donefn: hash to keep track of done filenames
"""
def __init__(self, cfg, cl, rd, wr):
self.cfg = cfg
self.cl = cl
self.rd = rd
self.wr = wr
self.donefn = {}
def _run_one(self, fn, text, cpsob):
"""
Creates a request for a text, gets response and obtains annotations.
@param fn: file-name for text
@param text: text to do request for
@param cpsob: a L{model.Corpus} object
"""
print "- Running file: {}".format(fn),
pay = self.cl.create_payload(text)
print " Getting response",
try:
res = self.cl.get_response(pay)
except clients.EmptyTextException as e:
print e.args[0]["message"]
return {}, {}
time.sleep(self.cfg.waitfor)
print " Parsing annotations"
anns = al.AnnotationParser.parse(fn, self.cl, res, cpsob)
return res, anns
def run_all(self, ipt, skiplist, cpsob):
"""
Calls L{_run_one} for each item given as input, yielding
the response, parsed annotations, document object and file name
@param ipt: full path to input to run (or text-string to run)
@param skiplist: filenames (one per line) to skip, if any
"""
uts = ut.Utils(self.cfg)
cat_ind = uts.load_entity_category_indicators()
#input
fn2txt = self.rd.read(ipt)
try:
dispipt = ipt[0:100]
except IndexError:
dispipt = ipt
try:
skips = [x.strip() for x in codecs.open(
skiplist, "r", "utf8").readlines()]
except IOError:
skips = []
# run calls
print "-- [{}] RUNNING COLLECTION: {}, {}".format(self.cl.name, dispipt,
time.asctime(time.localtime()))
dones = 0
todo = self.cfg.limit
for fn in sorted(fn2txt):
if fn in skips:
print "Skipping {}".format(repr(fn))
continue
# create doc objs
dob = md.Document(fn, text=fn2txt[fn])
dob.find_sentence_positions()
# annots
try:
res, anns = self._run_one(fn, ut.Utils.norm_text(fn2txt[fn]), cpsob)
except ValueError, msg:
print "\n! Error with file: {}".format(fn)
print "\n" + msg.message
res, anns = {}, {}
uts.add_sentence_number_to_annots(anns, dob)
for link in [an.enti.link for posi, an in anns.items()]:
cpsob.normalize_entity_categories(link, cat_ind)
dones += 1
yield res, anns, dob, fn
if dones == todo:
break
def write_results(self, res, anns, fn, cpsob, runid="001", outdir=None,
outresps=None):
"""
Writes responses for en EL request, with option to
- append to a single file for all requests
- write to a separate file for each request
@param res: the client response
@param anns: parsed annotations
@param fn: filename
@param cpsob: L{model.Corpus} object
@param runid: run-id to identify output files
@param outdir: directory to output annotations
@param outresps: directory to output client responses
"""
# raw responses always to individual files
self.wr.write_raw_responses({fn: res},
self.cl.name, runid=runid, outdir=outresps)
# annotations to a single file for all corpus
if self.cfg.oneoutforall:
if len(self.donefn) >= 1:
self.wr.write_to_single({fn: anns},
self.cl.name, cpsob, runid=runid,
has_categ=self.cfg.add_categs, outdir=outdir)
else:
self.wr.write_to_single({fn: anns},
self.cl.name, cpsob, runid=runid,
has_categ=self.cfg.add_categs,
write_header=True, outdir=outdir)
# annotations to individual files for each "file"
else:
self.wr.write_to_multi({fn: anns}, self.cl.name,
cpsob, runid=runid, has_categ=self.cfg.add_categs,
outdir=outdir)
self.donefn[fn] = 1
class TagmeRunner(DefRunner):
    """Runner for the Tagme service; default L{DefRunner} behaviour."""
    # The inherited DefRunner.__init__ already does everything needed,
    # so no override is required.
class SpotlightRunner(DefRunner):
def __init__(self, cfg, cl, rd, wr):
super(SpotlightRunner, self).__init__(cfg, cl, rd, wr)
def _run_one(self, fn, text, cpsob):
"""
See L{DefRunner}
@note: Override since no need to create payload; pyspotlight
library called by L{clients.SpotlightClient} creates it.
"""
print "- Running file: {}".format(fn),
print " Getting response",
res = self.cl.get_response(text)
print " Parsing annotations"
anns = al.AnnotationParser.parse(fn, self.cl, res, cpsob)
return res, anns
class SpotstatRunner(DefRunner):
    # Runner for the Spotstat service; only _run_one differs from the base.
    def __init__(self, cfg, cl, rd, wr):
        super(SpotstatRunner, self).__init__(cfg, cl, rd, wr)
    def _run_one(self, fn, text, cpsob):
        """
        See L{DefRunner}
        @note: Override since no need to create payload; the library
        called by L{clients.SpotstatClient} creates it.
        (NOTE(review): the original docstring was copy-pasted from
        L{SpotlightRunner} and referenced SpotlightClient/pyspotlight;
        presumably the same mechanism applies here -- confirm against
        L{clients.SpotstatClient}.)
        """
        print "- Running file: {}".format(fn),
        print " Getting response",
        res = self.cl.get_response(text)
        print " Parsing annotations"
        anns = al.AnnotationParser.parse(fn, self.cl, res, cpsob)
        return res, anns
class WikipediaMinerRunner(DefRunner):
    """Runner for Wikipedia Miner (Dexter or remote); default behaviour."""
    # No override needed: DefRunner.__init__ is inherited unchanged.
class AidaRunner(DefRunner):
    """Runner for the AIDA service; default L{DefRunner} behaviour."""
    # No override needed: DefRunner.__init__ is inherited unchanged.
class AidaRemoteRunner(DefRunner):
    """Runner for the remote AIDA service; default L{DefRunner} behaviour."""
    # No override needed: DefRunner.__init__ is inherited unchanged.
class BabelfyRunner(DefRunner):
def __init__(self, cfg, cl, rd, wr):
super(BabelfyRunner, self).__init__(cfg, cl, rd, wr)
def _run_one(self, fn, text, cpsob):
"""
See L{DefRunner}
@note: Override since need param text passed to
L{analysis.AnnotationParser.parse} so that can get the mention
based on character offsets (the API will not return the mention,
just the offsets)
"""
print "- Running file: {}".format(fn),
pay = self.cl.create_payload(text)
print " Getting response",
res = self.cl.get_response(pay)
print " Parsing annotations"
anns = al.AnnotationParser.parse(fn, self.cl, res, cpsob, text=text)
return res, anns