#!/usr/bin/python2.4
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Crawl/index CAP feeds.
Contains the crawl/index code, accessed at the /crawl URL. Visits selected
websites and fetches CAP data, which it stores in Datastore. Data can be
subsequently queried with the cap_query module.
There are two main concrete classes, CrawlControllerWorker and
CrawlControllerMaster, that implement the distinct behavior of the worker and
master threads, respectively.
The master thread (at /crawl) is responsible for reading the list of feeds
(from the cap_schema.Feed Datastore) and seeding the task queues. The task
queues persist in cap_schema.CrawlShard models in the Datastore. Each
CrawlShard instance is a single atom of work, which is a single URL to visit.
The URL may be CAP or an index (RSS or ATOM) of CAP's.
The worker thread (at /crawlworker) is provided with a single shard of work
consisting of a single URL to fetch. If an index of CAP URL's is retrieved,
it may push shards onto the task queue.
A thread is implemented not using Python threads (which are disallowed in
AppEngine), but by servicing a unique request under the /crawl URL. Those
requests are initiated by cron (see cron.yaml) and by the Task Queue API (see
queue.yaml).
"""
__author__ = '[email protected] (Matt Frantz)'
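
# Illustrative request flow, sketching the architecture described in the
# module docstring (the actual schedule and queue definitions live in
# cron.yaml and queue.yaml):
#
#   cron -> GET /crawl        CrawlNudgeHandler: the master starts or
#                             finishes a crawl and seeds one task per feed.
#   task -> GET /crawlpush    CrawlPushHandler: records one CrawlShard and
#                             enqueues it for a worker.
#   task -> GET /crawlworker  CrawlWorkerHandler: fetches one URL; an index
#                             result pushes more /crawlpush tasks.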
import datetime
import logging
import os
import random
import sys
import traceback

try:
  # google3
  import google3
  import cap as caplib
  from google3.apphosting.api import urlfetch
  from google3.apphosting.api import taskqueue
  from google3.apphosting.ext import db
  from google3.apphosting.ext import webapp
  from google3.apphosting.ext.webapp.util import run_wsgi_app
  from google3.apphosting.runtime.apiproxy_errors import DeadlineExceededError
  from google3.pyglib import logging
  from google3.dotorg.gongo.appengine_cap2kml import cap_fake
  from google3.dotorg.gongo.appengine_cap2kml import cap_index_parse
  from google3.dotorg.gongo.appengine_cap2kml import cap_parse_db
  from google3.dotorg.gongo.appengine_cap2kml import cap_parse_mem
  from google3.dotorg.gongo.appengine_cap2kml import cap_schema
  from google3.dotorg.gongo.appengine_cap2kml import db_util
  from google3.dotorg.gongo.appengine_cap2kml import webapp_util
  from google3.dotorg.gongo.appengine_cap2kml import xml_util
except ImportError:
  import cap as caplib
  from google.appengine.api import urlfetch
  from google.appengine.api.labs import taskqueue
  from google.appengine.ext import db
  from google.appengine.ext import webapp
  from google.appengine.ext.webapp.util import run_wsgi_app
  from google.appengine.runtime import DeadlineExceededError
  import cap_fake
  import cap_index_parse
  import cap_parse_db
  import cap_parse_mem
  import cap_schema
  import db_util
  import webapp_util
  import xml_util

class Error(Exception):
  pass


class UrlAlreadyCrawledError(Error):
  pass

# Format string for an ATOM index of links.
_ATOM_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
<atom:feed xmlns:atom = "http://www.w3.org/2005/Atom">
%(entries)s
</atom:feed>
"""
# Format string for a single ATOM entry with a link.
_ATOM_ENTRY = """<atom:entry><atom:link href="%(link)s" /></atom:entry>"""

def _MakeAtomFeed(urls):
  """Returns ATOM that points to a list of URLs.

  Args:
    urls: List of URLs (list of str)

  Returns:
    ATOM XML (str)
  """
  entries = '\n '.join([_ATOM_ENTRY % dict(link=x) for x in urls])
  return _ATOM_INDEX % locals()
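
# For example (illustrative), _MakeAtomFeed(['http://example.com/a.xml'])
# returns an <atom:feed> document whose single <atom:entry> carries
# <atom:link href="http://example.com/a.xml" />.
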
# Max out the deadline for fetching URLs, according to the doc:
# http://code.google.com/appengine/docs/python/urlfetch/fetchfunction.html
URLFETCH_DEADLINE_SECS = 10

def FetchUrl(url):
  """Returns the contents of the URL, allowing for 'testdata/' local files.

  Also supports fake feed URLs defined in cap_fake.FAKE_FEED_URLS.

  Args:
    url: URL, possibly beginning with 'testdata/' (string)

  Returns:
    Contents of the URL (string)
  """
  if url in cap_fake.FAKE_FEED_URLS:
    return _MakeAtomFeed(cap_fake.FAKE_FEED_URLS[url]())
  elif url.startswith('testdata/'):
    path = os.path.join(os.path.dirname(__file__), url)
    try:
      fd = open(path, mode='r')
      try:
        data = fd.read()
        return data
      finally:
        fd.close()
    except DeadlineExceededError:
      raise
    except:
      logging.error('Cannot read file: %s', path)
      raise
  else:
    return urlfetch.fetch(url, deadline=URLFETCH_DEADLINE_SECS).content
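
# Example (illustrative; 'testdata/feeds.xml' is a hypothetical file name):
# FetchUrl('testdata/feeds.xml') reads a local file relative to this module,
# FetchUrl(fake_url) returns a synthesized ATOM feed for any key of
# cap_fake.FAKE_FEED_URLS, and any other URL goes through urlfetch with the
# URLFETCH_DEADLINE_SECS deadline.
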

def GetCap(feed, crawl, cap_url):
  """Retrieves a CAP file and saves it in the Datastore.

  Args:
    feed: Feed object or reference
    crawl: cap_schema.Crawl object
    cap_url: URL of a CAP file (string)

  Returns:
    cap_schema.CapAlert object (already populated and saved)
  """
  cap_text = xml_util.ParseText(FetchUrl(cap_url))
  parser = cap_parse_mem.MemoryCapParser()
  new_alert_model = lambda: caplib.Alert()
  alert_mem, errors = parser.MakeAlert(new_alert_model, cap_text)
  alert_db = cap_parse_db.MakeDbAlertFromMem(alert_mem)
  alert_db.crawl = crawl
  alert_db.feed = feed
  alert_db.url = cap_url
  alert_db.text = cap_text
  for error in errors:
    alert_db.parse_errors.append(xml_util.ParseText(str(error)))
  # Save the alert model to the db.
  alert_db.put()
  return alert_db

def GetFeedIndex(feed_url):
  """Returns the list of CAP URLs in the current feed's index.

  Args:
    feed_url: URL of the feed (str)

  Returns:
    List of CAP URLs (strings)

  Raises:
    CapIndexFormatError: if there is a problem parsing.
    Exception: if problems fetching the URL.
  """
  index_text = FetchUrl(feed_url)
  return cap_index_parse.ParseCapIndex(index_text)

def _CrawlShardKeyName(crawl, url):
  """Generates a stable, unique key name for a URL in a particular crawl.

  Args:
    crawl: cap_schema.Crawl object
    url: URL that will be crawled (str)

  Returns:
    Key name (str) for the cap_schema.CrawlShard model.
  """
  return 'CrawlShard %s %s' % (crawl.started, url)
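
# For example (illustrative), a crawl whose started timestamp is
# 2009-06-01 12:00:00 and the URL 'http://example.com/cap.xml' produce the
# key name 'CrawlShard 2009-06-01 12:00:00 http://example.com/cap.xml'.
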
# Name of the task queue for /crawlpush, which corresponds to the
# configuration in queue.yaml.
PUSH_TASKQUEUE_NAME = 'crawlpush'

def _EnqueuePush(crawl, feed, url):
  """Enqueues a /crawlpush task that will record a shard of work for the URL.

  Args:
    crawl: cap_schema.Crawl object
    feed: cap_schema.Feed object
    url: URL to be fetched (str)
  """
  queue = taskqueue.Queue(name=PUSH_TASKQUEUE_NAME)
  task = taskqueue.Task(
      url='/crawlpush', method='GET',
      params={'crawl': str(crawl.key()), 'feed': str(feed.key()), 'url': url})
  queue.add(task)

def _MaybePushShardUnsafe(crawl, feed, url):
  """Pushes a shard of work onto an appropriate queue.

  Called by _MaybePushShard to wrap in a transaction.

  Args:
    crawl: cap_schema.Crawl object
    feed: cap_schema.Feed object
    url: URL to be fetched (str)

  Returns:
    (shard, is_new)
    shard: cap_schema.CrawlShard object, possibly new.
    is_new: If True, shard was just added.

  Postconditions:
    May add a new CrawlShard.
  """
  logging.debug('MaybePushShard %r', url)
  assert crawl
  # Use a key that will help us know whether this URL has been crawled on
  # this crawl before.
  key_name = _CrawlShardKeyName(crawl, url)
  # See if we already exist.
  shard = cap_schema.CrawlShard.get_by_key_name(key_name)
  is_new = not shard
  if is_new:
    shard = cap_schema.CrawlShard(
        key_name=key_name, crawl=crawl, feed=feed, url=url)
    shard.put()
  return (shard, is_new)

def _MaybePushShard(crawl, feed, url):
  """Pushes a shard of work onto an appropriate queue.

  Args:
    crawl: cap_schema.Crawl object
    feed: cap_schema.Feed object
    url: URL to be fetched (str)

  Returns:
    (shard, is_new)
    shard: cap_schema.CrawlShard object, possibly new.
    is_new: If True, shard was just added.

  Postconditions:
    May add a new CrawlShard.
  """
  shard, is_new = db.run_in_transaction(
      _MaybePushShardUnsafe, crawl, feed, url)
  # Add the shard to the task queue. Avoid enqueueing completed shards.
  if is_new:
    _EnqueueShard(str(shard.key()))
  return shard, is_new
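
# Design note: because _CrawlShardKeyName is deterministic for a given
# (crawl, url) pair, a retried /crawlpush task finds the existing CrawlShard,
# gets is_new == False, and does not enqueue the same URL twice per crawl.
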
# Name of the task queue for /crawlworker, which corresponds to the
# configuration in queue.yaml.
WORKER_TASKQUEUE_NAME = 'crawlworker'

def _EnqueueShard(shard_key):
  """Adds a shard to the crawl task queue.

  Args:
    shard_key: Encoded key for the cap_schema.CrawlShard model (str).
  """
  queue = taskqueue.Queue(name=WORKER_TASKQUEUE_NAME)
  task = taskqueue.Task(url='/crawlworker', method='GET',
                        params={'shard': shard_key})
  queue.add(task)

class CrawlControllerWorker(object):
  """Controls a single crawl worker.

  The worker performs one shard of work, recording the results in the
  cap_schema.CrawlShard model. It may create and enqueue other shards of
  work.
  """

  def __init__(self, shard_key, _now=datetime.datetime.now):
    """Initializes a CrawlControllerWorker object.

    Args:
      shard_key: Encoded key of the cap_schema.CrawlShard (str or unicode)
      _now: dependency injection of clock function that returns
          datetime.datetime object.
    """
    self._now = _now
    self._shard = cap_schema.CrawlShard.get(db.Key(shard_key))
    if not self._shard:
      logging.error('No shard having key %r', shard_key)

  def GetShard(self):
    """Returns the model of the current crawl shard.

    Returns:
      cap_schema.CrawlShard object.
    """
    return self._shard

  def DoShard(self):
    """Processes a single unit of work (a URL).

    Postconditions:
      Updates shard.started and shard.finished.
      May update the CapAlert Datastore.
      May add more CrawlShards, by pushing to the /crawlpush queue (if a
      nested index is found).
      Updates shard.error, if an error occurs.
    """
    shard = self._shard
    if not shard:
      return
    # Avoid fetching the same shard twice. This shouldn't happen because of
    # the way _MaybePushShard works.
    if shard.is_done:
      logging.error('Shard is already done: %r', shard.url)
      return
    shard.started = self._now()
    url = shard.url
    # Catch any errors for this URL, so that we can record them in the shard
    # record.
    try:
      # Assume the URL contains CAP, but catch NotCapError if it is not.
      try:
        cap = GetCap(shard.feed, shard.crawl, url)
        shard.parse_errors = cap.parse_errors
        logging.debug('Created CAP for %r', url)
      except cap_parse_mem.NotCapError:
        logging.debug('Not CAP ... assuming CAP index: %r', url)
        # Try to process the URL as an index.
        urls = GetFeedIndex(url)
        logging.debug('Found %d URLs in nested index', len(urls))
        for url in urls:
          _EnqueuePush(shard.crawl, shard.feed, url)
    except (DeadlineExceededError, AssertionError):
      raise
    except Exception, e:
      logging.error('Skipping URL %r: %r', url, e)
      shard.error = db.Text('Skipping URL %r: %r: %s' %
                            (url, e, traceback.format_exc()))
    # Whether or not we were successful, we are done with this shard.
    shard.is_done = True
    shard.finished = self._now()
    shard.put()
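
# Design note: DoShard re-raises DeadlineExceededError and AssertionError so
# the task queue retries the shard, but records any other exception on the
# shard and marks it done, so a single bad URL cannot stall the whole crawl.
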

class CrawlControllerMaster(object):
  """Controls the crawl master.

  The master is responsible for starting a new crawl and for marking crawls
  as done.
  """

  def __init__(self, _crawl=None, _now=datetime.datetime.now):
    """Initializes a CrawlControllerMaster object.

    Args:
      _crawl: dependency injection of the crawl object (cap_schema.Crawl)
      _now: dependency injection of clock function that returns
          datetime.datetime object.
    """
    self._now = _now
    if _crawl:
      self._crawl = _crawl
    else:
      self._EnsureCrawl()

  def GetCrawl(self):
    """Returns the model of the current crawl.

    Returns:
      cap_schema.Crawl object, or None if there is no crawl in progress.
    """
    return self._crawl

  def _EnsureCrawl(self):
    """Retrieves the crawl in progress.

    As master, we create a new one if necessary. If there is an existing
    crawl that is done, we mark it so.

    Postconditions:
      self._crawl is assigned cap_schema.Crawl object, or None if there is
      nothing to crawl.
    """
    self._crawl = self._CrawlInProgress()
    if self._crawl:
      # There is a crawl in progress, so check if there are any more shards.
      if self._AreMoreShardsInProgress():
        logging.debug('Waiting for other workers')
        return
      else:
        # Mark this crawl done before starting a new one.
        self._CrawlIsDone()
    else:
      # We're the master, so start a new crawl.
      self._NewCrawl()

  def _AreMoreShardsInProgress(self):
    """Determines if any worker has pending work.

    Returns:
      True, iff there are incomplete shards for the current crawl.
    """
    return bool(self._QueryAnyIncompleteShard())

  def _QueryAnyIncompleteShard(self):
    """Finds an incomplete shard for any worker in the current crawl.

    Returns:
      db.Key object for cap_schema.CrawlShard object, or None if there are no
      incomplete shards.
    """
    crawl = self._crawl
    assert crawl
    shard = db.GqlQuery(
        'SELECT __key__ FROM CrawlShard WHERE crawl = :1 AND is_done = :2',
        crawl, False).get()
    return shard

  def _CrawlInProgress(self):
    """Returns the most recent Crawl that is still in progress.

    Returns:
      cap_schema.Crawl object, or None if all existing crawls are complete.
    """
    query = cap_schema.Crawl.gql(
        'WHERE is_done = :1 ORDER BY started DESC', False)
    crawl = query.get()
    if crawl:
      # We have an unfinished crawl, so use it.
      logging.debug('Found crawl in progress (started %s)', crawl.started)
      return crawl
    else:
      return None

  def _CrawlIsDone(self):
    """Called when we detect that the current crawl is complete.

    Postconditions:
      Current Crawl.is_done is set to True.
      Current Crawl.finished is set to now.
      Current Crawl is saved.
      All relevant Feed.last_crawl references are set to the current crawl.
    """
    logging.debug('Crawl is done')
    # Update the Datastore.
    crawl = self._crawl
    assert crawl
    crawl.is_done = True
    crawl.finished = self._now()
    crawl.put()
    # Update the feeds with the last completed crawl.
    for feed in cap_schema.Feed.gql('WHERE url IN :1', crawl.feed_urls):
      feed.last_crawl = crawl
      feed.put()

  def _NewCrawl(self):
    """Starts a new crawl.

    Postconditions:
      self._crawl is assigned cap_schema.Crawl object, or None if there is
      nothing to crawl.
    """
    # Figure out what feeds we'll be crawling (if any).
    crawl_started = self._now()
    feeds = self._GetFeeds(crawl_started)
    if not feeds:
      logging.debug('No feeds to crawl')
      self._crawl = None
      return
    # Start a new crawl.
    crawl = cap_schema.Crawl()
    self._crawl = crawl
    crawl.started = crawl_started
    crawl.feed_urls = [feed.url for feed in feeds]
    crawl.put()
    logging.debug('Started new crawl at %s', crawl.started)
    # Create one shard per Feed.
    for feed in feeds:
      _EnqueuePush(crawl, feed, feed.url)

  def _GetFeeds(self, crawl_started):
    """Returns a list of all feeds to be crawled.

    The crawl period for each feed is considered, relative to the specified
    start time of the current crawl.

    Args:
      crawl_started: Timestamp when the current crawl started (datetime)

    Returns:
      List of cap_schema.Feed objects
    """
    query = cap_schema.Feed.gql(
        'WHERE is_crawlable = :1 AND is_root = :2', True, True)
    feeds = []
    for feed in query:
      last_crawl = db_util.SafelyDereference(feed, 'last_crawl')
      if last_crawl:
        last_crawled = last_crawl.started
      else:
        last_crawled = None
      # If we've never crawled this feed, or we haven't crawled it recently,
      # add it to the list.
      if not last_crawled or crawl_started > last_crawled + datetime.timedelta(
          minutes=feed.crawl_period_in_minutes):
        feeds.append(feed)
    return feeds
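
# For example (illustrative), a feed with crawl_period_in_minutes = 60 whose
# last crawl started at 10:00 is skipped by a crawl starting at 10:30, but is
# included again by a crawl starting at 11:30.
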

def CrawlNudge(_now=datetime.datetime.now):
  """Seeds the task queues for a crawl.

  Args:
    _now: dependency injection of clock function that returns
        datetime.datetime object.

  Returns:
    CrawlControllerMaster object
  """
  controller = CrawlControllerMaster(_now=_now)
  return controller

class CrawlNudgeHandler(webapp.RequestHandler):
  """Responds to requests from cron to initiate a new crawl."""

  def get(self):
    controller = CrawlNudge()
    # We're done! If we have a crawl record, wrap it in a ShadowCrawl to
    # provide the Django template access to extended functionality.
    crawl = controller.GetCrawl()
    if crawl:
      crawl = cap_schema.ShadowCrawl(crawl)
    webapp_util.WriteTemplate(self.response, 'crawl.html', dict(crawl=crawl))

def CrawlPush(crawl_key, feed_key, url):
  """Decodes the keys and maybe pushes a shard of work.

  Args:
    crawl_key: Key (unicode or str) of a cap_schema.Crawl
    feed_key: Key (unicode or str) of a cap_schema.Feed
    url: URL that needs to be crawled (unicode or str)

  Returns:
    (shard, is_new)
    shard: cap_schema.CrawlShard object, possibly new.
    is_new: If True, shard was just added.
  """
  crawl = cap_schema.Crawl.get(db.Key(crawl_key))
  if not crawl:
    logging.error('No crawl having key %r', crawl_key)
    return
  feed = cap_schema.Feed.get(db.Key(feed_key))
  if not feed:
    logging.error('No feed having key %r', feed_key)
    return
  return _MaybePushShard(crawl, feed, url)

class CrawlPushHandler(webapp.RequestHandler):
  """Responds to requests to push a URL into the crawl queue."""

  def get(self):
    crawl_key = self.request.get('crawl')
    feed_key = self.request.get('feed')
    url = self.request.get('url')
    if crawl_key and feed_key and url:
      CrawlPush(crawl_key, feed_key, url)
    else:
      logging.error('Must specify crawl, feed, and url')

def CrawlWorker(shard_key, _now=datetime.datetime.now):
  """Performs a single shard of crawl work.

  Args:
    shard_key: Key of a cap_schema.CrawlShard
    _now: dependency injection of clock function that returns
        datetime.datetime object.

  Returns:
    CrawlControllerWorker object
  """
  controller = CrawlControllerWorker(shard_key, _now=_now)
  controller.DoShard()
  return controller

class CrawlWorkerHandler(webapp.RequestHandler):
  """Responds to requests from the task queue to crawl a URL."""

  def get(self):
    shard_key = self.request.get('shard')
    if shard_key:
      CrawlWorker(shard_key)
    else:
      logging.error('Must specify shard')

application = webapp.WSGIApplication(
    [('/crawl', CrawlNudgeHandler),
     ('/crawlpush', CrawlPushHandler),
     ('/crawlworker', CrawlWorkerHandler),
    ],
    debug=True)

def main():
  run_wsgi_app(application)


if __name__ == '__main__':
  main()