crawler.py
#!/usr/bin/env python
""" This is a modified version of James Mills' original recipe. """
import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
import hashlib
from cgi import escape
from traceback import format_exc
from Queue import Queue, Empty as QueueEmpty
from bs4 import BeautifulSoup
class Link (object):

    def __init__(self, src, dst, link_type):
        self.src = src
        self.dst = dst
        self.link_type = link_type

    def __hash__(self):
        return hash((self.src, self.dst, self.link_type))

    def __eq__(self, other):
        return (self.src == other.src and
                self.dst == other.dst and
                self.link_type == other.link_type)

    def __str__(self):
        return self.src + " -> " + self.dst


class Crawler(object):

    def __init__(self, root, depth_limit, confine=None, exclude=[], locked=True, filter_seen=True):
        self.root = root
        self.host = urlparse.urlparse(root)[1]

        ## Data for filters:
        self.depth_limit = depth_limit   # Max depth (number of hops from root)
        self.locked = locked             # Limit search to a single host?
        self.confine_prefix = confine    # Limit search to this prefix
        self.exclude_prefixes = exclude  # URL prefixes NOT to visit

        self.urls_seen = set()           # Used to avoid putting duplicates in queue
        self.urls_remembered = set()     # For reporting to user
        self.visited_links = set()       # Used to avoid re-processing a page
        self.links_remembered = set()    # For reporting to user

        self.num_links = 0               # Links found (and not excluded by filters)
        self.num_followed = 0            # Links followed

        # Pre-visit filters: only visit a URL if it passes these tests
        self.pre_visit_filters = [self._prefix_ok,
                                  self._exclude_ok,
                                  self._not_visited,
                                  self._same_host]

        # Out-url filters: when examining a visited page, only process
        # links whose target passes all of these filters.
        if filter_seen:
            self.out_url_filters = [self._prefix_ok,
                                    self._same_host]
        else:
            self.out_url_filters = []
    def _pre_visit_url_condense(self, url):
        """ Reduce (condense) URLs into some canonical form before
        visiting.  All occurrences of equivalent URLs are treated as
        identical.

        All this does is strip the "fragment" component from URLs,
        so that http://foo.com/blah.html#baz becomes
        http://foo.com/blah.html """

        base, frag = urlparse.urldefrag(url)
        return base

    ## URL filtering functions.  These all use information from the
    ## state of the Crawler to evaluate whether a given URL should be
    ## used in some context.  A return value of True indicates that the
    ## URL should be used.

    def _prefix_ok(self, url):
        """Pass if the URL has the correct prefix, or none is specified."""
        return (self.confine_prefix is None or
                url.startswith(self.confine_prefix))

    def _exclude_ok(self, url):
        """Pass if the URL does not match any exclude patterns."""
        prefixes_ok = [not url.startswith(p) for p in self.exclude_prefixes]
        return all(prefixes_ok)

    def _not_visited(self, url):
        """Pass if the URL has not already been visited."""
        return (url not in self.visited_links)

    def _same_host(self, url):
        """Pass if the URL is on the same host as the root URL."""
        try:
            host = urlparse.urlparse(url)[1]
            return re.match(".*%s" % self.host, host)
        except Exception, e:
            print >> sys.stderr, "ERROR: Can't process url '%s' (%s)" % (url, e)
            return False
    def crawl(self):
        """ Main function in the crawling process.  The core algorithm is:

            q <- starting page
            while q not empty:
                url <- q.get()
                if url is new and suitable:
                    page <- fetch(url)
                    q.put(urls found in page)
                else:
                    nothing

        "New and suitable" means that we don't re-visit URLs we have
        already fetched, and that user-supplied criteria like the maximum
        search depth are checked. """

        q = Queue()
        q.put((self.root, 0))

        while not q.empty():
            this_url, depth = q.get()

            # Non-URL-specific filter: discard anything over the depth limit
            if depth > self.depth_limit:
                continue

            # Apply URL-based filters.
            do_not_follow = [f for f in self.pre_visit_filters if not f(this_url)]

            # Special-case depth 0 (starting URL)
            if depth == 0 and [] != do_not_follow:
                print >> sys.stderr, "Whoops! Starting URL %s rejected by the following filters: %s" % (this_url, do_not_follow)

            # If no filters failed (that is, all passed), process the URL.
            if [] == do_not_follow:
                try:
                    self.visited_links.add(this_url)
                    self.num_followed += 1
                    page = Fetcher(this_url)
                    page.fetch()
                    for link_url in [self._pre_visit_url_condense(l) for l in page.out_links()]:
                        if link_url not in self.urls_seen:
                            q.put((link_url, depth + 1))
                            self.urls_seen.add(link_url)

                        do_not_remember = [f for f in self.out_url_filters if not f(link_url)]
                        if [] == do_not_remember:
                            self.num_links += 1
                            self.urls_remembered.add(link_url)
                            link = Link(this_url, link_url, "href")
                            if link not in self.links_remembered:
                                self.links_remembered.add(link)
                except Exception, e:
                    print >> sys.stderr, "ERROR: Can't process url '%s' (%s)" % (this_url, e)
                    #print format_exc()
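
# Illustrative usage (not part of the original recipe): Crawler can be driven
# directly from Python instead of through main(); example.com below is just a
# placeholder URL.
#
#     c = Crawler("http://example.com/", depth_limit=1,
#                 confine="http://example.com/", exclude=[])
#     c.crawl()
#     for link in c.links_remembered:
#         print link                   # Link.__str__ gives "src -> dst"
#     print len(c.urls_seen), "URLs seen"
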
class OpaqueDataException (Exception):

    def __init__(self, message, mimetype, url):
        Exception.__init__(self, message)
        self.mimetype = mimetype
        self.url = url


class Fetcher(object):
    """The name Fetcher is a slight misnomer: this class retrieves and interprets web pages."""

    def __init__(self, url):
        self.url = url
        self.out_urls = []

    def __getitem__(self, x):
        return self.out_urls[x]

    def out_links(self):
        return self.out_urls

    #def _addHeaders(self, request):
    #    request.add_header("User-Agent", AGENT)

    def _open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)
    def fetch(self):
        request, handle = self._open()
        #self._addHeaders(request)
        if handle:
            try:
                data = handle.open(request)
                mime_type = data.info().gettype()
                url = data.geturl()
                if mime_type != "text/html":
                    raise OpaqueDataException("Not interested in files of type %s" % mime_type,
                                              mime_type, url)
                content = unicode(data.read(), "utf-8",
                                  errors="replace")
                soup = BeautifulSoup(content)
                tags = soup('a')
            except urllib2.HTTPError, error:
                if error.code == 404:
                    print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except urllib2.URLError, error:
                print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except OpaqueDataException, error:
                print >> sys.stderr, "Skipping %s, has type %s" % (error.url, error.mimetype)
                tags = []
            for tag in tags:
                href = tag.get("href")
                if href is not None:
                    url = urlparse.urljoin(self.url, escape(href))
                    if url not in self:
                        self.out_urls.append(url)
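
# Illustrative usage (placeholder URL): Fetcher can also be used on its own to
# pull the <a href> targets out of a single page, which is essentially what
# getLinks() below does for the -l option.
#
#     page = Fetcher("http://example.com/")
#     page.fetch()
#     for u in page.out_links():
#         print u
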
def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    #for i, url in enumerate(page):
    #    print "%d. %s" % (i, url)
    j = 1
    for i, url in enumerate(page):
        if url.find("http") >= 0:
            print "%d. %s" % (j, url)
            j = j + 1
def parse_options():
    """parse_options() -> opts, args

    Parse any command-line options given, returning both
    the parsed options and arguments.
    """

    parser = optparse.OptionParser()

    parser.add_option("-q", "--quiet",
                      action="store_true", default=False, dest="quiet",
                      help="Enable quiet mode")

    parser.add_option("-l", "--links",
                      action="store_true", default=False, dest="links",
                      help="Get links for specified url only")

    parser.add_option("-d", "--depth",
                      action="store", type="int", default=30, dest="depth_limit",
                      help="Maximum depth to traverse")

    parser.add_option("-c", "--confine",
                      action="store", type="string", dest="confine",
                      help="Confine crawl to specified prefix")

    parser.add_option("-x", "--exclude", action="append", type="string",
                      dest="exclude", default=[], help="Exclude URLs by prefix")

    parser.add_option("-L", "--show-links", action="store_true", default=False,
                      dest="out_links", help="Output links found")

    parser.add_option("-u", "--show-urls", action="store_true", default=False,
                      dest="out_urls", help="Output URLs found")

    parser.add_option("-D", "--dot", action="store_true", default=False,
                      dest="out_dot", help="Output Graphviz dot file")

    opts, args = parser.parse_args()

    if len(args) < 1:
        parser.print_help(sys.stderr)
        raise SystemExit, 1

    if opts.out_links and opts.out_urls:
        parser.print_help(sys.stderr)
        parser.error("options -L and -u are mutually exclusive")

    return opts, args
class DotWriter:
    """ Formats a collection of Link objects as a Graphviz (Dot)
    graph.  Mostly, this means creating a node for each URL with a
    name which Graphviz will accept, and declaring links between those
    nodes."""

    def __init__(self):
        self.node_alias = {}

    def _safe_alias(self, url, silent=False):
        """Translate URLs into unique strings guaranteed to be safe as
        node names in the Graphviz language.  Currently, that is based
        on the md5 digest, in hexadecimal."""

        if url in self.node_alias:
            return self.node_alias[url]
        else:
            m = hashlib.md5()
            m.update(url)
            name = "N" + m.hexdigest()
            self.node_alias[url] = name
            if not silent:
                print "\t%s [label=\"%s\"];" % (name, url)
            return name

    def asDot(self, links):
        """ Render a collection of Link objects as a Dot graph."""

        print "digraph Crawl {"
        print "\t edge [K=0.2, len=0.1];"
        for l in links:
            print "\t" + self._safe_alias(l.src) + " -> " + self._safe_alias(l.dst) + ";"
        print "}"
def main():
    opts, args = parse_options()

    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth_limit = opts.depth_limit
    confine_prefix = opts.confine
    exclude = opts.exclude

    sTime = time.time()

    print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    if opts.out_urls:
        print "\n".join(crawler.urls_seen)

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])

    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found:    %d" % crawler.num_links
    print >> sys.stderr, "Followed: %d" % crawler.num_followed
    print >> sys.stderr, "Stats:    (%d/s after %0.2fs)" % (
        int(math.ceil(float(crawler.num_links) / tTime)), tTime)


if __name__ == "__main__":
    main()
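
# Example invocations of the other output modes (illustrative; example.com is
# a placeholder URL and the paths are hypothetical):
#
#     python crawler.py -d 2 -u http://example.com/                 # print every URL seen
#     python crawler.py -d 2 -L http://example.com/                 # print "src -> dst" links
#     python crawler.py -l http://example.com/                      # links on the given page only
#     python crawler.py -c http://example.com/docs/ -x http://example.com/docs/old/ \
#         -d 3 -u http://example.com/docs/                          # confine and exclude by prefix
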