"""
A robot exclusion rules parser for Python by Philip Semanchuk
Full documentation, examples and a comparison to Python's robotparser module
reside here:
http://NikitaTheSpider.com/python/rerp/
This code is released under the Python Software Foundation License:
http://www.python.org/psf/license/
Comments, bug reports, etc. are most welcome via email to:
Simple usage examples:
import robotexclusionrulesparser
rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
try:
rerp.fetch('http://www.example.com/robots.txt')
except:
# See the documentation for expected errors
pass
if rerp.is_allowed('CrunchyFrogBot', '/foo.html'):
print "It is OK to fetch /foo.html"
OR supply the contents of robots.txt yourself:
rerp = RobotExclusionRulesParser()
s = file("robots.txt").read()
rerp.parse(s)
if rerp.is_allowed('CrunchyFrogBot', '/foo.html'):
print "It is OK to fetch /foo.html"
The function is_expired() tells you if you need to fetch a fresh copy of
this robots.txt.
if rerp.is_expired():
# Get a new copy
pass
RobotExclusionRulesParser supports __str__() so you can print an instance to
see its rules in robots.txt format.
The comments refer to MK1994, MK1996 and GYM2008. These are:
MK1994 = the 1994 robots.txt draft spec (http://www.robotstxt.org/orig.html)
MK1996 = the 1996 robots.txt draft spec (http://www.robotstxt.org/norobots-rfc.txt)
GYM2008 = the Google-Yahoo-Microsoft extensions announced in 2008
(http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360)
"""
import urllib
import urllib2
import urlparse
import re
import time
import email.utils
import calendar
# These are the different robots.txt syntaxes that this module understands.
# Hopefully this list will never have more than two elements.
MK1996 = 1
GYM2008 = 2
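# Illustrative note (added here, not the original author's): the choice matters
# in is_allowed()/is_url_allowed(). Under MK1996 a rule such as
# "Disallow: /*.gif$" is compared literally with a startswith() test and so
# effectively blocks nothing, while under GYM2008 "*" acts as a wildcard and
# a trailing "$" anchors the end of the path, so the same rule blocks any
# path ending in ".gif".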
_end_of_line_regex = re.compile(r"(?:\r\n)|\r|\n")
# This regex is a little more generous than the spec because it accepts
# "User-agent" or "Useragent" (without a dash). MK1994/96 permits only the
# former. The regex also doesn't insist that "useragent" is at the exact
# beginning of the line, which makes this code immune to confusion caused
# by byte order markers.
_directive_regex = re.compile("(allow|disallow|user[-]?agent|sitemap|crawl-delay):[ \t]*(.*)", re.IGNORECASE)
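# For example (illustrative inputs), _directive_regex.findall() returns
# [("Disallow", "/tmp/")] for the line "Disallow: /tmp/" and
# [("Useragent", "FooBot")] for "Useragent: FooBot", even though MK1994/96
# spell the directive "User-agent".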
# This is the number of seconds in a week that I use to determine the default
# expiration date defined in MK1996.
SEVEN_DAYS = 60 * 60 * 24 * 7
# This controls the max number of bytes read in as a robots.txt file. This
# is just a bit of defensive programming in case someone accidentally sends
# an ISO file in place of their robots.txt. (It happens...) Suggested by
# Dima Brodsky.
MAX_FILESIZE = 100 * 1024 # 100k
# Control characters are everything < 0x20, plus 0x7F (DEL).
_control_characters_regex = re.compile(r"""[\000-\037\177]""")
# Charset extraction regex for pulling the encoding (charset) out of a
# content-type header.
_charset_extraction_regex = re.compile(r"""charset=['"]?(?P<encoding>[^'"]*)['"]?""")
def _unquote_path(path):
# MK1996 says, 'If a %xx encoded octet is encountered it is unencoded
# prior to comparison, unless it is the "/" character, which has
# special meaning in a path.'
path = re.sub("%2[fF]", "\n", path)
path = urllib.unquote(path)
return path.replace("\n", "%2F")
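# Example of the behavior above (illustrative values, not from the spec):
#   _unquote_path("/a%2Fb%20c")  ->  "/a%2Fb c"
# %20 is decoded to a space, but %2F stays encoded because "/" is
# significant within a path.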
def _scrub_data(s):
# Data is either a path or user agent name; i.e. the data portion of a
# robots.txt line. Scrubbing it consists of (a) removing extraneous
# whitespace, (b) turning tabs into spaces (path and UA names should not
# contain tabs), and (c) stripping control characters which, like tabs,
# shouldn't be present. (See MK1996 section 3.3 "Formal Syntax".)
s = _control_characters_regex.sub("", s)
s = s.replace("\t", " ")
return s.strip()
def _parse_content_type_header(header):
media_type = ""
encoding = ""
# A typical content-type looks like this:
# text/plain; charset=UTF-8
# The portion after "text/plain" is optional and often not present.
# ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7
if header:
header = header.strip().lower()
else:
header = ""
# I use a lambda here instead of passing strip directly because I don't
# know if I'm dealing with str or unicode objects.
chunks = map(lambda s: s.strip(), header.split(";"))
media_type = chunks[0]
if len(chunks) > 1:
for parameter in chunks[1:]:
m = _charset_extraction_regex.search(parameter)
if m and m.group("encoding"):
encoding = m.group("encoding")
return media_type.strip(), encoding.strip()
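# For example (illustrative input):
#   _parse_content_type_header("text/plain; charset=UTF-8")
# returns ("text/plain", "utf-8"); the header is lowercased before the media
# type and charset are extracted.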
class RobotExclusionRulesParser(object):
def __init__(self):
self._source_url = ""
self.user_agent = None
self.use_local_time = True
self.expiration_date = self._now() + SEVEN_DAYS
self._response_code = None
self._sitemap = None
self.__rulesets = [ ]
# source_url is read-only.
def __get_source_url(self): return self._source_url
def __set_source_url(self, foo): raise AttributeError, "source_url is read-only"
source_url = property(__get_source_url, __set_source_url)
# response_code is read-only.
def __get_response_code(self): return self._response_code
def __set_response_code(self, foo): raise AttributeError, "response_code is read-only"
response_code = property(__get_response_code, __set_response_code)
# sitemap is read-only.
def __get_sitemap(self): return self._sitemap
def __set_sitemap(self, foo): raise AttributeError, "sitemap is read-only"
sitemap = property(__get_sitemap, __set_sitemap)
def _now(self):
if self.use_local_time:
return time.time()
else:
# What the heck is timegm() doing in the calendar module?!?
return calendar.timegm(time.gmtime())
def is_expired(self):
return self.expiration_date <= self._now()
def is_allowed(self, user_agent, url, syntax=GYM2008):
# The robot rules are stored internally as Unicode. The two lines
# below ensure that the parameters passed to this function are also
# Unicode. If those lines were not present and the caller passed a
# non-Unicode user agent or URL string to this function, Python would
# silently convert it to Unicode before comparing it to the robot
# rules. Such conversions use the default encoding (usually US-ASCII)
# and if the string couldn't be converted using that encoding, Python
# would raise a UnicodeError later on in the guts of this code which
# would be confusing. Converting the strings to Unicode here doesn't
# make the problem go away but it does make the conversion explicit
# so that failures are easier to understand.
if not isinstance(user_agent, unicode): user_agent = unicode(user_agent)
if not isinstance(url, unicode): url = unicode(url)
if syntax not in (MK1996, GYM2008):
raise ValueError, "Syntax must be MK1996 or GYM2008"
for ruleset in self.__rulesets:
if ruleset.does_user_agent_match(user_agent):
return ruleset.is_url_allowed(url, syntax)
return True
def get_crawl_delay(self, user_agent):
# See is_allowed() comment about the explicit unicode conversion.
if not isinstance(user_agent, unicode): user_agent = unicode(user_agent)
for ruleset in self.__rulesets:
if ruleset.does_user_agent_match(user_agent):
return ruleset.crawl_delay
return None
def fetch(self, url):
# ISO-8859-1 is the default encoding for text files per the specs for
# HTTP 1.0 (RFC 1945 sec 3.6.1) and HTTP 1.1 (RFC 2616 sec 3.7.1).
# ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
encoding = "iso-8859-1"
content = ""
expires_header = None
content_type_header = None
self._response_code = None
self._source_url = url
if self.user_agent:
req = urllib2.Request(url, None, { 'User-Agent' : self.user_agent })
else:
req = urllib2.Request(url)
try:
f = urllib2.urlopen(req)
content = f.read(MAX_FILESIZE)
# As of Python 2.5, f.info() looks like it returns the HTTPMessage
# object created during the connection.
expires_header = f.info().getheader("expires")
content_type_header = f.info().getheader("Content-Type")
# As of Python 2.4, this file-like object reports the response
# code, too.
if hasattr(f, "code"):
self._response_code = f.code
else:
self._response_code = 200
f.close()
except urllib2.URLError, ErrorInstance:
# Not all errors return an HTTP response code.
if hasattr(ErrorInstance, "code"):
self._response_code = ErrorInstance.code
# MK1996 section 3.4 says, "...robots should take note of Expires
# header set by the origin server. If no cache-control directives
# are present robots should default to an expiry of 7 days".
# This code is lazy and looks at the Expires header but not
# Cache-Control directives.
self.expiration_date = None
if self._response_code >= 200 and self._response_code < 300:
# All's well.
if expires_header:
self.expiration_date = email.utils.parsedate_tz(expires_header)
if self.expiration_date:
# About time zones -- the call to parsedate_tz() returns a
# 10-tuple with the time zone offset in the 10th element.
# There are 3 valid formats for HTTP dates, and one of
# them doesn't contain time zone information. (UTC is
# implied since all HTTP header dates are UTC.) When given
# a date that lacks time zone information, parsedate_tz()
# returns None in the 10th element. mktime_tz() interprets
# None in the 10th (time zone) element to mean that the
# date is *local* time, not UTC.
# Therefore, if the HTTP timestamp lacks time zone info
# and I run that timestamp through parsedate_tz() and pass
# it directly to mktime_tz(), I'll get back a local
# timestamp which isn't what I want. To fix this, I simply
# convert a time zone of None to zero. It's much more
# difficult to explain than to fix. =)
# ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
if self.expiration_date[9] == None:
self.expiration_date = self.expiration_date[:9] + (0,)
self.expiration_date = email.utils.mktime_tz(self.expiration_date)
if self.use_local_time:
# I have to do a little more converting to get this
# UTC timestamp into localtime.
self.expiration_date = time.mktime(time.gmtime(self.expiration_date))
#else:
# The expires header was garbage.
if not self.expiration_date: self.expiration_date = self._now() + SEVEN_DAYS
if (self._response_code >= 200) and (self._response_code < 300):
# All's well.
media_type, encoding = _parse_content_type_header(content_type_header)
# RFC 2616 sec 3.7.1 --
# When no explicit charset parameter is provided by the sender,
# media subtypes of the "text" type are defined to have a default
# charset value of "ISO-8859-1" when received via HTTP.
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
if not encoding:
encoding = "iso-8859-1"
elif self._response_code in (401, 403):
# 401 or 403 ==> Go away or I will taunt you a second time!
# (according to MK1996)
content = "User-agent: *\nDisallow: /\n"
elif self._response_code == 404:
# No robots.txt ==> everyone's welcome
content = ""
else:
# Uh-oh. I punt this up to the caller.
raise urllib2.URLError(self._response_code)
# Unicode decoding errors are another point of failure that I punt
# up to the caller.
try:
content = unicode(content, encoding)
except UnicodeError:
raise UnicodeError("Robots.txt contents are not in the encoding expected (%s)." % encoding)
except (LookupError, ValueError):
# LookupError ==> Python doesn't have a decoder for that encoding.
# One can also get a ValueError here if the encoding starts with
# a dot (ASCII 0x2e). See Python bug 1446043 for details. This
# bug was supposedly fixed in Python 2.5.
raise UnicodeError("I don't understand the encoding \"%s\"." % encoding)
# Now that I've fetched the content and turned it into Unicode, I
# can parse it.
self.parse(content)
def parse(self, s):
self._sitemap = None
self.__rulesets = [ ]
if not isinstance(s, unicode): s = unicode(s, "iso-8859-1")
# Normalize newlines.
s = _end_of_line_regex.sub("\n", s)
lines = s.split("\n")
previous_line_was_a_user_agent = False
current_ruleset = None
for line in lines:
line = line.strip()
if line and line[0] == '#':
# "Lines containing only a comment are discarded completely,
# and therefore do not indicate a record boundary." (MK1994)
pass
else:
# Remove comments
i = line.find("#")
if i != -1: line = line[:i]
line = line.strip()
if not line:
# An empty line indicates the end of a ruleset.
if current_ruleset and current_ruleset.is_not_empty():
self.__rulesets.append(current_ruleset)
current_ruleset = None
previous_line_was_a_user_agent = False
else:
# Each non-empty line falls into one of six categories:
# 1) User-agent: blah blah blah
# 2) Disallow: blah blah blah
# 3) Allow: blah blah blah
# 4) Crawl-delay: blah blah blah
# 5) Sitemap: blah blah blah
# 6) Everything else
# 1 - 5 are interesting and I find them with the regex
# below. Category 6 I discard as directed by MK1994
# ("Unrecognised headers are ignored.")
# Note that 4 & 5 are specific to GYM2008 syntax, but
# respecting them here is not a problem. They're just
# additional information that the caller is free to ignore.
matches = _directive_regex.findall(line)
# Categories 1 - 5 produce two matches, #6 produces none.
if matches:
field, data = matches[0]
field = field.lower()
data = _scrub_data(data)
# Matching "useragent" is a deviation from the
# MK1994/96 which permits only "user-agent".
if field in ("useragent", "user-agent"):
if previous_line_was_a_user_agent:
# Add this UA to the current ruleset
if current_ruleset and data:
current_ruleset.add_robot_name(data)
else:
# Save the current ruleset and start a new one.
if current_ruleset and current_ruleset.is_not_empty():
self.__rulesets.append(current_ruleset)
#else:
# (is_not_empty() == False) ==> malformed
# robots.txt listed a UA line but provided
# no name or didn't provide any rules
# for a named UA.
current_ruleset = _Ruleset()
if data:
current_ruleset.add_robot_name(data)
previous_line_was_a_user_agent = True
elif field == "allow":
previous_line_was_a_user_agent = False
if current_ruleset:
current_ruleset.add_allow_rule(data)
elif field == "sitemap":
previous_line_was_a_user_agent = False
self._sitemap = data
elif field == "crawl-delay":
# Only Yahoo documents the syntax for Crawl-delay.
# ref: http://help.yahoo.com/l/us/yahoo/search/webcrawler/slurp-03.html
previous_line_was_a_user_agent = False
if current_ruleset:
try:
current_ruleset.crawl_delay = float(data)
except ValueError:
# Invalid crawl-delay -- ignore.
pass
else:
# This is a disallow line
previous_line_was_a_user_agent = False
if current_ruleset:
current_ruleset.add_disallow_rule(data)
if current_ruleset and current_ruleset.is_not_empty():
self.__rulesets.append(current_ruleset)
# Now that I have all the rulesets, I want to order them in a way
# that makes comparisons easier later. Specifically, any ruleset that
# contains the default user agent '*' should go at the end of the list
# so that I only apply the default as a last resort. According to
# MK1994/96, there should only be one ruleset that specifies * as the
# user-agent, but you know how these things go.
not_defaults = filter(lambda ruleset: not ruleset.is_default(), self.__rulesets) or [ ]
defaults = filter(lambda ruleset: ruleset.is_default(), self.__rulesets) or [ ]
self.__rulesets = not_defaults + defaults
def __str__(self):
if self._sitemap:
s = "Sitemap: %s\n\n" % self._sitemap
else:
s = ""
return s + '\n'.join( [str(ruleset) for ruleset in self.__rulesets] )
class _Ruleset(object):
""" _Ruleset represents a set of allow/disallow rules (and possibly a
crawl delay) that apply to a set of user agents.
Users of this module don't need this class. It's available at the module
level only because RobotExclusionRulesParser() instances can't be
pickled if _Ruleset isn't visible at the module level.
"""
ALLOW = 1
DISALLOW = 2
def __init__(self):
self.robot_names = [ ]
self.rules = [ ]
self.crawl_delay = None
def __str__(self):
d = { self.ALLOW : "Allow", self.DISALLOW : "Disallow" }
s = ''.join( ["User-agent: %s\n" % name for name in self.robot_names] )
if self.crawl_delay:
s += "Crawl-delay: %s\n" % self.crawl_delay
s += ''.join( ["%s: %s\n" % (d[rule_type], path) for rule_type, path in self.rules] )
return s.encode("utf-8")
def add_robot_name(self, bot):
self.robot_names.append(bot)
def add_allow_rule(self, path):
self.rules.append((self.ALLOW, _unquote_path(path)))
def add_disallow_rule(self, path):
self.rules.append((self.DISALLOW, _unquote_path(path)))
def is_not_empty(self):
return bool(len(self.rules)) and bool(len(self.robot_names))
def is_default(self):
return bool('*' in self.robot_names)
def does_user_agent_match(self, user_agent):
match = False
for robot_name in self.robot_names:
# MK1994 says, "A case insensitive substring match of the name
# without version information is recommended." MK1996 3.2.1
# states it even more strongly: "The robot must obey the first
# record in /robots.txt that contains a User-Agent line whose
# value contains the name token of the robot as a substring.
# The name comparisons are case-insensitive."
match = match or (robot_name == '*') or \
(robot_name.lower() in user_agent.lower())
return match
def is_url_allowed(self, url, syntax=GYM2008):
allowed = True
# Schemes and host names are not part of the robots.txt protocol,
# so I ignore them. It is the caller's responsibility to make
# sure they match.
scheme, host, path, parameters, query, fragment = urlparse.urlparse(url)
url = urlparse.urlunparse(("", "", path, parameters, query, fragment))
url = _unquote_path(url)
done = False
i = 0
while not done:
rule_type, path = self.rules[i]
if (syntax == GYM2008) and ("*" in path or path.endswith("$")):
# GYM2008-specific syntax applies here
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360
if path.endswith("$"):
appendix = "$"
path = path[:-1]
else:
appendix = ""
parts = path.split("*")
pattern = "%s%s" % \
(".*".join([re.escape(p) for p in parts]), appendix)
if re.match(pattern, url):
# Ding!
done = True
allowed = (rule_type == self.ALLOW)
else:
# Wildcards are either not present or are taken literally.
if url.startswith(path):
# Ding!
done = True
allowed = (rule_type == self.ALLOW)
# A blank path means "nothing", so that effectively
# negates the value above.
# e.g. "Disallow: " means allow everything
if not path:
allowed = not allowed
i += 1
if i == len(self.rules):
done = True
return allowed
class RobotFileParserLookalike(RobotExclusionRulesParser):
def __init__(self, url = ""):
RobotExclusionRulesParser.__init__(self)
self._user_provided_url = ""
self.last_checked = None
self.set_url(url)
def set_url(self, url):
# I don't want to stuff this into self._source_url because
# _source_url is set only as a side effect of calling fetch().
self._user_provided_url = url
def read(self):
RobotExclusionRulesParser.fetch(self, self._user_provided_url)
def parse(self, lines):
RobotExclusionRulesParser.parse(self, ''.join(lines))
def can_fetch(self, user_agent, url, syntax=GYM2008):
return RobotExclusionRulesParser.is_allowed(self, user_agent, url, syntax)
def mtime(self):
return self.last_checked
def modified(self):
self.last_checked = time.time()
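

if __name__ == "__main__":
    # Minimal offline demo (added for illustration; not part of the original
    # module). The robots.txt text and robot names below are made up. It
    # exercises parse(), is_allowed(), get_crawl_delay() and the GYM2008
    # wildcard syntax without touching the network.
    sample = """
User-agent: CrunchyFrogBot
Disallow: /private/
Disallow: /*.gif$
Crawl-delay: 2.5

User-agent: *
Disallow:
"""
    rerp = RobotExclusionRulesParser()
    rerp.parse(sample)
    print rerp.is_allowed("CrunchyFrogBot", "/index.html")       # True
    print rerp.is_allowed("CrunchyFrogBot", "/private/a.html")   # False
    print rerp.is_allowed("CrunchyFrogBot", "/images/logo.gif")  # False (GYM2008 wildcard)
    print rerp.is_allowed("SomeOtherBot", "/private/a.html")     # True ("Disallow:" with no path allows everything)
    print rerp.get_crawl_delay("CrunchyFrogBot")                 # 2.5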