-
Notifications
You must be signed in to change notification settings - Fork 1
/
RobotsFetcher.py
69 lines (60 loc) · 2.28 KB
/
RobotsFetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import robotparser
from urlparse import urlparse
import pickle
import os
import pycurl
import StringIO
from robotexclusionrulesparser import RobotExclusionRulesParser
class RobotsFetcher:
    """Answer robots.txt permission queries, caching parsed rules on disk.

    All methods are static; the class is only a namespace.  Parsed rule
    sets are pickled into the relative directory ``robots`` (resolved
    against the current working directory), one file per host.
    """

    @staticmethod
    def can_fetch(url):
        """Return whether the wildcard user agent ("*") may fetch *url*.

        The host's rules are obtained through :meth:`get_robots`, so the
        first query for a host may perform a network request.
        """
        domain = RobotsFetcher.extract_domain(url)
        robots_parser = RobotsFetcher.get_robots(domain)
        return robots_parser.is_allowed("*", url)

    @staticmethod
    def extract_domain(url):
        """Return the network-location part of *url* (host, plus port if any).

        A URL without a scheme has no network location and yields ``''``.
        """
        return urlparse(url).netloc

    @staticmethod
    def get_robots(url):
        """Return the robots.txt rule parser for the host *url*.

        *url* is a bare host name (as produced by :meth:`extract_domain`),
        not a full URL.  If a cached parser exists in the ``robots``
        directory it is unpickled and returned.  Otherwise
        ``http://<host>/robots.txt`` is downloaded, parsed, cached and
        returned.

        When the download fails the error is printed and a parser built
        from empty rules is returned; that fallback is deliberately not
        cached, so the next call tries the download again.
        """
        robots_directory = 'robots'
        robots_file_path = robots_directory + '/' + url

        if os.path.isfile(robots_file_path):
            # NOTE(security): unpickling runs code embedded in the file, so
            # the cache directory must only ever be writable by trusted users.
            with open(robots_file_path, "rb") as robots_file:
                return pickle.load(robots_file)

        buffer = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(c.URL, 'http://' + url + '/robots.txt')
            c.setopt(c.REFERER, '')
            c.setopt(c.USERAGENT, 'Curl')
            c.setopt(c.FOLLOWLOCATION, 1)
            c.setopt(c.WRITEFUNCTION, buffer.write)
            c.perform()
        except pycurl.error as e:
            # Output matches the former two-argument print statements.
            print("Error code:  %s" % (e.args[0],))
            print("Error message:  %s" % (e.args[1],))
            robots_parser = RobotExclusionRulesParser()
            robots_parser.parse('')
            return robots_parser
        finally:
            # Release the handle on every path, including unexpected errors.
            c.close()

        robots_parser = RobotExclusionRulesParser()
        robots_parser.parse(buffer.getvalue())

        # The cache directory may not exist on the first run.
        if not os.path.isdir(robots_directory):
            os.makedirs(robots_directory)
        with open(robots_file_path, "wb") as robots_file:
            pickle.dump(robots_parser, robots_file)
        return robots_parser
#robot_fetcher = RobotsFetcher()
#print robot_fetcher.can_fetch('http://www.googlediscovery.com/feed/');
#print robot_fetcher.can_fetch('http://www.googlediscovery.com/feeds/');
#print robot_fetcher.can_fetch('http://www.googlediscovery.com/trackback/');
#rp = robotparser.RobotFileParser()
#rp.set_url("http://www.googlediscovery.com/robots.txt")
#rp.read()
#print rp.can_fetch("*", "http://www.googlediscovery.com/")
#print rp.can_fetch("*", "http://www.googlediscovery.com/wp-admin/")