sc3.py
"""
AUTHOR: Darren Nix
Version: 0.1
Date: 2011-9-7
Site: www.darrennix.com
License: Apache 2.0

Crawls a site to find unique page URLs and returns them as a list.
Ignores query strings, badly formed URLs, and links to domains
outside of the starting domain.

Inspired by sitemap_gen by Vladimir Toncar.

DEPENDENCIES:
    BeautifulSoup (bs4) HTML parsing library
    eventlet
    cchardet
    LinkInfo (local module providing the LinkInfo class)
"""
import urlparse
from bs4 import BeautifulSoup
import cchardet as chardet
import eventlet
from eventlet.green import urllib2
import threading
from LinkInfo import LinkInfo
import socket  # if a library doesn't support a timeout arg, use socket.setdefaulttimeout(30)
import traceback
import pdb
# Global counter used to hand out unique, sequential IDs to LinkInfo objects.
COUNT = 1
mutex = threading.Lock()


class Sitemapper():

    def assignID(self, linkInfo):
        # Guard the counter with the lock so IDs stay unique even if
        # assignID is ever called from more than one thread.
        global COUNT
        with mutex:
            linkInfo.setID(COUNT)
            #print "id is ", COUNT
            COUNT += 1
    def main(self, start_url, block_extensions=['.pdf', '.gif', '.jpg', '.JPG', '.PNG', '.png', '.wav', '.mp3', '.wma'], max_urls=100):
        # Set user agent string
        opener = urllib2.build_opener()
        opener.addheaders = [
            ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Charset', 'utf-8,gbk;q=0.7,*;q=0.3'),
            #('Accept-Encoding', 'gzip,deflate,sdch'),
            ('Accept-Language', 'en-US,en,en-zh;q=0.8'),
            #('Cache-Control', 'max-age=0'),
            #('Connection', 'keep-alive')
        ]
        urllib2.install_opener(opener)

        # Get base info: keep only the scheme and netloc of the start URL
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url)
        fragments = (scheme, netloc, '', '', '', '')
        base_url = urlparse.urlunparse(fragments)
        #print "base_url -> ", base_url

        mainLink = LinkInfo(None, base_url, u'Main', 0, u'first page')
        self.assignID(mainLink)

        urls_queue = set([mainLink])
        urls_crawled = set()
        urls_crawled2 = set()

        pool = eventlet.GreenPool(20)
        counter = 0
        tmpC = 0
        while True:
            # Infinite loop sanity check
            counter += 1
            if counter > max_urls:
                break
            for url, body in pool.imap(self.fetch, urls_queue):
                # Remove this url from the queue set
                urls_queue = urls_queue - set([url])
                # Add url to crawled set
                urls_crawled = urls_crawled.union(set([url]))
                urls_crawled2 = urls_crawled2.union(set([url]))
                # Extract links
                links = self.extract_links(url, body, block_extensions)
                if links is None:
                    return urls_crawled
                # Hard cap as a second sanity check against runaway crawls
                if tmpC == 100000:
                    return urls_crawled
                tmpC += 1
                for link in links:
                    if link not in urls_queue and link not in urls_crawled:
                        # Add link to queue
                        urls_queue = urls_queue.union(set([link]))
                        print u"[valid]: link -> ", link.link
        return urls_crawled
    def fetch(self, url):
        print u"opening", url.link
        try:
            # Retry the request up to twice before giving up on this URL.
            fails = 0
            while True:
                if fails >= 2:
                    return url, False
                try:
                    page = urllib2.urlopen(url.link, timeout=30)
                except:
                    print u'timed out after 30s, retrying ..', url.link
                    fails += 1
                    continue
                break
            # Do NOT use str() here: it can only handle ASCII.
            '''When working with Unicode, four points:
            1. String literals in the application should be prefixed with u.
            2. Do NOT use str(); use unicode().
            3. Do not use the deprecated string module; it will screw things up.
            4. Do not encode/decode characters inside the application until you
               really need to, e.g. when writing to disk, a database, or the network.
            '''
            body = page.read()
            # Detect the page encoding and decode the body to unicode.
            #print chardet.detect(body)
            result = chardet.detect(body)
            try:
                #body = body.decode(result['encoding'])
                body = unicode(body, result['encoding'], 'ignore')
            except:
                print u"decode err url:%s; body: %s" % (url.link, body)
                pdb.set_trace()
                traceback.print_exc()
            finally:
                #print "decode err url:%s; body: %s" % (url, body)
                pass
            #print u"body type ", type(body)
            return url, body  #, charset
        except (urllib2.URLError, urllib2.HTTPError, Exception), detail:
            # Skip this node
            print u"ERROR %s" % detail
            print u"current wrongly decoded url is %s" % url.link
            return url, False  #, False
    def extract_links(self, url, body, block_extensions):
        print u" -- resolving ", url.link  #, u" with type(body) -> ", type(body)
        '''(base_scheme, base_netloc, base_path, base_params, base_query, base_fragment) = urlparse.urlparse(url.link)
        for blocked_extension in block_extensions:
            if base_path.find(blocked_extension) > 0:
                print u" ----- hit ---"
                return
        '''
        # fetch() returns False for the body when the request failed.
        if not isinstance(body, unicode):
            return []
        soup = BeautifulSoup(u''.join(body))
        #test code here
        #print BeautifulSoup(''.join(body)).get_text()
        #print BeautifulSoup((body), from_encoding="utf-8").get_text()
        links = soup.findAll('a')
        if links is None:
            return []
        good_links = []
        # Loop through all the links on the page
        for link in links:
            # Ignore A tags without HREFs
            try:
                #print "partial_link_url -> ", link['href']
                #partial_link_url = str(link['href'].decode('gbk').encode)
                partial_link_url = link['href']
                #if link['href'].find('www.huilanyujia.com') > 5: continue
            except KeyError:
                continue
            # Join relative urls like "../join.html" with the url currently being processed
            link_url = urlparse.urljoin(url.link, partial_link_url)
            # Strip off any trailing gibberish like ?test=1
            (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(link_url)
            fragments = (scheme, netloc, path, '', '', '')
            link_url = urlparse.urlunparse(fragments)
            # Make sure we're still on the same domain
            (base_scheme, base_netloc, base_path, base_params, base_query, base_fragment) = urlparse.urlparse(url.link)
            # Skip some file types
            blocked = False
            for blocked_extension in block_extensions:
                extension_length = len(blocked_extension)
                url_extension = path[-extension_length:]
                if url_extension == blocked_extension:
                    blocked = True
            if netloc != base_netloc:
                # Different domain
                pass
            elif blocked:
                # Bad extension
                pass
            elif len(link.text) <= 0:
                # Ignore urls without link text
                pass
            else:
                # Add this link to the list
                #print u"find link %s, [%s]" % (link_url, link.text)
                #print u"find link", link_url, u"in url:", url.link
                #print u" link and txt type : ", type(link), type(link.text)
                try:
                    #print link.text
                    pass
                except:
                    print "text err ", link, " ;in ", url.link, " ,", type(link.text)
                    r = chardet.detect(body)
                    pdb.set_trace()
                    traceback.print_exc()
                #good_links.append(link_url)
                #pdb.set_trace()
                newLink = LinkInfo(url, link_url, link.text, 0)
                self.assignID(newLink)
                #if url.parent == None:
                #    print u"pid = -1", u" id = ", newLink.id
                #else:
                #    print u"pid = ", newLink.parent.id, u" id = ", newLink.id
                good_links.append(newLink)
        return good_links
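

# Minimal usage sketch showing how the crawler above is meant to be driven.
# Assumptions (not taken from the original file): the start URL and page cap
# below are placeholders, and LinkInfo objects expose the .id and .link
# attributes already used by the crawler itself.
if __name__ == '__main__':
    mapper = Sitemapper()
    # Crawl up to 10 pages starting from the given site, using the default
    # list of blocked file extensions.
    crawled = mapper.main('http://www.example.com/', max_urls=10)
    for info in crawled:
        # Each entry is a LinkInfo; print its assigned id and url.
        print u"%s\t%s" % (info.id, info.link)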