crawl.py
# -*- coding: utf-8 -*-
# @Author: lock
# @Date: 2017-12-15 18:23:22
# @Last Modified by: lock
# @Last Modified time: 2017-12-16 00:47:42
import collections
import os
import random
import time
from urllib.parse import urljoin

import requests
from lxml import html
from PIL import Image
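# Third-party dependencies (assumed installed): requests, lxml and Pillow,
# e.g. via `pip install requests lxml pillow`.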
URL = "https://www.xxx.top"  # site to crawl
# domain whitelist: only links starting with one of these prefixes are followed
URL_RULE = [
    'https://www.xxx.top',
    'https://xxx.top',
    'http://www.xxx.top',
    'http://xxx.top'
]
# directory where downloaded images are stored
CRAWL_IMAGES_DIR = 'crawl_images'
# size filter in pixels: images no larger than this are discarded
WIDTH = 30
HEIGHT = 30
# set True if the target site runs WAF protection; requests are then slowed down
WAF = False
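# Note: the WAF option only throttles request timing (random sleeps below);
# it does not rotate User-Agents or proxies.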
###########################################################
# queue of URLs waiting to be crawled
url_queue = collections.deque()
url_queue.append(URL)
# URLs already crawled (or enqueued)
url_crawled = set()
url_crawled.add(URL)
# image URLs already downloaded
url_image = set()
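# Crawl strategy: breadth-first. Pages are popped from the left of the
# deque and newly discovered in-scope links are appended on the right;
# the two sets above ensure each page and image is fetched at most once.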
# single-threaded crawl
while url_queue:
    print("Url queue length is %d" % len(url_queue))
    url = url_queue.popleft()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url, headers=headers)
    parsed_body = html.fromstring(response.content)
    images = parsed_body.xpath('//img/@src')
    # resolve relative src attributes against the page URL
    images = {urljoin(response.url, src) for src in images}
    print("Found %d images in %s" % (len(images), url))
    # download any image not fetched before (set difference skips duplicates)
    for image in images - url_image:
        try:
            r = requests.get(image, headers=headers, timeout=10)
            filename = image.split('/')[-1]
            if not os.path.exists(CRAWL_IMAGES_DIR):
                os.makedirs(CRAWL_IMAGES_DIR)
            # timestamp prefix keeps same-named files from overwriting each other
            timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
            file_path = "%s/%s-%s" % (CRAWL_IMAGES_DIR, timestamp, filename)
            # write in binary mode: image payloads are bytes, not text
            with open(file_path, 'wb') as f:
                f.write(r.content)
            img = Image.open(file_path)
            width, height = img.size
            if width <= WIDTH or height <= HEIGHT:
                # below the size threshold (icons, tracking pixels): discard
                os.remove(file_path)
            else:
                print(file_path)
            url_image.add(image)
            if WAF:
                time.sleep(random.randint(1, 2))
        except requests.RequestException as e:
            print("request failed: %s" % e)
        except IOError:
            # Image.open raises IOError when the payload is not a valid image
            print("cannot open image")
        except Exception as e:
            print(e)
    # collect every absolute link on the page
    links = {urljoin(response.url, href) for href in parsed_body.xpath('//a/@href')
             if urljoin(response.url, href).startswith('http')}
    for link in links - url_crawled:
        for url_rule in URL_RULE:
            if link.startswith(url_rule):
                url_crawled.add(link)
                url_queue.append(link)
                break  # one match is enough; avoid enqueuing twice
    if WAF:
        time.sleep(random.randint(1, 2))
print('all done')
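# Usage sketch: point URL / URL_RULE above at the target site, then run
# `python crawl.py`; kept images land in ./crawl_images.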