xiaohua_spider.py
from bs4 import BeautifulSoup
import requests
import re
import os
import time

# A single sample page and the first ten pages of the picture list.
url = "http://www.xiaohuar.com/2014.html"
url_10 = ["http://www.xiaohuar.com/list-1-{}.html".format(i) for i in range(10)]

# Local directories to save the downloaded pictures into.
filename1 = "E:/xiaohua_pictures/"
filename2 = "E:/xiaohua_pictures2/"

def collect_datas(url):
    response = requests.get(url)
    # The site serves GBK-encoded pages.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    imgs = soup.select("div.img > a > img[width=210]")
    # Collect the title and link of every matched img in the datas list.
    datas = []
    # Some problematic links are site-relative and start with /d/file/.
    pattern = re.compile(r"\A/d/file/")
    for img in imgs:
        data = {
            "title": img.get("alt"),
            "img": img.get("src")
        }
        # Complete the relative links by prepending the site root.
        if pattern.match(data["img"]):
            data["img"] = "http://www.xiaohuar.com" + data["img"]
        datas.append(data)
    return datas
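
# For reference: collect_datas returns a list of dicts shaped like
# {"title": <alt text of the img tag>, "img": <absolute image URL>}.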

def save_imgs(datas, filename):
    for data in datas:
        # Be polite to the server: pause one second between downloads.
        time.sleep(1)
        r = requests.get(data["img"], stream=True)
        img_name = filename + data["title"] + ".jpg"
        with open(img_name, 'wb') as fd:
            # Stream the image to disk in 1 KB chunks.
            for chunk in r.iter_content(chunk_size=1024):
                fd.write(chunk)
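
# The titles scraped from the alt attributes can contain characters that are
# illegal in Windows file names (e.g. / \ : * ? " < > |), which would make the
# open() call above fail. A minimal sanitizing helper, sketched as an
# assumption (safe_name is not part of the original script):
def safe_name(title):
    # Replace characters Windows forbids in file names with an underscore.
    return re.sub(r'[\\/:*?"<>|]', "_", title)
# Possible usage in save_imgs: img_name = filename + safe_name(data["title"]) + ".jpg"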

# Scrape all ten list pages and download every picture found.
os.makedirs(filename2, exist_ok=True)  # make sure the target directory exists
for url in url_10:
    datas = collect_datas(url)
    print(datas)
    save_imgs(datas, filename2)
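
# The loop above assumes every request succeeds; a timeout or a 404 would
# raise and abort the whole run. A minimal guarded download, sketched as an
# assumption (this download wrapper is hypothetical, not in the original):
def download(img_url, timeout=10):
    try:
        r = requests.get(img_url, stream=True, timeout=timeout)
        r.raise_for_status()  # turn HTTP errors (404, 500, ...) into exceptions
        return r
    except requests.RequestException as e:
        print("skipping {}: {}".format(img_url, e))
        return None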