-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.js
222 lines (189 loc) · 6.8 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env node
const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer');
const program = require('commander');
const inquirer = require('inquirer');
const chalk = require('chalk');
const {URL} = require('url');
// Registers the `start` sub-command. Any setting not supplied on the command
// line is asked for interactively, then the merged settings are passed to start().
const command = program.command('start')
    .description('爬取指定的url,保存为pdf,用来爬电子书不错')
    .option('-u --url [url]', '需要爬取的url,多个url之间以'+path.delimiter+'分隔')
    .option('-l --level [level]', '爬取层级,默认爬取当前层级')
    .option('-t --target [target]', '文件保存目录')
    // `query` and `chrome` are read by the action below; declaring them lets
    // callers pass them as flags instead of always being prompted.
    .option('-q --query [query]', '下钻元素选取器')
    .option('-c --chrome [chrome]', 'chrome地址')
    .action(option => {
        const prompts = [];
        if (!option.url) {
            prompts.push({
                type: 'input',
                name: 'url',
                message: '请输入爬取的url',
                validate: function(input){
                    if (!input) {
                        return 'url不能为空';
                    }
                    try {
                        new URL(input);
                    } catch (e) {
                        return 'url不合法';
                    }
                    return true;
                }
            });
        }
        if (!option.query) {
            prompts.push({
                type: 'input',
                name: 'query',
                message: '请输入下钻元素选取器',
                default:'a'
            });
        }
        if (!option.chrome) {
            prompts.push({
                type: 'input',
                name: 'chrome',
                message: '请输入chrome地址'
            });
        }
        if (!option.level) {
            prompts.push({
                type: 'input',
                name: 'level',
                message: '请输入爬取的层级',
                default: 0,
                validate: function(input){
                    if (!/^\d+$/.test(input)) {
                        return '层级为数字';
                    }
                    return true;
                }
            });
        }
        if (!option.target) {
            prompts.push({
                type: 'input',
                name: 'target',
                message: '请输入存储目录',
                default: process.cwd()
            });
        }
        // The output format is always asked; 'pdf' is the only choice offered.
        prompts.push({
            type: 'list',
            name: 'format',
            message: '保存文件格式',
            default: 'pdf',
            choices: ['pdf']
        });
        inquirer.prompt(prompts)
            .then(answers => start(Object.assign(option, answers)))
            .catch(err => {
                // Report prompt/start failures instead of leaving an unhandled rejection.
                outputErrorMsg(String(err && err.message ? err.message : err));
                process.exitCode = 1;
            });
    });
// NOTE(review): this call runs unconditionally every time the script is loaded,
// independent of the CLI arguments, with a fixed URL and a Windows Chrome path.
// It looks like a leftover development invocation — confirm whether it should stay.
start({
    url: 'http://eggjs.org/zh-cn/tutorials/index.html',
    level: 1,
    format: 'pdf',
    target: process.cwd(),
    chrome: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
    query: '#mobileAside dl dd a'
}).catch(err => {
    // Without a handler, a failed launch (e.g. the Chrome path above does not
    // exist on this machine) surfaces as an unhandled promise rejection.
    outputErrorMsg(String(err && err.message ? err.message : err));
});
/**
 * Runs one crawl job: prepares the browser and URL queue via `init`, then
 * hands both to the crawl loop in `work`.
 *
 * @param {object} config - Crawl settings (url, level, target, chrome, query, format).
 * @returns {Promise<void>} Settles when the crawl loop has finished; rejects if
 *   `init` or `work` rejects.
 */
async function start(config){
    const {urls, browser} = await init(config);
    // Awaited so that a crawl failure rejects the promise returned to the caller
    // instead of being lost on a detached promise.
    await work(config, urls, browser);
}
/**
 * Crawl loop. Takes entries off the front of `urls`, saves each page as a PDF
 * and appends the links found on it (one level deeper) to the back of the queue.
 * Entries whose level exceeds `config.level` are skipped. The browser is closed
 * when the queue is empty or when the loop aborts with an error.
 *
 * @param {object} config - Crawl settings; `level` and `query` are read here.
 * @param {Array<{url: string, level: number, name: string, filePath: string}>} urls
 *   Work queue; mutated in place.
 * @param {object} browser - Puppeteer browser returned by `puppeteer.launch`.
 * @returns {Promise<void>}
 */
async function work(config, urls, browser){
    try {
        while (urls.length > 0) {
            const urlObj = urls.shift();
            if (urlObj.level > config.level) {
                continue;
            }
            const page = await browser.newPage();
            try {
                const links = await savePage(page, urlObj, config);
                for (const link of links) {
                    urls.push({
                        url: link.url,
                        level: urlObj.level + 1,
                        name: link.name,
                        filePath: path.join(urlObj.filePath, encodeFileName(link.name))
                    });
                }
            } catch (err) {
                // A failure on one page is reported and the crawl moves on to the next entry.
                outputErrorMsg(formatMsg(urlObj, String(err && err.message ? err.message : err)));
            } finally {
                await page.close();
            }
        }
    } finally {
        await browser.close();
    }
}

/**
 * Loads one queue entry in `page`, writes it to disk as a PDF and returns the
 * links matched by `config.query`. Fills in `urlObj.name` / `urlObj.filePath`
 * from the page title when the entry has no name yet.
 *
 * @returns {Promise<Array<{url: string, name: string}>>} Links to enqueue; empty
 *   when navigation failed or the children would exceed `config.level`.
 */
async function savePage(page, urlObj, config){
    try {
        await page.goto(urlObj.url, {timeout: 100000});
    } catch (e) {
        outputErrorMsg(formatMsg(urlObj, '网络请求超时'));
        return [];
    }
    if (!urlObj.name) {
        const title = await page.evaluate(() => {
            const titleEl = document.querySelector('title');
            return titleEl ? titleEl.text : '';
        });
        // Pages without a usable <title> fall back to their URL as the name.
        urlObj.name = title || urlObj.url;
        // Sanitised like the child directories so characters such as '/' or ':'
        // in a title cannot produce an unintended or invalid path.
        urlObj.filePath = path.join(urlObj.filePath, encodeFileName(urlObj.name));
    }
    createPath(urlObj.filePath);
    const fullPath = path.join(urlObj.filePath, encodeFileName(urlObj.name) + '.pdf');
    // Must finish before the caller closes the page, otherwise rendering is cut short.
    await page.pdf({path: fullPath, format: 'A4'});
    outputMsg(formatMsg(urlObj, '生成文件:' + fullPath));
    if (urlObj.level + 1 > config.level) {
        // Children would be skipped by the level check in work(); don't collect them.
        return [];
    }
    return page.evaluate(query => {
        return Array.from(document.querySelectorAll(query)).map(function($a){
            return {
                url: $a.href.trim(),
                name: $a.text
            };
        });
    }, config.query);
}
/**
 * Makes a string usable as a file or directory name by replacing each of
 * \ : * ? / " < > . | with an underscore.
 *
 * @param {*} str - Name to sanitise. `null`/`undefined` are treated as an empty
 *   string and other non-strings are converted with `String()`, so a link
 *   without text does not throw.
 * @returns {string} The sanitised name.
 */
function encodeFileName(str){
    const text = str == null ? '' : String(str);
    return text.replace(/[\\:*?/"<>\.|]/g, '_');
}
/**
 * Validation hook for a queue entry. Currently a stub: neither argument is
 * inspected and every entry is accepted.
 *
 * @param {object} urlObj - Queue entry (unused).
 * @param {Function} errorMsgHandler - Error reporter (unused).
 * @returns {boolean} Always `true`.
 */
function validate(urlObj, errorMsgHandler){
    const accepted = true;
    return accepted;
}
/**
 * Builds a log line as a JSON string carrying the entry's URL and a message.
 *
 * @param {object} urlObj - Queue entry; only its `url` property is used.
 * @param {string} msg - Message text.
 * @returns {string} JSON of the form {"url": ..., "msg": ...}.
 */
function formatMsg(urlObj, msg){
    const {url} = urlObj;
    const payload = {url, msg};
    return JSON.stringify(payload);
}
/**
 * Prints an error message to the console in red.
 *
 * @param {string} msg - Text to print.
 */
function outputErrorMsg(msg){
    const painted = chalk.red(msg);
    console.log(painted);
}
/**
 * Prints an informational message to the console in white.
 *
 * @param {string} msg - Text to print.
 */
function outputMsg(msg){
    const painted = chalk.white(msg);
    console.log(painted);
}
async function init(config){
var launchConfig = {
headless: true
};
if(config.chrome){
launchConfig.executablePath = config.chrome;
}
var browser = await puppeteer.launch(launchConfig)
var urls = config.url.split(path.delimiter);
return {
browser: browser,
urls:urls.map(function(url){
return {
url: url,
level: 0,
name: '',
filePath: config.target
}
}
)}
}
// Parse the command-line arguments with commander; the `start` command
// registered earlier in this file is matched against them.
program.parse(process.argv);
/**
 * Ensures that the directory `filePath` exists, creating any missing parent
 * directories. Does nothing for an empty path or when the directory already exists.
 *
 * @param {string} filePath - Directory path, absolute or relative to the
 *   current working directory.
 */
function createPath(filePath){
    if (!filePath) {
        return;
    }
    // Recursive mkdir keeps the root of an absolute path intact and also creates
    // the first segment of a relative path; it is a no-op for existing directories.
    fs.mkdirSync(filePath, {recursive: true});
}