-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler-storage.js
142 lines (109 loc) · 4.3 KB
/
crawler-storage.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Third-party: headless Chromium driver used to render Yelp search pages.
const puppeteer = require("puppeteer");
// Third-party: object-based CSV writer factory.
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
// NOTE(review): csv-parse is required but never used in this file — confirm before removing.
const csvParse = require("csv-parse");
const fs = require("fs");
// Load the API key (presumably for the ScrapeOps proxy referenced below) from config.json.
// NOTE(review): API_KEY is never referenced elsewhere in this file — confirm it is needed.
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
// SECURITY NOTE(review): this prints the secret API key to stdout/logs — consider removing.
console.log("api key:", API_KEY);
/**
 * Append an array of flat record objects to a CSV file, writing the header
 * row only when the file does not already exist.
 *
 * @param {Object[]} data - Non-empty array of records; the keys of data[0]
 *   define the CSV columns.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @returns {Promise<void>}
 * @throws {Error} If data is missing/empty, or if the underlying write fails
 *   (the original failure is attached as `cause`).
 */
async function writeToCsv(data, outputFile) {
    if (!data || data.length === 0) {
        throw new Error("No data to write!");
    }
    const fileExists = fs.existsSync(outputFile);
    // Derive the column list from the first record's keys.
    const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
    const csvWriter = createCsvWriter({
        path: outputFile,
        header: headers,
        // Append (and skip re-writing the header) when the file already exists.
        append: fileExists,
    });
    try {
        await csvWriter.writeRecords(data);
    } catch (e) {
        // Preserve the underlying failure instead of discarding it.
        throw new Error("Failed to write to csv", { cause: e });
    }
}
/**
 * Build an array of consecutive integers from start (inclusive) to end
 * (exclusive). Returns an empty array when end <= start.
 *
 * @param {number} start - First value of the sequence.
 * @param {number} end - Exclusive upper bound.
 * @returns {number[]}
 */
function range(start, end) {
    return Array.from({ length: end - start }, (_, offset) => start + offset);
}
/**
 * Scrape one page of Yelp search results for a keyword and append each
 * result card to "<keyword>.csv" via writeToCsv. Retries the whole page on
 * any error, up to `retries` additional attempts.
 *
 * @param {import('puppeteer').Browser} browser - Shared puppeteer browser.
 * @param {string} keyword - Search term (spaces are URL-encoded as '+').
 * @param {number} pageNumber - Zero-based result page (Yelp paginates by 10).
 * @param {string} [location="us"] - Yelp `find_loc` query value.
 * @param {number} [retries=3] - Extra attempts after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
    let tries = 0;
    let success = false;
    // Encode every space, not just the first (String.replace with a string
    // pattern only substitutes the first occurrence).
    const formattedKeyword = keyword.split(" ").join("+");
    while (tries <= retries && !success) {
        // Created inside try so a newPage() failure is retried like any other.
        let page;
        try {
            page = await browser.newPage();
            const url = `https://www.yelp.com/search?find_desc=${formattedKeyword}&find_loc=${location}&start=${pageNumber*10}`;
            await page.goto(url);
            console.log(`Successfully fetched: ${url}`);
            const divCards = await page.$$("div[data-testid='serp-ia-card']");
            for (const divCard of divCards) {
                const cardText = await page.evaluate(element => element.textContent, divCard);
                const img = await divCard.$("img");
                const name = await page.evaluate(element => element.getAttribute("alt"), img);
                const nameRemoved = cardText.replace(name, "");
                // Organic results start with their numeric rank ("1. …");
                // sponsored cards do not.
                const sponsored = isNaN(nameRemoved[0]);
                let rank = 0;
                if (!sponsored) {
                    // Was an implicit global in the original; keep it local.
                    const rankString = nameRemoved.split(".");
                    rank = Number(rankString[0]);
                }
                let rating = 0.0;
                const hasRating = await divCard.$("div span[data-font-weight='semibold']");
                if (hasRating) {
                    const ratingText = await page.evaluate(element => element.textContent, hasRating);
                    if (ratingText.length > 0) {
                        rating = Number(ratingText);
                    }
                }
                // Review count appears as "(123 reviews)" inside the card text.
                let reviewCount = "0";
                if (cardText.includes("review")) {
                    reviewCount = cardText.split("(")[1].split(")")[0].split(" ")[0];
                }
                const aElement = await divCard.$("a");
                const link = await page.evaluate(element => element.getAttribute("href"), aElement);
                // Strip the proxy prefix if present so we store the real Yelp URL.
                const yelpUrl = `https://www.yelp.com${link.replace("https://proxy.scrapeops.io", "")}`;
                const searchData = {
                    name: name,
                    sponsored: sponsored,
                    stars: rating,
                    rank: rank,
                    review_count: reviewCount,
                    url: yelpUrl
                };
                await writeToCsv([searchData], `${keyword.replace(" ", "-")}.csv`);
            }
            success = true;
        } catch (err) {
            console.log(`Error: ${err}, tries left ${retries - tries}`);
            tries++;
        } finally {
            // page may be undefined if newPage() itself failed.
            if (page) {
                await page.close();
            }
        }
    }
}
/**
 * Launch a browser and sequentially scrape `pages` result pages for one
 * keyword, always closing the browser when done.
 *
 * @param {string} keyword - Search term passed through to scrapeSearchResults.
 * @param {number} pages - Number of result pages (0..pages-1) to scrape.
 * @param {string} location - Yelp `find_loc` query value.
 * @param {number} retries - Per-page retry budget.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, pages, location, retries) {
    const pageList = range(0, pages);
    const browser = await puppeteer.launch();
    try {
        for (const page of pageList) {
            await scrapeSearchResults(browser, keyword, page, location, retries);
        }
    } finally {
        // Release the Chromium process even if a page scrape throws;
        // the original leaked the browser on error.
        await browser.close();
    }
}
/**
 * Entry point: crawl each configured keyword and record the CSV filename it
 * produced. Configuration (keywords, pages, location, retries) is inline.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const keywords = ["restaurants"];
    const pages = 1;
    const location = "uk";
    const retries = 3;
    // Output files produced by each crawl (one CSV per keyword).
    // NOTE(review): collected but not consumed in this file — presumably for
    // a later aggregation step; confirm.
    const aggregateFiles = [];
    for (const keyword of keywords) {
        console.log("Crawl starting");
        await startScrape(keyword, pages, location, retries);
        console.log("Crawl complete");
        aggregateFiles.push(`${keyword.replace(" ", "-")}.csv`);
    }
}
// The original left this promise floating; surface failures and set a
// non-zero exit code instead of an unhandled rejection.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});