-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler-pagination.js
122 lines (92 loc) · 3.75 KB
/
crawler-pagination.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
// Load the API key from the "api_key" field of config.json in the working directory.
const API_KEY = JSON.parse(fs.readFileSync("config.json", "utf8")).api_key;
// Do not print the secret itself (logs are often shared or persisted);
// only report whether a key was found.
console.log("api key loaded:", Boolean(API_KEY));
/**
 * Builds the list of values from `start` up to, but not including, `end`,
 * stepping by one.
 *
 * @param {number} start - First value of the sequence.
 * @param {number} end - Exclusive upper bound.
 * @returns {number[]} The generated sequence; empty when `start` is not below `end`.
 */
function range(start, end) {
    const values = [];
    let current = start;
    while (current < end) {
        values.push(current);
        current++;
    }
    return values;
}
/**
 * Reads the fields of a single search-result card and assembles them into a record.
 *
 * @param {object} page - Page object; only its `evaluate(fn, element)` method is used.
 * @param {object} divCard - Element handle of the card; queried with `$()`.
 * @returns {Promise<object>} Record with `name`, `sponsored`, `stars`, `rank`,
 *     `review_count` and `url` keys.
 */
async function extractSearchData(page, divCard) {
    const cardText = await page.evaluate(element => element.textContent, divCard);
    const img = await divCard.$("img");
    const name = await page.evaluate(element => element.getAttribute("alt"), img);
    const nameRemoved = cardText.replace(name, "");

    // A card whose remaining text does not start with something numeric carries
    // no rank prefix and is treated as sponsored (an empty text counts as well).
    const sponsored = Number.isNaN(Number(nameRemoved[0]));
    let rank = 0;
    if (!sponsored) {
        // The rank is whatever precedes the first "." in the remaining text.
        const rankParts = nameRemoved.split(".");
        rank = Number(rankParts[0]);
    }

    let rating = 0.0;
    const ratingElement = await divCard.$("div span[data-font-weight='semibold']");
    if (ratingElement) {
        const ratingText = await page.evaluate(element => element.textContent, ratingElement);
        if (ratingText.length > 0) {
            rating = Number(ratingText);
        }
    }

    let reviewCount = "0";
    if (cardText.includes("review")) {
        // Take the token right after the first "(" (up to a space or parenthesis).
        // When the text has no "(", keep "0" instead of throwing and forcing a retry.
        const countMatch = /\(([^() ]*)/.exec(cardText);
        if (countMatch) {
            reviewCount = countMatch[1];
        }
    }

    const aElement = await divCard.$("a");
    const link = await page.evaluate(element => element.getAttribute("href"), aElement);
    const yelpUrl = `https://www.yelp.com${link.replace("https://proxy.scrapeops.io", "")}`;

    return {
        name: name,
        sponsored: sponsored,
        stars: rating,
        rank: rank,
        review_count: reviewCount,
        url: yelpUrl
    };
}

/**
 * Fetches one page of Yelp search results and logs a record for every result card.
 *
 * The page is attempted up to `retries + 1` times; each attempt opens a fresh
 * tab that is always closed again. Errors raised during an attempt are logged,
 * not rethrown.
 *
 * @param {object} browser - Browser object; `newPage()` is called once per attempt.
 * @param {string} keyword - Search term, sent as the `find_desc` query parameter.
 * @param {number} pageNumber - Page index; multiplied by 10 to form the `start` offset.
 * @param {string} [location="us"] - Value of the `find_loc` query parameter.
 * @param {number} [retries=3] - Number of additional attempts after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
    // The URL does not change between attempts, so build it once.
    // URLSearchParams encodes every space as "+" and escapes reserved characters.
    const searchUrl = new URL("https://www.yelp.com/search");
    searchUrl.searchParams.set("find_desc", keyword);
    searchUrl.searchParams.set("find_loc", location);
    searchUrl.searchParams.set("start", String(pageNumber * 10));
    const url = searchUrl.href;

    let tries = 0;
    let success = false;

    while (tries <= retries && !success) {
        const page = await browser.newPage();
        try {
            await page.goto(url);
            console.log(`Successfully fetched: ${url}`);

            const divCards = await page.$$("div[data-testid='serp-ia-card']");
            for (const divCard of divCards) {
                const searchData = await extractSearchData(page, divCard);
                console.log(searchData);
            }
            success = true;
        } catch (err) {
            console.log(`Error: ${err}, tries left ${retries - tries}`);
            tries++;
        } finally {
            await page.close();
        }
    }
}
/**
 * Launches a browser and scrapes the requested number of result pages for a
 * keyword, one page after another.
 *
 * @param {string} keyword - Search term passed to each page scrape.
 * @param {number} pages - Number of result pages to scrape, starting at page 0.
 * @param {string} location - Location passed to each page scrape.
 * @param {number} retries - Retry count passed to each page scrape.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, pages, location, retries) {
    const pageList = range(0, pages);
    const browser = await puppeteer.launch();
    try {
        for (const page of pageList) {
            await scrapeSearchResults(browser, keyword, page, location, retries);
        }
    } finally {
        // Close the browser even when a page scrape rejects, so it is never left running.
        await browser.close();
    }
}
/**
 * Script entry point: crawls the search results for every configured keyword
 * in turn and records the CSV filename associated with each keyword.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const keywords = ["restaurants"];
    // Not read anywhere in this function yet.
    const concurrencyLimit = 4;
    const pages = 1;
    const location = "uk";
    const retries = 3;
    const aggregateFiles = [];

    for (const keyword of keywords) {
        console.log("Crawl starting");
        await startScrape(keyword, pages, location, retries);
        console.log("Crawl complete");
        // Replace every space, not just the first, so multi-word keywords
        // produce a filename without spaces.
        aggregateFiles.push(`${keyword.replace(/ /g, "-")}.csv`);
    }
}

// Report a failed crawl and exit non-zero instead of leaving the promise unhandled.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});