forked from edwardlanto/node-scraper-amazon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pageScraper.js
93 lines (82 loc) · 2.97 KB
/
pageScraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
const fs = require('fs');

const scraperObject = {
  // Amazon.ca category listing page that seeds the scrape.
  url:
    "https://www.amazon.ca/b/?_encoding=UTF8&node=6646928011&bbn=2206275011&ref_=Oct_s9_apbd_odnav_hd_bw_b2QVmV9_0&pf_rd_r=93MDVBSQ6G1NXBMGGH8M&pf_rd_p=e06c3dcb-a950-537a-a8b3-c08632afe364&pf_rd_s=merchandised-search-10&pf_rd_t=BROWSE&pf_rd_i=2224025011",

  /**
   * Scrape up to 5 products linked from `this.url` and write the results
   * to `products.js` as a JSON array of
   * `{ title, brand, descriptions, imgURls }` objects.
   *
   * @param {import('puppeteer').Browser} browser - an already-launched
   *   Puppeteer browser instance; this function opens and closes its own
   *   pages but never closes the browser itself.
   */
  async scraper(browser) {
    const page = await browser.newPage();
    console.log(`Navigating to ${this.url}...`);
    // networkidle0 waits until the page has fully rendered before we query it.
    await page.goto(this.url, { waitUntil: "networkidle0" });
    await page.waitForSelector(".a-unordered-list");

    // Collect product detail-page links from the listing. Some matched
    // sections may lack an <a>, so filter those out instead of throwing
    // on `null.href` as the original did.
    const urls = await page.$$eval(".a-list-item > .a-section", (sections) =>
      sections
        .map((el) => el.querySelector("a"))
        .filter((a) => a !== null)
        .map((a) => a.href)
    );

    // Scrape one product page. A plain async function replaces the original
    // `new Promise(async (resolve, reject) => ...)` anti-pattern, whose
    // internal rejections were silently swallowed.
    const scrapeProduct = async (link) => {
      const newPage = await browser.newPage();
      try {
        // Wait until the DOM has rendered to avoid racing the selectors.
        await newPage.goto(link, { waitUntil: "networkidle0" });

        const dataObj = {};
        dataObj.title = await newPage.$eval("#productTitle", (el) =>
          el.textContent.replace(/(\r\n\t|\n|\r|\t)/gm, "")
        );
        dataObj.brand = await newPage.$eval(
          "#bylineInfo",
          (el) => el.textContent
        );

        await newPage.waitForSelector("#centerCol");
        // Feature-bullet texts, whitespace-normalized. Pure map/filter chain
        // instead of the original side-effecting `map` that pushed into an
        // outer array; also skips bullets without a <span>.
        dataObj.descriptions = await newPage.$$eval(
          "#feature-bullets li",
          (items) =>
            items
              .map((el) => el.querySelector("span"))
              .filter((span) => span !== null)
              .map((span) =>
                span.textContent.replace(/(\r\n\t|\n|\r|\t)/gm, "").trim()
              )
        );

        // Hover each thumbnail so the main image swaps in, then record it.
        const thumbnails = await newPage.$$(".imageThumbnail");
        const imgArr = [];
        for (const thumb of thumbnails) {
          await thumb.hover();
          // page.waitFor() was removed from Puppeteer; waitForTimeout is the
          // supported replacement for a fixed delay.
          await newPage.waitForTimeout(500);
          imgArr.push(
            await newPage.$eval(".selected img", (img) =>
              img.getAttribute("src")
            )
          );
        }
        // NOTE: "imgURls" is a typo in the original output schema; kept
        // as-is so existing consumers of products.js keep working.
        dataObj.imgURls = imgArr;
        return dataObj;
      } finally {
        // Always release the page — the original leaked a page whenever a
        // selector failed, because close() only ran on the success path.
        await newPage.close();
      }
    };

    const scrapedDataArr = [];
    for (const link of urls) {
      // `for...of` over values replaces the original `for (link in urls)`,
      // which iterated indices AND created an implicit global `link`.
      try {
        scrapedDataArr.push(await scrapeProduct(link));
      } catch (err) {
        // A single broken product page shouldn't abort the whole run.
        console.log(`Skipping ${link}: ${err.message}`);
      }
      if (scrapedDataArr.length === 5) break;
    }

    await page.close();

    // Write results unconditionally after the loop. The original only wrote
    // when exactly 5 items were collected, so listings with fewer products
    // produced no output file at all.
    await new Promise((resolve) => {
      fs.writeFile("products.js", JSON.stringify(scrapedDataArr), (err) => {
        if (err) {
          console.log(err);
        } else {
          console.log("successfully wrote file");
        }
        resolve();
      });
    });
  },
};

module.exports = scraperObject;