-
Notifications
You must be signed in to change notification settings - Fork 0
/
papers_scrapping.js
79 lines (67 loc) · 2.88 KB
/
papers_scrapping.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/**
 * Synchronously GET a URL and parse the response body as HTML.
 *
 * The markup is parsed into a <div> that is never attached to the page, so
 * callers can run querySelector/querySelectorAll on it without altering the
 * visible document.
 *
 * @param {string|Element} theUrl - URL to request. Callers in this file also
 *     pass link elements directly; XMLHttpRequest.open() stringifies its
 *     argument (for an anchor element that is presumably its href).
 * @returns {Element[]} A one-element array holding the detached <div> that
 *     wraps the parsed response.
 * @throws {Error} If the server answers with a non-2xx status, so an error
 *     page is never mistaken for real content.
 */
function httpGet(theUrl){
    const xmlHttp = new XMLHttpRequest();
    xmlHttp.open("GET", theUrl, false); // false for synchronous request
    xmlHttp.send(null);

    // Fail here with a clear message instead of letting callers hit a
    // confusing null-dereference while querying an error page.
    if (xmlHttp.status < 200 || xmlHttp.status >= 300) {
        throw new Error("GET " + theUrl + " failed with HTTP status " + xmlHttp.status);
    }

    const htmlObject = document.createElement('div');
    htmlObject.innerHTML = xmlHttp.responseText;

    // Callers index the result with [0], so keep the one-element array shape.
    return [htmlObject];
}
/**
 * Offer `exportObj` to the user as a downloadable JSON file.
 *
 * The object is serialized with JSON.stringify, embedded in a data: URI and
 * downloaded by programmatically clicking a temporary anchor element.
 *
 * @param {*} exportObj - Value to serialize.
 * @param {string} exportName - File name without the ".json" extension.
 */
function downloadObjectAsJson(exportObj, exportName){
    const encodedJson = encodeURIComponent(JSON.stringify(exportObj));
    const anchorAttributes = {
        href: `data:text/json;charset=utf-8,${encodedJson}`,
        download: exportName + ".json",
    };

    const tempLink = document.createElement('a');
    for (const [attrName, attrValue] of Object.entries(anchorAttributes)) {
        tempLink.setAttribute(attrName, attrValue);
    }

    // The anchor has to be in the document before click() (required for firefox).
    document.body.appendChild(tempLink);
    tempLink.click();
    tempLink.remove();
}
/**
 * Scrape every paper linked from one results view and download each as JSON.
 *
 * For each element matching "pharos-heading a" in `currentView`, the linked
 * page is fetched with httpGet, its title, authors and summary are read, and
 * the result is saved through downloadObjectAsJson as
 * "paper_<index>_page_number_<pageNumber>.json".
 *
 * A failure on one paper is logged with its cause and does not stop the
 * remaining papers of the view from being processed.
 *
 * @param {Document|Element} currentView - DOM root of the results page.
 * @param {number} pageNumber - Page index, used in logs and file names.
 * @param {string} page_url - URL of the results page; stored as `url` in each
 *     exported record (this is the listing page, not the paper's own link).
 */
function getPapersData(currentView, pageNumber, page_url){
    const paperLinks = currentView.querySelectorAll("pharos-heading a");
    for (let i = 0; i < paperLinks.length; i++) {
        try {
            console.log("---------------------------------------");
            console.log("Title number " + i.toString() + " Page " + pageNumber.toString());
            const response = httpGet(paperLinks[i])[0]; // list with one element
            const paper = {
                url: page_url,
                title: response.querySelector(".title-font").innerText.trim(),
                authors: response.querySelector(".author-font").innerText.trim(),
                // One array entry per line of the summary text.
                summary: response.querySelector("[class*='turn-away-content__summary']").innerText.trim().split("\n"),
            };
            console.log("############### DOWNLOADING ################");
            downloadObjectAsJson(paper, "paper_" + i.toString() + "_page_number_" + pageNumber.toString());
        } catch (e) {
            // Report which paper failed and why instead of hiding the cause.
            console.error("error while requesting data (title number " + i + ", page " + pageNumber + ")", e);
        }
    }
}
// Pagination state. Kept as top-level `var` bindings with their original names.
var allviews = [] // all pages to request
// When something goes wrong, set this to the page to resume from. It only
// offsets the counter used in logs and file names; scraping always starts
// from the page currently open in the browser.
var pages_offest = 0
var view = document
var total_pages = 100 // total pages to request (100 is the max. This is a site problem)
var pages_counter = 0 + pages_offest
// URL of the results page being requested. Declared up front so the error
// handler below can always read it, even when the failure happens before a
// URL has been found.
var nextUrl = null

// Get data from first view
console.log("---------------------------------------")
console.log("getting view number " + pages_counter.toString())
getPapersData(view, pages_counter, window.location.href)
console.log("done!")

// Handle pagination.
// `<` instead of `!==` so the loop still terminates when pages_offest is
// larger than total_pages.
while (pages_counter < total_pages) {
    pages_counter += 1
    nextUrl = null // never report a stale URL from the previous iteration
    try {
        console.log("---------------------------------------")
        console.log("getting view number " + pages_counter.toString())
        nextUrl = view.querySelectorAll("#next-page")[1].href // [div, div] --> elem 0 does not work well
        view = httpGet(nextUrl)[0] // list with one element
        getPapersData(view, pages_counter, nextUrl)
    } catch (e) {
        console.error(e)
        alert("error while requesting view number " + pages_counter.toString())
        alert("url is: \n" + (nextUrl === null ? "(next-page link not found)" : nextUrl))
        if (nextUrl !== null) {
            window.open(nextUrl)
        }
        // `view` was not advanced, so continuing would re-request the same
        // page under the wrong page number and repeat the alerts every
        // iteration. Stop here; resume manually from the reported page.
        break
    }
}