-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_rxives.py
38 lines (35 loc) · 1.42 KB
/
scrape_rxives.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Script that looks through Covid19 preprints and checks whether
# they have been peer-reviewed and published in a journal
# Author: Lars Hubatsch, [email protected]
from bs4 import BeautifulSoup
from numpy import savetxt
from openpyxl import load_workbook
from selenium import webdriver
# Here goes the path to your excel file
wb = load_workbook('/Users/hubatsch/Desktop/Covid19.xlsx')
ws = wb['Sheet1']
# Loop through all rows in the excel file and take the hyperlink stored in
# the first column. Load the linked page and check whether its 'pub_jnl'
# div contains 'Now published in'. One number per row is appended to
# `published`, depending on the outcome:
# 0 ... no link in row
# 1 ... link in row with peer-reviewed version
# -1 ... link in row without peer-reviewed version
#        (this includes pages that have no 'pub_jnl' div at all)
published = []
# One browser is shared by all rows. It is started lazily, so a sheet
# without any links never launches it, and it is always shut down in the
# `finally` clause below so that no browser process is left behind.
driver = None
try:
    for row in range(1, ws.max_row + 1):
        try:
            # Cells without a hyperlink have `hyperlink` set to None, so the
            # attribute access raises AttributeError and the row is
            # recorded as 0 by the handler below.
            base_url = ws.cell(row=row, column=1).hyperlink.target
            print(base_url)
            if driver is None:
                driver = webdriver.PhantomJS()
                # Set once here instead of after every page load.
                # NOTE(review): implicit waits presumably only affect
                # element lookups, not `page_source` -- confirm whether an
                # explicit wait is needed for slow pages.
                driver.implicitly_wait(100)
            driver.get(base_url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            journal_divs = soup.find_all('div', attrs={'class': 'pub_jnl'})
            # A page without the div cannot carry the marker; treat it like
            # "marker not found" instead of failing on journal_divs[0].
            if journal_divs:
                marker_pos = str(journal_divs[0]).find('Now published in')
            else:
                marker_pos = -1
            print(marker_pos)
            published.append(-1 if marker_pos == -1 else 1)
        except (ValueError, AttributeError):
            print('No link in row ' + str(row))
            published.append(0)
finally:
    if driver is not None:
        driver.quit()
# Write one integer per line, in row order.
savetxt('published.txt', published, fmt='%i',)