-
Notifications
You must be signed in to change notification settings - Fork 0
/
usingDOI_download_Scihub.py
114 lines (89 loc) · 3.71 KB
/
usingDOI_download_Scihub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
Use DOIs to download full-text PDFs from Sci-Hub.

(Original docstring, Vietnamese: "dùng doi để download fulltext từ scihub".)
"""
from os.path import join, getsize
from os import listdir, walk, remove
from colorama import Fore
from lib.scihub import SciHub
from lib.config import PMID2DOI_FILE_PATH, PMID_HGMD, PMID_SIMI, PMID_NEGA
# Destination folder for the downloaded <pmid>.pdf files.
folder_save_pdf = "data/fulltext/similar_pdfs"
def get_list_pmid(path: str) -> list:
    """Load a crawled title/abstract file and return its PMIDs.

    The file is expected to hold 4-line records whose first line is a
    numeric PMID. On a malformed record the offending lines are printed
    (colored) and the process exits.

    Args:
        path: path to the crawled title/abstract text file.

    Returns:
        List of PMID strings, one per 4-line record.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pmids = []
    for i, line in enumerate(lines):
        if i % 4 == 0:
            # Strip before validating: a trailing '\r' or space previously
            # made isdecimal() fail even though the value itself was numeric.
            pmid = line.strip()
            if not pmid.isdecimal():
                # Malformed record: show context, then stop the script.
                print(Fore.CYAN + str(i), Fore.BLUE + line)  # PMID line
                # Guard the context prints: the bad record may sit at EOF,
                # where lines[i+1] / lines[i+2] would raise IndexError.
                if i + 1 < len(lines):
                    print(Fore.RED + lines[i + 1])
                if i + 2 < len(lines):
                    print(Fore.LIGHTGREEN_EX + lines[i + 2])
                exit()
            pmids.append(pmid)
    return pmids
def get_crawled_fulltext_pmid(folder) -> tuple:
    """Scan *folder* recursively for downloaded ``<pmid>.pdf`` files.

    A PDF seen again under another sub-folder is treated as a duplicate
    download and deleted from disk (first occurrence wins).

    Args:
        folder: root folder holding the downloaded PDFs.

    Returns:
        Tuple ``(pmids, total_size, total_pdf)``: the unique PMIDs found,
        their cumulative size in bytes, and the number of unique PDFs.
    """
    total_size = 0  # cumulative folder size in bytes
    total_pdf = 0   # number of unique PDF files
    pmids = []
    for root, _, files in walk(folder):
        for file in files:
            if file.endswith('.pdf'):
                # file[:-4] drops exactly the '.pdf' suffix. The original
                # file.strip('.pdf') strips a *character set* and could
                # mangle names starting/ending with 'p', 'd', 'f' or '.'.
                pmid = file[:-4]
                path_pdf = join(root, file)
                if pmid in pmids:
                    remove(path_pdf)  # delete the duplicate download
                    print(Fore.RED, 'remove ', pmid)
                else:
                    # BUG FIX: the original never appended, so it always
                    # returned [] and never detected duplicates.
                    pmids.append(pmid)
                    total_size += getsize(path_pdf)
                    total_pdf += 1
    return pmids, total_size, total_pdf
def get_pmid_sent_request(path: str = None) -> list:
    """De-duplicate the pmid→doi CSV by PMID and rewrite it in place.

    Reads the CSV (one ``pmid,doi`` record per line), keeps only the first
    line seen for each PMID, writes the de-duplicated lines back, and
    returns the PMIDs for which a request was already sent.

    Args:
        path: CSV file to process; defaults to ``PMID2DOI_FILE_PATH`` from
            the project config (backward-compatible generalization).

    Returns:
        List of unique PMID strings, in first-seen order.
    """
    # None-sentinel so the config constant is resolved at call time,
    # not at function-definition time.
    if path is None:
        path = PMID2DOI_FILE_PATH
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    newlines = []  # de-duplicated CSV records to write back
    pmids = []     # unique PMIDs, returned to the caller
    for line in lines:
        pmid = line.split(',')[0].strip()
        if pmid not in pmids:
            pmids.append(pmid)
            newlines.append(line)  # keep the first record for this pmid
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(newlines) + '\n')
    return pmids
if __name__ == "__main__":
    # PMIDs to fetch (the similar / negative sets are disabled for now).
    pmid_hgmd = get_list_pmid(PMID_HGMD)
    # pmid_similar = get_list_pmid(PMID_SIMI)
    # pmid_negative = get_list_pmid(PMID_NEGA)
    max_size = round(40 * 2**30)  # disk budget: stop after ~40 GiB of PDFs
    sh = SciHub()
    # PMIDs already downloaded, plus current disk usage of the PDF folder.
    pmid_crawls, size, total_pdf = get_crawled_fulltext_pmid(folder_save_pdf)
    # Also skip PMIDs for which a Sci-Hub request was already sent.
    pmid_crawls = set(pmid_crawls + get_pmid_sent_request())
    print(Fore.RED, "{:.2f} Gigabytes / {} pdfs".format((size/2**30), total_pdf), Fore.RESET)
    # Walk every pmid and try to download its full text.
    for pmid in pmid_hgmd:
        if pmid in pmid_crawls:
            # Skip pmids already requested or already crawled to PDF.
            continue
        # TODO: implement download from PMC
        # elif download from PubMed
        # else download from Sci-Hub (using doi)
        try:
            doi = SciHub.pmid2doi(pmid)  # e.g. 10.1200/JCO.2005.02.093
            if doi != "":
                pdf_path_save = join(folder_save_pdf, pmid + '.pdf')
                size_pdf = sh.download(doi, folder_save_pdf, pdf_path_save)
                if size_pdf != 0:
                    print("download success ", pdf_path_save)
                    # Check whether the disk budget is exhausted.
                    size += size_pdf
                    if size >= max_size:
                        # BUG FIX: original printed size/2*30 (== size*15);
                        # dividing by 2**30 reports the size in GiB, matching
                        # the summary print above.
                        print("stop crawling because of SSD is full", size/2**30)
                        break
        except Exception as e:
            # Best-effort crawl: log the failure and continue with the
            # next pmid rather than aborting the whole run.
            print(e)