# scholar_watcher_api.py
import json
import os
import threading
import time
from configparser import ConfigParser
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
from scholarly import scholarly


class SearchEngine:
    def __init__(self):
        pass

    def search(self, val, method):
        if method == "id":
            return self.searchByID(val)

    def searchByID(self, user_id):
        result = scholarly.search_author_id(user_id)
        return result

    def searchByPub(self, pub):
        pass

    def fetchRecentTopKPub(self, user_id, top_k):
        # Note: this can only fetch the top k publications ranked by
        # citation count, not by publication time. It is deprecated;
        # use fetchLatestKPub() instead.
        this_year_str = str(datetime.now().year)
        result = self.searchByID(user_id)
        author = scholarly.fill(result)
        ret_pubs = {}
        cnt = 0
        for pub in author["publications"]:
            if cnt >= top_k:
                break
            if "pub_year" in pub["bib"] and str(pub["bib"]["pub_year"]) == this_year_str:
                ret_pubs[pub["bib"]["title"]] = {
                    "pub_year": str(pub["bib"]["pub_year"]),
                    "num_citations": str(pub["num_citations"]),
                }
                cnt += 1
        return ret_pubs
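
# Example usage (sketch; "<author_id>" stands in for a real Google Scholar
# author id):
#   searcher = SearchEngine()
#   author = searcher.search("<author_id>", method="id")
#   print(author["name"], author["citedby"])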

# To be upgraded to a database backend.
class Citation:
    def __init__(self, path):
        self.path = path
        self.today = datetime.now().strftime("%Y-%m-%d")
        # self.yesterday = (datetime.now() + timedelta(days=-1)).strftime("%Y-%m-%d")
        if not os.path.isfile(self.path):
            self.write({})

    def read(self):
        with open(self.path, "r", encoding="utf-8") as f:
            d = json.load(f)
        return d

    def write(self, d):
        with open(self.path, "w", encoding="utf-8") as f:
            json.dump(d, f)

    def update(self, author_id, author_name, citation):
        d = self.read()
        if author_id not in d:
            d[author_id] = {"name": author_name}
        d[author_id][self.today] = citation
        self.write(d)
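
    # The backing JSON maps each author id to the author's name plus one
    # citation count per date; compare() later adds an "increase" field.
    # Illustrative layout (placeholder values, not real data):
    #   {"<author_id>": {"name": "A. Author",
    #                    "2024-01-01": 100, "2024-01-02": 103,
    #                    "increase": 3}}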

    # Compare with the last record (instead of strictly yesterday).
    def compare(self, author_id):
        d = self.read()
        today_citation = d[author_id][self.today]
        last_citation = 0
        try:
            for last_record in sorted(d[author_id])[::-1]:
                if "-" in last_record and last_record != self.today:
                    last_citation = d[author_id][last_record]
                    break
            # yesterday_citation = d[author_id][self.yesterday]
        except Exception as e:
            print("Exception: ", e)
            # yesterday_citation = 0
            last_citation = 0
        d[author_id]["increase"] = today_citation - last_citation
        self.write(d)
        return (today_citation, last_citation)

    # Return today's citation count and the increase, for presentation.
    def present(self):
        d_present = {}
        d_all = self.read()
        for author_id in d_all:
            author_name = d_all[author_id]["name"]
            if author_name not in d_present:
                d_present[author_name] = {
                    "citation": d_all[author_id][self.today],
                    "increase": d_all[author_id]["increase"],
                }
        return d_present
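
    # Illustrative present() output (placeholder values, keyed by author name):
    #   {"A. Author": {"citation": 103, "increase": 3}}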

    def getSequence(self, author_ids):
        d_all = self.read()
        seq = []
        for author_id in author_ids:
            citation_lines = []
            for key in sorted(d_all[author_id]):
                if "-" in key and key != "increase" and key != "name":
                    # print(d_all[author_id][key])
                    citation_lines.append(int(d_all[author_id][key]))
            seq.append(citation_lines)
        # print(seq)
        return seq
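
    # getSequence() yields one date-ordered list of citation counts per
    # requested author id, e.g. (placeholder values): [[100, 103, 107], [50, 52]]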

    def copyOneDay(self, date):
        # Intended behavior: if yesterday's record is blank, recursively
        # copy from the day before yesterday. Not implemented yet.
        pass


def getPlotData(d_all, author_ids):
    data = []
    earliest_date = datetime(2200, 1, 1)  # sentinel far in the future
    for author_id in author_ids:
        plot_data = {"X_DATA": [], "Y_DATA": []}
        for key in sorted(d_all[author_id]):
            if "-" in key and key != "increase" and key != "name":
                plot_data["Y_DATA"].append(int(d_all[author_id][key]))
                # Parse the date key straight to datetime so it can be
                # compared with, and later subtracted from, earliest_date
                # (mixing time.struct_time and datetime here breaks the
                # subtraction below).
                ptime = datetime.strptime(key, "%Y-%m-%d")
                plot_data["X_DATA"].append(ptime)
                if ptime < earliest_date:
                    earliest_date = ptime
        data.append(plot_data)
    # Convert absolute dates to day offsets from the earliest record.
    for i in range(len(data)):
        for j in range(len(data[i]["X_DATA"])):
            data[i]["X_DATA"][j] = (data[i]["X_DATA"][j] - earliest_date).days
    return (data, earliest_date)
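
# Plotting sketch (assumes matplotlib, which this module does not import):
#   import matplotlib.pyplot as plt
#   data, earliest = getPlotData(citation.read(), ["<author_id>"])
#   plt.plot(data[0]["X_DATA"], data[0]["Y_DATA"])
#   plt.show()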


def checkUpdate(searcher, citation, conf, single_author=None, force=False):
    # today = time.localtime()[0:3]
    today_str = datetime.now().strftime("%Y-%m-%d")
    # last_modified = time.localtime(os.stat(citation.path).st_mtime)[0:3]
    # (bug: shouldn't rely on the file's modify time; fetch the date from the JSON instead)
    d_all = citation.read()
    first_author = conf.options("Authors")[0]
    # .get() guards against a freshly created, still-empty JSON store.
    if force or today_str not in d_all.get(conf["Authors"][first_author], {}):
        # if today != last_modified or force == True:
        # Update the single given author, or every configured author.
        if single_author is not None:
            author_labels = [single_author]
        else:
            author_labels = conf["Authors"]
        for author_label in author_labels:
            author_id = conf["Authors"][author_label]
            result = searcher.search(author_id, method="id")
            citation.update(author_id, result["name"], result["citedby"])
            today_citation, last_citation = citation.compare(author_id)
            print("%s today citation: %d, last citation: %d, %d ⬆" % (
                result["name"], today_citation, last_citation,
                today_citation - last_citation))


def fetchLatestKPub(user_id, latest_k):
    head = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
    url = "https://scholar.google.com.hk/citations?user=" + user_id + "&view_op=list_works&sortby=pubdate"
    r = requests.get(url, headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    recent_pubs = soup.find_all(class_="gsc_a_tr")
    # Google Scholar lists at most 20 entries per page; also guard against
    # the page returning fewer rows than requested.
    k = min(20, latest_k, len(recent_pubs))
    latest_pubs = {}
    for i in range(k):
        title = recent_pubs[i].contents[0].contents[0].string
        pub_year = recent_pubs[i].contents[0].contents[2].contents[1].string.split(",")[-1]
        num_citations = recent_pubs[i].contents[1].string
        if num_citations is None:
            num_citations = 0
        latest_pubs[title] = {"pub_year": pub_year, "num_citations": num_citations}
    return latest_pubs
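
# Example (sketch; "<author_id>" is a placeholder):
#   pubs = fetchLatestKPub("<author_id>", 5)
#   for title, info in pubs.items():
#       print(title, info["pub_year"], info["num_citations"])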


def autoUpdateEveryDay(searcher, citation, conf):
    print("\nAuto checkUpdate() once a day")
    checkUpdate(searcher, citation, conf, single_author=None, force=True)
    timer = threading.Timer(86400, autoUpdateEveryDay, (searcher, citation, conf))
    # Daemon timer so the process can exit without waiting up to a whole
    # day for the next tick (addresses the child-thread exit problem).
    timer.daemon = True
    timer.start()
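
# Scheduling sketch: to fire the first refresh at a fixed time of day rather
# than immediately, the delay could come from getSecondsToTime() below
# (08:00 is just an example):
#   delay = getSecondsToTime(8, 0, 0)
#   threading.Timer(delay, autoUpdateEveryDay, (searcher, citation, conf)).start()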


def getSecondsToTime(hour, minute, second):
    now = datetime.now()
    target_time = datetime(now.year, now.month, now.day, hour, minute, second)
    tomorrow_target_time = target_time + timedelta(days=1)
    # Compare full datetimes: a field-by-field comparison (hour, then
    # minute, then second) misfires for times like 10:30 against an
    # 11:00 target.
    if now <= target_time:
        rest_seconds = (target_time - now).seconds
    else:
        rest_seconds = (tomorrow_target_time - now).seconds
    return rest_seconds
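
# Expected config.ini layout (section and key names are taken from the reads
# below; the values are placeholders):
#   [Proxy]
#   http_proxy = http://127.0.0.1:7890
#   https_proxy = http://127.0.0.1:7890
#
#   [Authors]
#   some_label = <google_scholar_author_id>
#
#   [Settings]
#   db_path = citations.json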


if __name__ == "__main__":
    conf = ConfigParser()
    conf.read("config.ini", encoding="utf-8")
    Proxy = conf["Proxy"]
    Authors = conf["Authors"]
    Settings = conf["Settings"]
    os.environ["http_proxy"] = Proxy["http_proxy"]
    os.environ["https_proxy"] = Proxy["https_proxy"]
    searcher = SearchEngine()
    citation = Citation(Settings["db_path"])
    checkUpdate(searcher, citation, conf, force=True)
    print(json.dumps(citation.read(), indent=4))