forked from xchewtoyx/calibre-comicvine
-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
362 lines (321 loc) · 12.4 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
"""
calibre_plugins.comicvine - A calibre metadata source for comicvine
"""
import functools
import logging
import random
import re
import threading
import time

from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils import logging as calibre_logging # pylint: disable=W0404
from calibre.utils.config import JSONConfig
from calibre_plugins.comicvine import pycomicvine
from calibre_plugins.comicvine.config import PREFS

from .pycomicvine.error import RateLimitExceededError

# Optional Import for fuzzy title matching
try:
    import Levenshtein
except ImportError:
    pass
class CalibreHandler(logging.Handler):
    """
    Python logging handler that directs messages to the calibre logging
    interface.
    """

    # Standard-library level names mapped to the calibre level used when
    # calibre has no constant of the same name.
    # NOTE(review): assumes calibre.utils.logging exposes DEBUG, INFO, WARN
    # and ERROR -- confirm against the calibre version in use.
    _LEVEL_ALIASES = {"WARNING": "WARN", "CRITICAL": "ERROR", "FATAL": "ERROR"}

    def emit(self, record):
        """Print ``record`` through calibre's default log.

        The calibre level is looked up by the record's level name.  When
        calibre has no attribute of that name the lookup falls back to
        ``_LEVEL_ALIASES`` and finally to ``INFO`` (e.g. for custom levels),
        rather than raising ``AttributeError`` from inside the handler.
        """
        level = getattr(calibre_logging, record.levelname, None)
        # Compare against None explicitly: a valid level may be 0.
        if level is None:
            fallback = self._LEVEL_ALIASES.get(record.levelname, "INFO")
            level = getattr(calibre_logging, fallback)
        calibre_logging.default_log.prints(level, record.getMessage())
class TokenBucket:
    """Token-bucket rate limiter backed by a calibre ``JSONConfig``.

    The token count and the time of the last refill are kept in the
    ``plugins/comicvine_tokens`` config under the keys ``"tokens"`` and
    ``"update"``.  The refill rate and bucket size are read from
    ``PREFS["requests_rate"]`` and ``PREFS["requests_burst"]``.
    """

    def __init__(self):
        # Re-entrant lock: ``consume`` reads the ``tokens`` property, which
        # takes the same lock again.
        self.lock = threading.RLock()
        params = JSONConfig("plugins/comicvine_tokens")
        params.defaults["tokens"] = 0
        params.defaults["update"] = time.time()
        self.params = params

    def consume(self):
        """Take one token, sleeping until one becomes available."""
        with self.lock:
            self.params.refresh()
            rate = PREFS["requests_rate"]
            while self.tokens < 1:
                # Read the clock once so the comparison and the subtraction
                # agree; two separate reads could produce a non-positive
                # delay, and time.sleep() rejects negative values.
                now = time.time()
                next_due = self.params["update"] + 1 / rate
                if next_due > now:
                    next_token = next_due - now
                else:
                    next_token = 1 / rate
                if rate != 1:
                    logging.warning(
                        "Slow down cowboy: %0.2f seconds to next token", next_token
                    )
                time.sleep(next_token)
            self.params["tokens"] -= 1

    @property
    def tokens(self):
        """Return the current token count, refilling the bucket first.

        While the bucket is below ``PREFS["requests_burst"]``, whole tokens
        are added for the seconds elapsed since ``params["update"]`` at
        ``PREFS["requests_rate"]`` tokens per second, capped at the burst
        size.
        """
        with self.lock:
            self.params.refresh()
            burst = PREFS["requests_burst"]
            if self.params["tokens"] < burst:
                now = time.time()
                elapsed = now - self.params["update"]
                if elapsed > 0:
                    new_tokens = int(elapsed * PREFS["requests_rate"])
                    if new_tokens:
                        if new_tokens + self.params["tokens"] < burst:
                            self.params["tokens"] += new_tokens
                        else:
                            self.params["tokens"] = burst
                        # NOTE(review): the source's indentation was lost;
                        # the timestamp is assumed to advance only when at
                        # least one whole token was added -- confirm.
                        self.params["update"] = now
        return self.params["tokens"]
def retry_on_cv_error(retries=2):
    """Decorator for functions that access the comicvine api.

    Retries the decorated function on error.

    ``retries`` is the total number of attempts.  Values below 1 are treated
    as 1 so the decorated function is always called at least once.
    """
    attempts = max(1, retries)

    def wrap_function(target_function):
        "Closure for the retry function giving access to decorator arguments."

        @functools.wraps(target_function)
        def retry_function(*args, **kwargs):
            """Call the target, retrying on error.

            The comicvine API can be a little flaky, so retry on error to
            make sure the error is real.

            A ``RateLimitExceededError`` is re-raised immediately without
            retrying.  If the attempts are exhausted the exception from the
            last attempt is re-raised.
            """
            for attempt in range(1, attempts + 1):
                try:
                    return target_function(*args, **kwargs)
                except RateLimitExceededError:
                    logging.warning("API Rate limited exceeded.")
                    raise
                except Exception:
                    logging.warning(
                        "Calling %r failed on attempt %d/%d with args: %r %r",
                        target_function,
                        attempt,
                        attempts,
                        args,
                        kwargs,
                    )
                    if attempt == attempts:
                        raise
                    # Failures may be due to busy servers. Be a good citizen
                    # and back off for 100-600 ms before retrying.
                    time.sleep(random.random() / 2 + 0.1)

        return retry_function

    return wrap_function
# @retry_on_cv_error()
def build_meta(log, issue):
    """Assemble a calibre ``Metadata`` record from a comicvine issue.

    Returns ``None`` (after logging a warning on ``log``) when ``issue`` or
    its volume is missing.
    """
    # The issue used to be fetched here by id with the fields id, name,
    # volume, issue_number, person_credits, description, store_date and
    # cover_date; it is now handed in by the caller.
    if not (issue and issue.volume):
        log.warn(f"Unable to load Issue({issue})")
        return None

    volume = issue.volume

    # Title reads "<volume name> #<number>", plus ": <issue name>" if named.
    heading = f"{volume.name} #{issue.issue_number}"
    if issue.name:
        heading += f": {issue.name}"
    creators = [person.name for person in issue.person_credits]

    record = Metadata(heading, creators)
    record.series = volume.name
    record.series_index = str(issue.issue_number)
    record.set_identifier("comicvine", str(issue.id))
    record.set_identifier("comicvine-volume", str(volume.id))
    # Fall back to the volume description when the issue has none.
    record.comments = issue.description if issue.description else volume.description
    record.has_cover = bool(issue.image)
    if volume.publisher:
        record.publisher = volume.publisher.name
    record.pubdate = issue.store_date or issue.cover_date
    return record
@retry_on_cv_error()
def find_volumes(volume_title, log, volumeid=None):
    """Return candidate volumes for a title string or an explicit volume id.

    With ``volumeid`` the result is a one-element list holding that volume.
    Otherwise the title is searched and the loop stops once the result at
    index ``PREFS["max_volumes"] - 1`` has been handled.
    """
    if volumeid:
        log.debug(f"Looking up volume: {volumeid}")
        found = [pycomicvine.Volume(volumeid, all=True)]
    else:
        found = []
        log.debug(f"Looking up volume: {volume_title}")
        results = pycomicvine.Volumes.search(
            query=volume_title,
            field_list=["id", "name", "count_of_issues", "publisher"],
        )
        last_index = PREFS["max_volumes"] - 1
        for index, volume in enumerate(results):
            try:
                # Falsy results are skipped but still count towards the cap.
                if volume:
                    found.append(volume)
                if index >= last_index:
                    break
            except IndexError:
                continue
    log.debug(f"found {len(found)} volume matches")
    return found
@retry_on_cv_error()
def find_issues(candidate_volumes, issue_number, log):
    """Find issues in candidate volumes matching issue_number.

    ``candidate_volumes`` is a collection of objects with an ``id``
    attribute.  When ``issue_number`` is ``None`` the search is restricted
    by volume only.  Returns a list of the matching issues; an empty
    ``candidate_volumes`` yields an empty list without querying the API.
    """
    if not candidate_volumes:
        # Without any volume the filter would read "volume:" and no longer
        # restrict the search, so there is nothing meaningful to look up.
        log.debug("No candidate volumes; skipping issue search")
        return []

    volume_ids = "|".join(str(volume.id) for volume in candidate_volumes)
    issue_filter = [f"volume:{volume_ids}"]
    if issue_number is not None:
        issue_filter.append(f"issue_number:{issue_number}")
    filter_string = ",".join(issue_filter)
    log.debug(f"Searching for Issues({filter_string})")
    candidate_issues = list(
        pycomicvine.Issues(
            filter=filter_string,
            field_list=[
                "id",
                "name",
                "volume",
                "issue_number",
                "person_credits",
                "description",
                "store_date",
                "cover_date",
                "image",
            ],
            all=True,
        )
    )
    log.debug(f"{len(candidate_issues)} matches found")
    return candidate_issues
def normalised_title(query, title):
    """
    returns (issue_number, title_tokens)

    This method takes the provided title and breaks it down into
    searchable components.  The issue number should be preceded by a
    '#' mark or it will be treated as a word in the title.  Anything
    provided after the issue number (e.g. a sub-title) will be
    ignored.

    ``query`` must provide ``get_title_tokens(title)``; the tokens it
    yields are lower-cased.  ``issue_number`` is a string, or ``None`` when
    the title contains no number.  If ``title`` is not a ``str`` the pair
    ``(0, "")`` is returned.
    """
    title_tokens = []
    issue_number = None
    # Applied in order.  All patterns are raw strings so that regex escapes
    # such as \d and \s are not treated as (invalid) string escapes.
    replacements = (
        # Collapse dotted initialisms: "S.H.I.E.L.D." -> "SHIELD".
        (r"((?:^|\s)(?:\w\.){2,})", lambda match: match.group(0).replace(".", "")),
        # Drop "(of 6)" style series lengths.
        (r"\s\(?of \d+\)?", ""),
        # Drop volume markers such as "v2" / "vol 2".
        (r"(?:v|vol)\s?\d+", ""),
        # Drop any parenthesised text.
        (r"\([^)]+\)", ""),
        # Normalise the trailing number to "#<n>" (leading zeros stripped,
        # \xbd is the one-half sign) and discard whatever follows it.
        (r"(?:# ?)?0*([\d\xbd]+[^:\s]*):?[^\d]*$", r"#\g<1>"),
        # Squeeze runs of whitespace left behind by the steps above.
        (r"\s{2,}", " "),
    )
    if isinstance(title, str):
        for pattern, replacement in replacements:
            title = re.sub(pattern, replacement, title)
        issue_pattern = re.compile(r"#([^:\s]+)")
        issue_match = issue_pattern.search(title)
        if issue_match:
            issue_number = issue_match.group(1)
            title = issue_pattern.sub("", title)
        for token in query.get_title_tokens(title):
            title_tokens.append(token.lower())
        return issue_number, title_tokens
    return 0, ""
def find_title(query, title, log, volumeid=None):
    """Extract volume name and issue number from issue title.

    Returns ``(issue_number, candidate_volumes)``, where the candidates come
    from ``find_volumes`` using a query of the title tokens joined with
    " AND ".  ``volumeid``, when given, is converted to ``int`` and passed
    through to ``find_volumes``.
    """
    (issue_number, title_tokens) = normalised_title(query, title)
    log.debug(f"Searching for {title_tokens} #{issue_number}")
    if volumeid:
        volumeid = int(volumeid)
    # The bare issue number (no leading '#') is added as a search term
    # because a '#'-prefixed number returned empty searches too often.
    search_terms = list(title_tokens)
    # Only add the number when one was found: this skips None and the
    # integer 0 returned for non-string titles, while an issue "0" (a
    # string) is kept.  Otherwise the query would end in "AND None".
    if issue_number:
        search_terms.append(str(issue_number))
    candidate_volumes = find_volumes(" AND ".join(search_terms), log, volumeid)
    return (issue_number, candidate_volumes)
def build_term(build_type, parts):
    """Join search-term parts into one space-separated string.

    ``build_type`` (e.g. "author") is accepted for interface compatibility;
    every type is joined the same way.  ``parts`` is an iterable of strings.
    """
    return " ".join(parts)
@retry_on_cv_error()
def find_authors(query, authors, log):
    """Look up a comicvine person for each author name.

    Each name is tokenised with ``query.get_author_tokens``; names that
    produce no tokens, or whose joined tokens equal "Unknown", are skipped.
    For the rest, the first entry of the people search (if any) is loaded
    as a ``pycomicvine.Person`` and collected into the returned list.
    """
    matched_people = []
    log.debug(f"Authors {authors}")
    for author_name in authors:
        log.debug(f"Author {author_name}")
        tokens = query.get_author_tokens([author_name], only_first_author=False)
        search_name = build_term("author", tokens) if tokens else None
        if not search_name or search_name == "Unknown":
            continue
        log.debug(f"Searching for author: {search_name}")
        people = pycomicvine.People(
            filter="name:%s" % (search_name), field_list=["id"]
        )
        if people:
            matched_people.append(pycomicvine.Person(people[0].id))
    # NOTE(review): the source's indentation was lost; this summary is
    # assumed to be logged once after the loop -- confirm.
    log.debug(f"{len(matched_people)} matches found")
    return matched_people
def score_title(metadata, title=None, issue_number=None, title_tokens=None):
    """
    Calculate title matching ranking.

    Returns an integer penalty: 0 is the best match, larger is worse.
    ``metadata`` must provide ``series``, ``series_index`` (a string),
    ``pubdate`` and ``comments``.  ``title`` and ``title_tokens`` may be
    omitted or ``None``, in which case they are treated as empty.
    """
    # Honour the declared None defaults instead of raising TypeError.
    if title is None:
        title = ""
    if title_tokens is None:
        title_tokens = ()

    score = 0
    volume = f"{metadata.series.lower()} #{metadata.series_index}"
    match_year = re.compile(r"\((\d{4})\)")
    year = match_year.search(title)
    if year:
        title = match_year.sub("", title)
        if metadata.pubdate:
            score += abs(metadata.pubdate.year - int(year.group(1)))
        else:
            score += 10  # penalise entries with no publication date
    score += abs(len(volume) - len(title))
    for token in title_tokens:
        if token not in volume:
            score += 10
    try:
        similarity = Levenshtein.ratio(str(volume), str(title))
        score += 100 - int(100 * similarity)
    except NameError:
        # Levenshtein is an optional import; skip this term when the name
        # is not defined.
        pass
    if issue_number is not None and metadata.series_index != issue_number:
        score += 50
    if metadata.series_index not in title:
        score += 10
    # De-preference TPBs by looking for the phrases "collecting issues",
    # "containing issues", etc. in the comments
    # TODO(rgh): This should really be controlled by config
    collection = re.compile(r"(?:collect|contain)(?:s|ing) issues")
    if metadata.comments and collection.search(metadata.comments.lower()):
        score += 50
    return score
def keygen(metadata, title=None, authors=None, identifiers=None, **kwargs):
    """
    Sort key for comparing several results; lower is better.

    1. An entry whose comicvine id matches ``identifiers`` scores 0 outright
    2. Otherwise a given title adds the ``score_title`` penalty (extra
       keyword arguments are forwarded to it)
    3. Each requested author missing from ``metadata.authors`` adds 10
    """
    if identifiers:
        try:
            same_id = metadata.get_identifier("comicvine") == identifiers["comicvine"]
        except (KeyError, AttributeError):
            # No comicvine id supplied, or metadata cannot report one.
            same_id = False
        if same_id:
            return 0

    title_penalty = score_title(metadata, title=title, **kwargs) if title else 0

    author_penalty = 0
    if authors:
        missing = [name for name in authors if name not in metadata.authors]
        author_penalty = 10 * len(missing)

    return title_penalty + author_penalty
# Do not apply the retry decorator to this generator: an exception raised
# inside a generator ends it, so it cannot be retried. Decorate the
# functions that use this generator instead.
def cover_urls(comicvine_id, get_best_cover=False):
    """Retrieve cover urls for comic in quality order.

    Yields the issue's ``super_url``, ``medium_url`` and ``small_url`` image
    entries, in that order, for whichever keys are present.  With
    ``get_best_cover`` only the first available url is yielded.  Nothing is
    yielded when the issue has no image data.
    """
    issue = pycomicvine.Issue(int(comicvine_id), field_list=["image"])
    image = issue.image
    if not image:
        # A missing image would make the membership test below raise
        # TypeError; treat it as "no covers".
        return
    for key in ("super_url", "medium_url", "small_url"):
        if key in image:
            yield image[key]
            if get_best_cover:
                return