Skip to content

Commit

Permalink
Do not fail while importing OPDS 2.x \ ODL 2.x publications with inco…
Browse files Browse the repository at this point in the history
…rrect identifiers (#21)
  • Loading branch information
vbessonov authored Sep 7, 2021
1 parent 28ac191 commit 44b335f
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 18 deletions.
83 changes: 71 additions & 12 deletions opds2_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from flask_babel import lazy_gettext as _
from io import StringIO, BytesIO
from urllib.parse import urljoin, urlparse

from typing import Dict, List
from webpub_manifest_parser.core import ManifestParserFactory, ManifestParserResult
from webpub_manifest_parser.core.analyzer import NodeFinder
from webpub_manifest_parser.core.ast import Manifestlike
Expand Down Expand Up @@ -809,6 +811,58 @@ def _is_open_access_link_(link_data, circulation_data):

return open_access_rights_link

def _record_coverage_failure(
    self,
    failures: Dict[str, List[CoverageFailure]],
    identifier: Identifier,
    error_message: str,
    transient: bool = True,
) -> CoverageFailure:
    """Record a new coverage failure.

    The failure is appended to the list stored in ``failures`` under the
    identifier's string value (``identifier.identifier``). Failures recorded
    earlier for the same identifier are kept.

    :param failures: Dictionary mapping publication identifiers to corresponding CoverageFailure objects
    :type failures: Dict[str, List[CoverageFailure]]

    :param identifier: Publication's identifier
    :type identifier: Identifier

    :param error_message: Message describing the failure
    :type error_message: str

    :param transient: Boolean value indicating whether the failure is final or it can go away in the future
    :type transient: bool

    :return: CoverageFailure object describing the error
    :rtype: CoverageFailure
    """
    # The dictionary is keyed by the identifier *string*, so the lookup must use
    # the same key. Testing the Identifier object itself never matches a string
    # key and would reset the list on every call, dropping earlier failures.
    identifier_failures = failures.setdefault(identifier.identifier, [])

    failure = CoverageFailure(
        identifier,
        error_message,
        data_source=self.data_source,
        transient=transient,
    )
    identifier_failures.append(failure)

    return failure

def _record_publication_unrecognizable_identifier(
    self, publication: opds2_ast.OPDS2Publication
) -> None:
    """Log a warning about a publication whose identifier cannot be used.

    Two cases are reported: the publication carries no identifier at all, or it
    carries one that could not be recognized. Nothing is returned; the only
    effect is a warning written through ``self._logger``.

    :param publication: OPDS 2.x publication object
    :type publication: opds2_ast.OPDS2Publication
    """
    raw_identifier = publication.metadata.identifier
    publication_title = publication.metadata.title

    # Guard clause: an identifier is present but was not recognized.
    if raw_identifier is not None:
        self._logger.warning(f"Publication # {raw_identifier} ('{publication_title}') has an unrecognizable identifier.")
        return

    # Otherwise the publication has no identifier at all.
    self._logger.warning(f"Publication '{publication_title}' does not have an identifier.")

def extract_next_links(self, feed):
"""Extracts "next" links from the feed.
Expand Down Expand Up @@ -867,6 +921,13 @@ def extract_feed_data(self, feed, feed_url=None):
failures = {}

for publication in self._get_publications(feed):
recognized_identifier = self._extract_identifier(publication)

# TODO: Identifier type based filtration will be implemented in the next PR.
if not recognized_identifier or recognized_identifier.type != Identifier.ISBN:
self._record_publication_unrecognizable_identifier(publication)
continue

publication_metadata = self._extract_publication_metadata(
feed, publication, self.data_source_name
)
Expand All @@ -883,19 +944,17 @@ def extract_feed_data(self, feed, feed_url=None):
)

if publication:
identifier = self._extract_identifier(publication)

if identifier:
if publication.metadata.identifier not in failures:
failures[identifier.identifier] = []

failure = CoverageFailure(
identifier,
error.error_message,
data_source=self.data_source,
transient=False,
recognized_identifier = self._extract_identifier(publication)

# TODO: Identifier type based filtration will be implemented in the next PR.
if not recognized_identifier or recognized_identifier.type != Identifier.ISBN:
self._record_publication_unrecognizable_identifier(publication)
else:
self._record_coverage_failure(
failures, recognized_identifier, error.error_message
)
failures[identifier.identifier].append(failure)
else:
self._logger.warning(f"{error.error_message}")

return publication_metadata_dictionary, failures

Expand Down
10 changes: 7 additions & 3 deletions opds_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,15 @@ def parse_identifier(db, identifier):
:type identifier: str
:return: Identifier object
:rtype: core.model.identifier.Identifier
:rtype: Optional[core.model.identifier.Identifier]
"""
identifier, _ = Identifier.parse_urn(db, identifier)
parsed_identifier = None
result = Identifier.parse_urn(db, identifier)

return identifier
if result is not None:
parsed_identifier, _ = result

return parsed_identifier


class AccessNotAuthenticated(Exception):
Expand Down
6 changes: 3 additions & 3 deletions requirements-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ py-bcrypt==0.4
pymarc==4.0.0
pyparsing==2.4.7
pyspellchecker==0.5.5
python-dateutil==2.8.1
pytz==2020.1
python-dateutil==2.8.2
pytz==2021.1
rdflib==5.0.0
requests==2.24.0
requests-futures==1.0.0
Expand All @@ -62,7 +62,7 @@ watchtower==0.8.0
Werkzeug==1.0.1

multipledispatch==0.6.0
palace-webpub-manifest-parser==2.0.1
palace-webpub-manifest-parser==2.0.3

# These dependencies are only used in our tests. Ideally they would get moved out to
# the dev requirements file, but the mock classes for our tests are included alongside
Expand Down

0 comments on commit 44b335f

Please sign in to comment.