Skip to content

Commit

Permalink
Add support for ODL feeds serialized using OPDS 2.x (#10)
Browse files Browse the repository at this point in the history
- new ODL2API collection designed for ODL 2.x feeds
- new ODL2Importer class responsible for importing ODL 2.x feeds
- new odl2_import_monitor script designed for automating the process of importing ODL 2.x feeds.
  • Loading branch information
vbessonov authored Jul 30, 2021
1 parent 586e2ae commit 15043e3
Show file tree
Hide file tree
Showing 14 changed files with 916 additions and 100 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,8 @@ docs/**/
docs/source/*
!docs/source/conf.py
!docs/source/index.rst


# Mac OS
.DS_Store

2 changes: 2 additions & 0 deletions api/admin/controller/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
ODLAPI,
SharedODLAPI,
)
from api.odl2 import ODL2API
from api.opds_for_distributors import OPDSForDistributorsAPI
from api.overdrive import OverdriveAPI
from api.proquest.importer import ProQuestOPDS2Importer
Expand Down Expand Up @@ -1317,6 +1318,7 @@ class SettingsController(AdminCirculationManagerController):
RBDigitalAPI,
EnkiAPI,
ODLAPI,
ODL2API,
SharedODLAPI,
FeedbooksOPDSImporter,
LCPAPI
Expand Down
2 changes: 2 additions & 0 deletions api/circulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ def default_api_map(self):
from .enki import EnkiAPI
from .opds_for_distributors import OPDSForDistributorsAPI
from .odl import ODLAPI, SharedODLAPI
from .odl2 import ODL2API
from api.lcp.collection import LCPAPI
from api.proquest.importer import ProQuestOPDS2Importer

Expand All @@ -449,6 +450,7 @@ def default_api_map(self):
EnkiAPI.ENKI_EXTERNAL : EnkiAPI,
OPDSForDistributorsAPI.NAME: OPDSForDistributorsAPI,
ODLAPI.NAME: ODLAPI,
ODL2API.NAME: ODL2API,
SharedODLAPI.NAME: SharedODLAPI,
LCPAPI.NAME: LCPAPI,
ProQuestOPDS2Importer.NAME: ProQuestOPDS2Importer
Expand Down
272 changes: 272 additions & 0 deletions api/odl2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
import json
import logging

from contextlib2 import contextmanager
from flask_babel import lazy_gettext as _
from webpub_manifest_parser.opds2.registry import OPDS2LinkRelationsRegistry

from api.odl import ODLAPI
from core.metadata_layer import FormatData, LicenseData
from core.model import DeliveryMechanism, Edition, MediaTypes, RightsStatus
from core.model.configuration import (
ConfigurationAttributeType,
ConfigurationFactory,
ConfigurationGrouping,
ConfigurationMetadata,
ConfigurationStorage,
HasExternalIntegration,
)
from core.opds2_import import OPDS2Importer, OPDS2ImportMonitor
from core.util import first_or_default


class ODL2APIConfiguration(ConfigurationGrouping):
    """Configuration settings specific to ODL 2.x collections."""

    # License formats (media types) listed here are ignored by ODL2Importer
    # when it builds the list of formats for a publication.
    # The value is declared as a LIST; the importer reads it back with json.loads.
    skipped_license_formats = ConfigurationMetadata(
        key="odl2_skipped_license_formats",
        label=_("Skipped license formats"),
        description=_(
            "List of license formats that will NOT be imported into Circulation Manager."
        ),
        type=ConfigurationAttributeType.LIST,
        required=False,
        default=["text/html"],
    )


class ODL2API(ODLAPI):
    """ODL API variant for collections whose feeds are serialized using OPDS 2.x.

    Behaves like ODLAPI; it only registers a distinct protocol name and
    extends the inherited settings with the ODL 2.x specific ones.
    """

    # Protocol name under which this API is registered.
    NAME = "ODL 2.0"

    # All ODLAPI settings followed by the settings declared in ODL2APIConfiguration.
    SETTINGS = ODLAPI.SETTINGS + ODL2APIConfiguration.to_settings()


class ODL2Importer(OPDS2Importer, HasExternalIntegration):
    """Import information and formats from an ODL feed.

    The only change from OPDS2Importer is that this importer extracts
    FormatData and LicenseData from ODL 2.x's "licenses" arrays.
    """

    NAME = ODL2API.NAME

    # Composite license format: audiobook manifest media type plus
    # the Feedbooks audiobook DRM scheme passed as a "protection" parameter.
    FEEDBOOKS_AUDIO = "{0}; protection={1}".format(
        MediaTypes.AUDIOBOOK_MANIFEST_MEDIA_TYPE,
        DeliveryMechanism.FEEDBOOKS_AUDIOBOOK_DRM,
    )

    CONTENT_TYPE = "content-type"
    DRM_SCHEME = "drm-scheme"

    # Maps a composite license format onto the plain content type and
    # the DRM scheme that should be recorded for it.
    LICENSE_FORMATS = {
        FEEDBOOKS_AUDIO: {
            CONTENT_TYPE: MediaTypes.AUDIOBOOK_MANIFEST_MEDIA_TYPE,
            DRM_SCHEME: DeliveryMechanism.FEEDBOOKS_AUDIOBOOK_DRM,
        }
    }

    def __init__(
        self,
        db,
        collection,
        parser,
        data_source_name=None,
        identifier_mapping=None,
        http_get=None,
        metadata_client=None,
        content_modifier=None,
        map_from_collection=None,
        mirrors=None,
    ):
        """Initialize a new instance of ODL2Importer class.

        :param db: Database session
        :type db: sqlalchemy.orm.session.Session

        :param collection: Circulation Manager's collection.
            LicensePools created by this OPDS2Import class will be associated with the given Collection.
            If this is None, no LicensePools will be created -- only Editions.
        :type collection: Collection

        :param parser: Feed parser
        :type parser: RWPMManifestParser

        :param data_source_name: Name of the source of this OPDS feed.
            All Editions created by this import will be associated with this DataSource.
            If there is no DataSource with this name, one will be created.
            NOTE: If `collection` is provided, its .data_source will take precedence over any value provided here.
            This is only for use when you are importing OPDS metadata without any particular Collection in mind.
        :type data_source_name: str

        :param identifier_mapping: Dictionary used for mapping external identifiers into a set of internal ones
        :type identifier_mapping: Dict

        :param http_get: Callable used for fetching remote documents; passed through to OPDS2Importer.
            This class calls it as ``http_get(url, headers={})`` and unpacks a 3-item result
            whose first item is a status code and whose last item is the response body.
        :type http_get: Callable

        :param metadata_client: A SimplifiedOPDSLookup object that is used to fill in missing metadata
        :type metadata_client: SimplifiedOPDSLookup

        :param content_modifier: A function that may modify-in-place representations (such as images and EPUB documents)
            as they come in from the network.
        :type content_modifier: Callable

        :param map_from_collection: Identifier mapping
        :type map_from_collection: Dict

        :param mirrors: A dictionary of different MirrorUploader objects for different purposes
        :type mirrors: Dict[MirrorUploader]
        """
        super(ODL2Importer, self).__init__(
            db,
            collection,
            parser,
            data_source_name,
            identifier_mapping,
            http_get,
            metadata_client,
            content_modifier,
            map_from_collection,
            mirrors,
        )

        self._logger = logging.getLogger(__name__)

        self._configuration_storage = ConfigurationStorage(self)
        self._configuration_factory = ConfigurationFactory()

    @contextmanager
    def _get_configuration(self, db):
        """Return the configuration object.

        :param db: Database session
        :type db: sqlalchemy.orm.session.Session

        :return: Configuration object
        :rtype: ODL2APIConfiguration
        """
        with self._configuration_factory.create(
            self._configuration_storage, db, ODL2APIConfiguration
        ) as configuration:
            yield configuration

    def _extract_publication_metadata(self, feed, publication, data_source_name):
        """Extract a Metadata object from webpub-manifest-parser's publication.

        On top of the metadata produced by OPDS2Importer, this walks the publication's
        "licenses" array and fills in formats, licenses and license counters.

        :param feed: Feed object
        :type feed: opds2_ast.OPDS2Feed

        :param publication: Publication object
        :type publication: opds2_ast.OPDS2Publication

        :param data_source_name: Data source's name
        :type data_source_name: str

        :return: Publication's metadata
        :rtype: Metadata
        """
        metadata = super(ODL2Importer, self)._extract_publication_metadata(
            feed, publication, data_source_name
        )
        formats = []
        licenses = []
        licenses_owned = 0
        licenses_available = 0
        medium = None

        with self._get_configuration(self._db) as configuration:
            skipped_license_formats = configuration.skipped_license_formats

            # The setting is read back as a JSON-encoded list.
            if skipped_license_formats:
                skipped_license_formats = set(json.loads(skipped_license_formats))

        if publication.licenses:
            for odl_license in publication.licenses:
                identifier = odl_license.metadata.identifier
                protection = odl_license.metadata.protection

                for license_format in odl_license.metadata.formats:
                    if (
                        skipped_license_formats
                        and license_format in skipped_license_formats
                    ):
                        continue

                    # The first non-skipped format determines the medium.
                    if not medium:
                        medium = Edition.medium_from_media_type(license_format)

                    # Work on a copy: appending to the parsed feed's own list would
                    # leak the extra DRM scheme into every following format of this
                    # license and permanently alter the parsed publication.
                    drm_schemes = (
                        list(protection.formats or []) if protection else []
                    )

                    if license_format in self.LICENSE_FORMATS:
                        known_format = self.LICENSE_FORMATS[license_format]
                        license_format = known_format[self.CONTENT_TYPE]
                        drm_schemes.append(known_format[self.DRM_SCHEME])

                    # No DRM schemes at all still yields one FormatData without DRM.
                    for drm_scheme in drm_schemes or [None]:
                        formats.append(
                            FormatData(
                                content_type=license_format,
                                drm_scheme=drm_scheme,
                                rights_uri=RightsStatus.IN_COPYRIGHT,
                            )
                        )

                expires = None
                remaining_checkouts = None
                available_checkouts = None
                concurrent_checkouts = None

                checkout_link = first_or_default(
                    odl_license.links.get_by_rel(OPDS2LinkRelationsRegistry.BORROW.key)
                )
                if checkout_link:
                    checkout_link = checkout_link.href

                odl_status_link = first_or_default(
                    odl_license.links.get_by_rel(OPDS2LinkRelationsRegistry.SELF.key)
                )
                if odl_status_link:
                    odl_status_link = odl_status_link.href

                if odl_status_link:
                    # The middle item of the result is not needed here. It is bound to
                    # a dedicated name so that `_` (lazy_gettext) is not shadowed.
                    status_code, _unused, response = self.http_get(
                        odl_status_link, headers={}
                    )

                    # Error responses are ignored; the counters then stay unset.
                    if status_code < 400:
                        status = json.loads(response)
                        checkouts = status.get("checkouts", {})
                        remaining_checkouts = checkouts.get("left")
                        available_checkouts = checkouts.get("available")

                if odl_license.metadata.terms:
                    expires = odl_license.metadata.terms.expires
                    concurrent_checkouts = odl_license.metadata.terms.concurrency

                # Missing values count as zero.
                licenses_owned += int(concurrent_checkouts or 0)
                licenses_available += int(available_checkouts or 0)

                licenses.append(
                    LicenseData(
                        identifier=identifier,
                        checkout_url=checkout_link,
                        status_url=odl_status_link,
                        expires=expires,
                        remaining_checkouts=remaining_checkouts,
                        concurrent_checkouts=concurrent_checkouts,
                    )
                )

        metadata.circulation.licenses_owned = licenses_owned
        metadata.circulation.licenses_available = licenses_available
        metadata.circulation.licenses = licenses
        metadata.circulation.formats.extend(formats)
        metadata.medium = medium

        return metadata

    def external_integration(self, db):
        """Return the external integration of the collection being imported.

        :param db: Database session (not used by this implementation)
        :type db: sqlalchemy.orm.session.Session

        :return: External integration associated with this importer's collection
        """
        return self.collection.external_integration


class ODL2ImportMonitor(OPDS2ImportMonitor):
    """Import information from an ODL feed.

    Adds no logic of its own: it only binds OPDS2ImportMonitor to
    the protocol name used by ODL2Importer and gives the service a name.
    """

    # Same protocol name as the importer (and therefore as ODL2API).
    PROTOCOL = ODL2Importer.NAME

    SERVICE_NAME = "ODL 2.x Import Monitor"
20 changes: 18 additions & 2 deletions api/proquest/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
ExternalIntegration,
HasExternalIntegration,
)
from core.opds2_import import OPDS2Importer, OPDS2ImportMonitor, parse_feed
from core.opds2_import import OPDS2Importer, OPDS2ImportMonitor, RWPMManifestParser
from core.opds_import import OPDSImporter

MISSING_AFFILIATION_ID = BaseError(
Expand Down Expand Up @@ -182,6 +182,7 @@ def __init__(
self,
db,
collection,
parser,
data_source_name=None,
identifier_mapping=None,
http_get=None,
Expand All @@ -200,6 +201,9 @@ def __init__(
If this is None, no LicensePools will be created -- only Editions.
:type collection: Collection
:param parser: Feed parser
:type parser: RWPMManifestParser
:param data_source_name: Name of the source of this OPDS feed.
All Editions created by this import will be associated with this DataSource.
If there is no DataSource with this name, one will be created.
Expand All @@ -226,6 +230,7 @@ def __init__(
super(ProQuestOPDS2Importer, self).__init__(
db,
collection,
parser,
data_source_name,
identifier_mapping,
http_get,
Expand Down Expand Up @@ -766,6 +771,7 @@ def __init__(
db,
collection,
import_class,
parser,
force_reimport=False,
process_removals=False,
**import_class_kwargs
Expand All @@ -784,6 +790,9 @@ def __init__(
:param import_class: Class containing the import logic
:type import_class: Type
:param parser: Feed parser
:type parser: RWPMManifestParser
:param force_reimport: Boolean value indicating whether the import process must be started from scratch
:type force_reimport: bool
Expand All @@ -792,6 +801,11 @@ def __init__(
that are no longer present in the ProQuest feed from the CM's catalog
:type process_removals: bool
"""
if not isinstance(parser, RWPMManifestParser):
raise ValueError("Argument 'parser' must be an instance of {0}".format(RWPMManifestParser))

import_class_kwargs["parser"] = parser

super(ProQuestOPDS2ImportMonitor, self).__init__(
db, collection, import_class, force_reimport, **import_class_kwargs
)
Expand All @@ -800,6 +814,7 @@ def __init__(
self._feeds = None
self._client = self._client_factory.create(self)
self._process_removals = process_removals
self._parser = parser

self._logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -901,7 +916,8 @@ def _parse_feed_page(self, page, feed_page_file):
) as feed_page_file_handle:
feed = feed_page_file_handle.read()

feed = parse_feed(feed, silent=False)
parser_result = self._parser.parse_manifest(feed)
feed = parser_result.root

self._logger.info("Page # {0}. Finished parsing the feed".format(page))

Expand Down
Loading

0 comments on commit 15043e3

Please sign in to comment.