From 85ee6a04a5234cca7ea7dd610936708c200ef530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Sun, 10 Apr 2022 09:51:22 +0200 Subject: [PATCH 1/4] Bump up dm-schemas to version 0.2.0 --- reqs/requirements-prod.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reqs/requirements-prod.txt b/reqs/requirements-prod.txt index 624459e..be55905 100644 --- a/reqs/requirements-prod.txt +++ b/reqs/requirements-prod.txt @@ -5,4 +5,4 @@ feedparser==6.0.8 # Private packages git+ssh://git@github.com/dialect-map/dialect-map-io.git@v0.3.3#egg=dialect-map-io[gcp] -git+ssh://git@github.com/dialect-map/dialect-map-schemas.git@v0.1.5#egg=dialect-map-schemas +git+ssh://git@github.com/dialect-map/dialect-map-schemas.git@v0.2.0#egg=dialect-map-schemas From dcc859054160db60e7448743aed14d4fa9b59494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Sun, 10 Apr 2022 10:14:20 +0200 Subject: [PATCH 2/4] Adapt API operator to dm-schemas 0.2.0 --- src/job/output/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/job/output/api.py b/src/job/output/api.py index 7eed17c..81bf152 100644 --- a/src/job/output/api.py +++ b/src/job/output/api.py @@ -54,7 +54,7 @@ def create_record(self, record_route: APIRoute, record_data: dict) -> None: :param record_route: data record route """ - record_schema = record_route.model_schema() + record_schema = record_route.schema() record_data = record_schema.load(record_data) self._create( @@ -69,7 +69,7 @@ def archive_record(self, record_route: APIRoute, record_data: dict) -> None: :param record_route: data record route """ - record_schema = record_route.model_schema() + record_schema = record_route.schema() schema_id_field = record_schema.schema_id self._archive( From df214b2af100eb5e718c34e26d9b14668c371070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Sun, 10 Apr 2022 10:01:33 +0200 Subject: [PATCH 3/4] Simplify metadata-routine methods --- src/job/mapping/__init__.py | 5 ----- src/job/mapping/record_routes.py | 10 ---------- src/routines.py | 28 +++++++++------------------- 3 files changed, 9 insertions(+), 34 deletions(-) delete mode 100644 src/job/mapping/__init__.py delete mode 100644 src/job/mapping/record_routes.py diff --git a/src/job/mapping/__init__.py b/src/job/mapping/__init__.py deleted file mode 100644 index 4b41cbc..0000000 --- a/src/job/mapping/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -*- coding: utf-8 -*- - -from .record_routes import CATEGORY_MEMBER_ROUTE -from .record_routes import PAPER_AUTHOR_ROUTE -from .record_routes import PAPER_ROUTE diff --git a/src/job/mapping/record_routes.py b/src/job/mapping/record_routes.py deleted file mode 100644 index 437b875..0000000 --- a/src/job/mapping/record_routes.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- - -from dialect_map_schemas.routes import DM_CATEGORY_MEMBER_ROUTE -from dialect_map_schemas.routes import DM_PAPER_AUTHOR_ROUTE -from dialect_map_schemas.routes import DM_PAPER_ROUTE - - -CATEGORY_MEMBER_ROUTE = DM_CATEGORY_MEMBER_ROUTE -PAPER_AUTHOR_ROUTE = DM_PAPER_AUTHOR_ROUTE -PAPER_ROUTE = DM_PAPER_ROUTE diff --git a/src/routines.py b/src/routines.py index 8c06271..97d14f8 100644 --- a/src/routines.py +++ b/src/routines.py @@ -7,13 +7,11 @@ from typing import List from dialect_map_io.data_output import TextFileWriter +from dialect_map_schemas.routes import DM_PAPER_METADATA_ROUTE from job.files import FileSystemIterator from job.input import BaseMetadataSource from job.input import PDFCorpusSource -from job.mapping import CATEGORY_MEMBER_ROUTE -from job.mapping import PAPER_AUTHOR_ROUTE -from job.mapping import PAPER_ROUTE from job.models import ArxivMetadata from job.output import DialectMapOperator from job.output import LocalFileOperator @@ -81,21 +79,6 @@ def __init__(self, file_iter: FileSystemIterator, api_ctl: DialectMapOperator): self.api_controller = api_ctl self.sources = [] # type: ignore - def _dispatch_record(self, record: ArxivMetadata) -> None: - """ - Dispatch metadata record to the destination API - :param record: metadata record to dispatch - """ - - # The paper record must be inserted first - self.api_controller.create_record(PAPER_ROUTE, record.paper_record) - - for membership in record.memberships_records: - self.api_controller.create_record(CATEGORY_MEMBER_ROUTE, membership) - - for author in record.author_records: - self.api_controller.create_record(PAPER_AUTHOR_ROUTE, author) - def _get_metadata_records(self, paper_id: str) -> List[ArxivMetadata]: """ Gets the metadata records from the sources given an ArXiv paper ID @@ -135,4 +118,11 @@ def run(self, *args) -> None: continue for record in records: - self._dispatch_record(record) + self.api_controller.create_record( + DM_PAPER_METADATA_ROUTE, + { + "paper": record.paper_record, + "authors": record.author_records, + "memberships": record.memberships_records, + }, + ) From 4984a8240db465e0d8227e930a51b38df9ac5e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Sun, 10 Apr 2022 11:45:07 +0200 Subject: [PATCH 4/4] Adapt ArxivMetadata model class --- src/job/models/arxiv.py | 26 ++++++++++++++++++-------- src/routines.py | 6 +----- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/job/models/arxiv.py b/src/job/models/arxiv.py index c297798..0099f6b 100644 --- a/src/job/models/arxiv.py +++ b/src/job/models/arxiv.py @@ -8,6 +8,7 @@ from dialect_map_schemas import CategoryMembershipSchema from dialect_map_schemas import PaperSchema from dialect_map_schemas import PaperAuthorSchema +from dialect_map_schemas import PaperMetadataSchema @dataclass @@ -74,8 +75,19 @@ class ArxivMetadata: paper_updated_at: datetime @property - def paper_record(self) -> dict: - """Adapts the ArXiv metadata object into a Paper record""" + def paper_metadata(self) -> dict: + """Adapts the ArXiv metadata object into a PaperMetadata record""" + + schema = PaperMetadataSchema() + + return { + schema.paper.name: self._build_paper_record(), + schema.authors.name: self._build_author_records(), + schema.memberships.name: self._build_membership_records(), + } + + def _build_paper_record(self) -> dict: + """Builds an ArXiv paper dictionary out of the ArXiv metadata object""" schema = PaperSchema() @@ -90,9 +102,8 @@ def paper_record(self) -> dict: schema.updated_at.name: self.paper_updated_at.isoformat(), } - @property - def author_records(self) -> Iterable[dict]: - """Adapts the ArXiv metadata object into a list of PaperAuthor records""" + def _build_author_records(self) -> Iterable[dict]: + """Builds an ArXiv author dictionary out of the ArXiv metadata object""" schema = PaperAuthorSchema() @@ -104,9 +115,8 @@ def author_records(self) -> Iterable[dict]: schema.created_at.name: self.paper_created_at.isoformat(), } - @property - def memberships_records(self) -> Iterable[dict]: - """Adapts the ArXiv metadata object into a list of CategoryMembership records""" + def _build_membership_records(self) -> Iterable[dict]: + """Builds an ArXiv category membership dictionary out of the ArXiv metadata object""" schema = CategoryMembershipSchema() diff --git a/src/routines.py b/src/routines.py index 97d14f8..5655850 100644 --- a/src/routines.py +++ b/src/routines.py @@ -120,9 +120,5 @@ def run(self, *args) -> None: for record in records: self.api_controller.create_record( DM_PAPER_METADATA_ROUTE, - { - "paper": record.paper_record, - "authors": record.author_records, - "memberships": record.memberships_records, - }, + record.paper_metadata, )