From f45a5450886e36b9cd1e403b0144b4a423335578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:26:45 +0200 Subject: [PATCH 1/7] Bump up dialect-map IO version to 0.5.1 --- reqs/requirements-prod.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reqs/requirements-prod.txt b/reqs/requirements-prod.txt index bc9b718..fbbaea1 100644 --- a/reqs/requirements-prod.txt +++ b/reqs/requirements-prod.txt @@ -4,5 +4,5 @@ pytz==2021.3 feedparser==6.0.8 # Private packages -git+ssh://git@github.com/dialect-map/dialect-map-io.git@v0.4.0#egg=dialect-map-io[gcp] +git+ssh://git@github.com/dialect-map/dialect-map-io.git@v0.5.1#egg=dialect-map-io[gcp] git+ssh://git@github.com/dialect-map/dialect-map-schemas.git@v0.2.0#egg=dialect-map-schemas From 56d4a9b429700e892a3f1dda99d4a828b77e4044 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:28:23 +0200 Subject: [PATCH 2/7] Adapt input/content pkg classes --- src/job/input/content/corpus.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/job/input/content/corpus.py b/src/job/input/content/corpus.py index b957b9d..d8ea73a 100644 --- a/src/job/input/content/corpus.py +++ b/src/job/input/content/corpus.py @@ -1,18 +1,18 @@ # -*- coding: utf-8 -*- -from dialect_map_io import PDFTextParser +from dialect_map_io import PDFFileHandler class PDFCorpusSource: """File corpus source for the PDFs content""" - def __init__(self, parser: PDFTextParser): + def __init__(self, handler: PDFFileHandler): """ Initializes the corpus operator with a given parser - :param parser: object to parse the PDF files + :param handler: object to handle PDF files """ - self.parser = parser + self.handler = handler def extract_txt(self, file_path: str) -> str: """ @@ -21,4 +21,4 @@ def extract_txt(self, file_path: str) -> str: :return: file text """ - return self.parser.parse_file(file_path) + return self.handler.read_file(file_path) From 
dcd1f886b6fd021e030fa06022f881fa36dfb789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:28:36 +0200 Subject: [PATCH 3/7] Adapt input/metadata pkg classes --- src/job/input/metadata/__init__.py | 4 ++-- src/job/input/metadata/api.py | 12 ++++++------ src/job/input/metadata/file.py | 21 +++++++++++---------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/job/input/metadata/__init__.py b/src/job/input/metadata/__init__.py index 5e7d50b..e3e1b9d 100644 --- a/src/job/input/metadata/__init__.py +++ b/src/job/input/metadata/__init__.py @@ -2,5 +2,5 @@ from .base import BaseMetadataSource -from .api import ApiMetadataSource -from .file import FileMetadataSource +from .api import ArxivMetadataSource +from .file import JSONMetadataSource diff --git a/src/job/input/metadata/api.py b/src/job/input/metadata/api.py index ec8cf0d..3ae72e1 100644 --- a/src/job/input/metadata/api.py +++ b/src/job/input/metadata/api.py @@ -3,7 +3,7 @@ import logging from typing import List -from dialect_map_io import ArxivInputAPI +from dialect_map_io import ArxivAPIHandler from .base import BaseMetadataSource from ...models import ArxivMetadata @@ -12,17 +12,17 @@ logger = logging.getLogger() -class ApiMetadataSource(BaseMetadataSource): +class ArxivMetadataSource(BaseMetadataSource): """ArXiv API source for the metadata information""" - def __init__(self, api: ArxivInputAPI, parser: FeedMetadataParser): + def __init__(self, handler: ArxivAPIHandler, parser: FeedMetadataParser): """ Initializes the metadata operator with a given API and parser - :param api: object to retrieve the ArXiv metadata feed + :param handler: object to retrieve the ArXiv metadata feed :param parser: object to parse the ArXiv metadata feed """ - self.api = api + self.handler = handler self.parser = parser def get_metadata(self, paper_id: str) -> List[ArxivMetadata]: @@ -35,7 +35,7 @@ def get_metadata(self, paper_id: str) -> List[ArxivMetadata]: meta = [] try: - 
feed = self.api.request_paper(paper_id) + feed = self.handler.request_metadata(paper_id) except ConnectionError: logger.error(f"Paper {paper_id} not found in the ArXiv export API") else: diff --git a/src/job/input/metadata/file.py b/src/job/input/metadata/file.py index cfa7f02..b086663 100644 --- a/src/job/input/metadata/file.py +++ b/src/job/input/metadata/file.py @@ -3,7 +3,7 @@ import logging from typing import List -from dialect_map_io import LocalDataFile +from dialect_map_io import JSONFileHandler from .base import BaseMetadataSource from ...models import ArxivMetadata @@ -12,28 +12,29 @@ logger = logging.getLogger() -class FileMetadataSource(BaseMetadataSource): +class JSONMetadataSource(BaseMetadataSource): """JSON file source for the metadata information""" - def __init__(self, file: LocalDataFile, parser: JSONMetadataParser): + def __init__(self, handler: JSONFileHandler, parser: JSONMetadataParser, file_path: str): """ Initializes the metadata operator with a given JSON parser - :param file: local data file with the metadata to iterate on + :param handler: local data file with the metadata to iterate on :param parser: object to parse the metadata entries + :param file_path: path to the metadata file """ + self.handler = handler self.parser = parser - self.entries = self._build_metadata_file(file) + self.entries = self._build_metadata_file(file_path) - @staticmethod - def _build_metadata_file(file: LocalDataFile) -> dict: + def _build_metadata_file(self, file_path: str) -> dict: """ - Builds a ID - JSON dictionary after iterating the provided metadata file - :param file: local data file with the metadata to iterate on + Builds an ID - JSON dictionary after iterating the provided metadata file + :param file_path: path to the metadata file :return: ID - JSON dictionary """ - return {json["id"]: json for json in file.iter_items()} + return {json["id"]: json for json in self.handler.read_items(file_path)} def get_metadata(self, paper_id: str) -> 
List[ArxivMetadata]: """ From 4702944c52a749fc89f5c914f07d04170f2eab25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:29:20 +0200 Subject: [PATCH 4/7] Adapt output pkg classes --- src/job/output/api.py | 12 ++++++------ src/job/output/files.py | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/job/output/api.py b/src/job/output/api.py index 81bf152..55f2c39 100644 --- a/src/job/output/api.py +++ b/src/job/output/api.py @@ -2,7 +2,7 @@ import logging -from dialect_map_io import RestOutputAPI +from dialect_map_io import DialectMapAPIHandler from dialect_map_schemas import APIRoute logger = logging.getLogger() @@ -11,13 +11,13 @@ class DialectMapOperator: """Class to operate on the Dialect map API""" - def __init__(self, api_object: RestOutputAPI): + def __init__(self, api_handler: DialectMapAPIHandler): """ Initializes the Dialect map API operator object - :param api_object: Dialect map API instantiated object + :param api_handler: Dialect map API instantiated object """ - self.api_object = api_object + self.api_handler = api_handler def _create(self, api_path: str, record: dict) -> None: """ @@ -27,7 +27,7 @@ def _create(self, api_path: str, record: dict) -> None: """ try: - self.api_object.create_record(api_path, record) + self.api_handler.create_record(api_path, record) except Exception as error: logger.error(f"Cannot create record: {record}") logger.error(f"Error: {error}") @@ -41,7 +41,7 @@ def _archive(self, api_path: str, record_id: str) -> None: """ try: - self.api_object.archive_record(f"{api_path}/{record_id}") + self.api_handler.archive_record(f"{api_path}/{record_id}") except Exception as error: logger.error(f"Cannot archive record with ID: {record_id}") logger.error(f"Error: {error}") diff --git a/src/job/output/files.py b/src/job/output/files.py index 4d775e6..52a26ae 100644 --- a/src/job/output/files.py +++ b/src/job/output/files.py @@ -3,7 +3,7 @@ import logging from pathlib 
import Path -from dialect_map_io import BaseFileWriter +from dialect_map_io import BaseFileHandler logger = logging.getLogger() @@ -11,15 +11,15 @@ class LocalFileOperator: """Class to write on local file system files""" - def __init__(self, destination: str, file_writer: BaseFileWriter): + def __init__(self, destination: str, file_handler: BaseFileHandler): """ Initializes the local file system operator object :param destination: folder to create the files - :param file_writer: writer to dump the files content + :param file_handler: file to dump the content """ self.destination = destination - self.file_writer = file_writer + self.file_handler = file_handler def _build_path(self, file_name: str) -> Path: """ @@ -43,7 +43,7 @@ def write_text(self, file_name: str, text: str) -> None: logging.warning(f"File {file_path} already exists") return - self.file_writer.write_file( + self.file_handler.write_file( file_path=str(file_path), content=text, ) From 12ed58894667551cc680bcbc2c14e75c66e9ffda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:29:44 +0200 Subject: [PATCH 5/7] Define input pkg helper functions --- src/job/input/__init__.py | 2 ++ src/job/input/helpers.py | 53 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 src/job/input/helpers.py diff --git a/src/job/input/__init__.py b/src/job/input/__init__.py index 1bd2576..8bd6ec4 100644 --- a/src/job/input/__init__.py +++ b/src/job/input/__init__.py @@ -2,3 +2,5 @@ from .content import * from .metadata import * + +from .helpers import init_source_cls diff --git a/src/job/input/helpers.py b/src/job/input/helpers.py new file mode 100644 index 0000000..26b660e --- /dev/null +++ b/src/job/input/helpers.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +from urllib.request import Request as URI + +from dialect_map_io import BaseHandler +from dialect_map_io import ArxivAPIHandler +from dialect_map_io import JSONFileHandler + +from .metadata 
import * +from ..parsers import FeedMetadataParser +from ..parsers import JSONMetadataParser + + +SOURCE_TYPE_API = "api" +SOURCE_TYPE_FILE = "file" + +SOURCE_TYPE_MAPPINGS = { + SOURCE_TYPE_API: { + "handler_cls": ArxivAPIHandler, + "parser_cls": FeedMetadataParser, + "source_cls": ArxivMetadataSource, + }, + SOURCE_TYPE_FILE: { + "handler_cls": JSONFileHandler, + "parser_cls": JSONMetadataParser, + "source_cls": JSONMetadataSource, + }, +} + + +def init_source_cls(uri: URI, handler: BaseHandler) -> BaseMetadataSource: + """ + Returns a source class depending on the provided URI + :param uri: URI to get the source class from + :param handler: handler to get the metadata from + :return: source class + """ + + if uri.type in {"file"}: + classes = SOURCE_TYPE_MAPPINGS[SOURCE_TYPE_FILE] + kwargs = {"file_path": uri.selector} + elif uri.type in {"http", "https"}: + classes = SOURCE_TYPE_MAPPINGS[SOURCE_TYPE_API] + kwargs = {} + else: + raise ValueError("Source not specified for the provided URI") + + handler_cls = classes["handler_cls"] + parser_cls = classes["parser_cls"] + source_cls = classes["source_cls"] + + assert isinstance(handler, handler_cls) + return source_cls(handler=handler, parser=parser_cls(), **kwargs) From 09d79fbb6e3f5d02f0dee4947b2deb6d389f86d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:30:52 +0200 Subject: [PATCH 6/7] Update main.py and routines.py modules --- src/main.py | 53 ++++++++++++++++--------------------------------- src/routines.py | 29 +++++++++++++++++---------- 2 files changed, 35 insertions(+), 47 deletions(-) diff --git a/src/main.py b/src/main.py index 19feb37..b0b040d 100644 --- a/src/main.py +++ b/src/main.py @@ -6,19 +6,12 @@ from click import Path from dialect_map_gcp.auth import OpenIDAuthenticator -from dialect_map_io.data_input import ArxivInputAPI -from dialect_map_io.data_input import LocalDataFile -from dialect_map_io.data_output import RestOutputAPI -from 
dialect_map_io.parsers import JSONDataParser -from dialect_map_io.parsers import PDFTextParser +from dialect_map_io.handlers import DialectMapAPIHandler +from dialect_map_io.handlers import PDFFileHandler from job.files import FileSystemIterator -from job.input import ApiMetadataSource -from job.input import FileMetadataSource from job.input import PDFCorpusSource from job.output import DialectMapOperator -from job.parsers import FeedMetadataParser -from job.parsers import JSONMetadataParser from logs import setup_logger from routines import LocalTextRoutine from routines import MetadataRoutine @@ -71,10 +64,10 @@ def text_job(input_files_path: str, output_files_path: str): files_iterator = FileSystemIterator(input_files_path, ".pdf") # Initialize PDF reader - pdf_parser = PDFTextParser() - pdf_reader = PDFCorpusSource(pdf_parser) + pdf_handler = PDFFileHandler() + pdf_source = PDFCorpusSource(pdf_handler) - routine = LocalTextRoutine(files_iterator, pdf_reader) + routine = LocalTextRoutine(files_iterator, pdf_source) routine.run(output_files_path) @@ -90,14 +83,12 @@ def text_job(input_files_path: str, output_files_path: str): ), ) @click.option( - "--metadata-file-path", - help="JSON metadata file local path", + "--input-metadata-uris", + help="URIs to the paper metadata sources", + default=["https://export.arxiv.org/api"], required=False, - type=Path( - exists=True, - file_okay=True, - dir_okay=False, - ), + multiple=True, + type=str, ) @click.option( "--gcp-key-path", @@ -110,16 +101,16 @@ def text_job(input_files_path: str, output_files_path: str): ), ) @click.option( - "--api-url", + "--output-api-url", help="Private API base URL", required=True, type=str, ) def metadata_job( input_files_path: str, - metadata_file_path: str, + input_metadata_uris: list, gcp_key_path: str, - api_url: str, + output_api_url: str, ): """Iterates on all PDF papers and send their metadata to the specified API""" @@ -127,23 +118,13 @@ def metadata_job( file_iter = 
FileSystemIterator(input_files_path, ".pdf") # Initialize API controller - api_auth = OpenIDAuthenticator(gcp_key_path, api_url) - api_conn = RestOutputAPI(api_url, api_auth) + api_auth = OpenIDAuthenticator(gcp_key_path, target_url=output_api_url) + api_conn = DialectMapAPIHandler(api_auth, base_url=output_api_url) api_ctl = DialectMapOperator(api_conn) - # Initialize metadata sources - file_source = FileMetadataSource( - LocalDataFile(metadata_file_path, JSONDataParser()), - JSONMetadataParser(), - ) - api_source = ApiMetadataSource( - ArxivInputAPI("https://export.arxiv.org/api"), - FeedMetadataParser(), - ) - + # Initialize and run routine routine = MetadataRoutine(file_iter, api_ctl) - routine.add_source(file_source) - routine.add_source(api_source) + routine.add_sources(input_metadata_uris) routine.run() diff --git a/src/routines.py b/src/routines.py index 5655850..dd0a1a8 100644 --- a/src/routines.py +++ b/src/routines.py @@ -5,13 +5,15 @@ from abc import ABC from abc import abstractmethod from typing import List +from urllib.request import Request as URI -from dialect_map_io.data_output import TextFileWriter +from dialect_map_io.handlers import TextFileHandler +from dialect_map_io.handlers import init_handler_cls from dialect_map_schemas.routes import DM_PAPER_METADATA_ROUTE from job.files import FileSystemIterator -from job.input import BaseMetadataSource from job.input import PDFCorpusSource +from job.input import init_source_cls from job.models import ArxivMetadata from job.output import DialectMapOperator from job.output import LocalFileOperator @@ -35,15 +37,15 @@ def run(self, destination_path: str) -> None: class LocalTextRoutine(BaseRoutine): """Routine extracting local ArXiv corpus texts""" - def __init__(self, file_iter: FileSystemIterator, pdf_reader: PDFCorpusSource): + def __init__(self, file_iter: FileSystemIterator, pdf_source: PDFCorpusSource): """ Initializes the local ArXiv corpus text extraction routine :param file_iter: Local file system 
iterator - :param pdf_reader: PDF file corpus source + :param pdf_source: PDF file corpus source """ self.file_iter = file_iter - self.pdf_reader = pdf_reader + self.pdf_source = pdf_source def run(self, destination_path: str) -> None: """ @@ -57,11 +59,11 @@ def run(self, destination_path: str) -> None: output_path = f"{destination_path}/{path_diff}" # Initialize TXT file writer - txt_writer = TextFileWriter() - txt_operator = LocalFileOperator(output_path, txt_writer) + txt_handler = TextFileHandler() + txt_operator = LocalFileOperator(output_path, txt_handler) # Save paper contents - txt_content = self.pdf_reader.extract_txt(file_path) + txt_content = self.pdf_source.extract_txt(file_path) txt_operator.write_text(file_name, txt_content) @@ -95,13 +97,18 @@ def _get_metadata_records(self, paper_id: str) -> List[ArxivMetadata]: return metadata_records - def add_source(self, source: BaseMetadataSource) -> None: + def add_sources(self, metadata_uris: List[str]) -> None: """ Adds an ArXiv metadata source to the list of sources - :param source: metadata source to extract ArXiv metadata from + :param metadata_uris: URIs to extract ArXiv metadata from """ - self.sources.append(source) + for uri in metadata_uris: + uri_obj = URI(uri) + handler = init_handler_cls(uri_obj) + source = init_source_cls(uri_obj, handler) + + self.sources.append(source) def run(self, *args) -> None: """ From 58a962097a1e6c84001e42692daa46a7fa965de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinclert=20P=C3=A9rez?= Date: Fri, 26 Aug 2022 10:31:19 +0200 Subject: [PATCH 7/7] Update metadata job documentation --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 71b3450..368a23d 100644 --- a/README.md +++ b/README.md @@ -65,12 +65,12 @@ This command starts a process that recursively traverses a file system tree of P sending their metadata to the Dialect Map _private_ API along the way. 
The process assumes that each PDF is an ArXiv paper, with their names as their IDs. -| ARGUMENT | ENV VARIABLE | REQUIRED | DESCRIPTION | -|---------------------|-----------------------|----------|------------------------------------------| -| --input-files-path | - | Yes | Path to the list of input PDF files | -| --meta-file-path | - | Yes | Path to the ArXiv metadata JSON file | -| --gcp-key-path | - | Yes | GCP Service account key path | -| --api-url | - | Yes | Private API base URL | +| ARGUMENT | ENV VARIABLE | REQUIRED | DESCRIPTION | +|-----------------------|---------------------|----------|------------------------------------------| +| --input-files-path | - | Yes | Path to the list of input PDF files | +| --input-metadata-uris | - | No | URIs to the paper metadata sources | +| --gcp-key-path | - | Yes | GCP Service account key path | +| --output-api-url | - | Yes | Private API base URL | [ci-status-badge]: https://github.com/dialect-map/dialect-map-job-text/actions/workflows/ci.yml/badge.svg?branch=main