Skip to content
This repository has been archived by the owner on Sep 3, 2024. It is now read-only.

Commit

Permalink
Merge pull request #59 from dialect-map/dm-io-0.5.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Sinclert authored Aug 26, 2022
2 parents a3f3e38 + 58a9620 commit 5e6075b
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 88 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ This command starts a process that recursively traverses a file system tree of P
sending their metadata to the Dialect Map _private_ API along the way. The process assumes
that each PDF is an ArXiv paper, with their names as their IDs.

| ARGUMENT | ENV VARIABLE | REQUIRED | DESCRIPTION |
|---------------------|-----------------------|----------|------------------------------------------|
| --input-files-path | - | Yes | Path to the list of input PDF files |
| --meta-file-path | - | Yes | Path to the ArXiv metadata JSON file |
| --gcp-key-path | - | Yes | GCP Service account key path |
| --api-url | - | Yes | Private API base URL |
| ARGUMENT | ENV VARIABLE | REQUIRED | DESCRIPTION |
|-----------------------|---------------------|----------|------------------------------------------|
| --input-files-path | - | Yes | Path to the list of input PDF files |
| --input-metadata-uris | - | Yes | URIs to the paper metadata sources |
| --gcp-key-path | - | Yes | GCP Service account key path |
| --output-api-url | - | Yes | Private API base URL |


[ci-status-badge]: https://github.com/dialect-map/dialect-map-job-text/actions/workflows/ci.yml/badge.svg?branch=main
Expand Down
2 changes: 1 addition & 1 deletion reqs/requirements-prod.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ pytz==2021.3
feedparser==6.0.8

# Private packages
git+ssh://[email protected]/dialect-map/dialect-map-io.git@v0.4.0#egg=dialect-map-io[gcp]
git+ssh://[email protected]/dialect-map/dialect-map-io.git@v0.5.1#egg=dialect-map-io[gcp]
git+ssh://[email protected]/dialect-map/[email protected]#egg=dialect-map-schemas
2 changes: 2 additions & 0 deletions src/job/input/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

from .content import *
from .metadata import *

from .helpers import init_source_cls
10 changes: 5 additions & 5 deletions src/job/input/content/corpus.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
# -*- coding: utf-8 -*-

from dialect_map_io import PDFTextParser
from dialect_map_io import PDFFileHandler


class PDFCorpusSource:
"""File corpus source for the PDFs content"""

def __init__(self, parser: PDFTextParser):
def __init__(self, handler: PDFFileHandler):
"""
Initializes the corpus operator with a given parser
:param parser: object to parse the PDF files
:param handler: object to handle PDF files
"""

self.parser = parser
self.handler = handler

def extract_txt(self, file_path: str) -> str:
"""
Expand All @@ -21,4 +21,4 @@ def extract_txt(self, file_path: str) -> str:
:return: file text
"""

return self.parser.parse_file(file_path)
return self.handler.read_file(file_path)
53 changes: 53 additions & 0 deletions src/job/input/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-

from urllib.request import Request as URI

from dialect_map_io import BaseHandler
from dialect_map_io import ArxivAPIHandler
from dialect_map_io import JSONFileHandler

from .metadata import *
from ..parsers import FeedMetadataParser
from ..parsers import JSONMetadataParser


# Recognized metadata source types, keyed off the URI scheme in init_source_cls
SOURCE_TYPE_API = "api"
SOURCE_TYPE_FILE = "file"

# Maps each source type to the trio of classes needed to build a metadata
# source: the I/O handler class (validated against the caller-provided
# handler), the parser class (instantiated fresh), and the source wrapper.
SOURCE_TYPE_MAPPINGS = {
    SOURCE_TYPE_API: {
        "handler_cls": ArxivAPIHandler,
        "parser_cls": FeedMetadataParser,
        "source_cls": ArxivMetadataSource,
    },
    SOURCE_TYPE_FILE: {
        "handler_cls": JSONFileHandler,
        "parser_cls": JSONMetadataParser,
        "source_cls": JSONMetadataSource,
    },
}


def init_source_cls(uri: URI, handler: BaseHandler) -> BaseMetadataSource:
    """
    Builds the metadata source instance matching the provided URI scheme
    :param uri: URI whose scheme selects the source type
    :param handler: handler instance used to retrieve the metadata
    :return: initialized metadata source instance
    :raises ValueError: if the URI scheme maps to no known source type
    :raises TypeError: if the handler does not match the expected handler class
    """

    # NOTE: urllib's Request exposes the URL scheme as `type`
    # and the path component as `selector`
    if uri.type == "file":
        classes = SOURCE_TYPE_MAPPINGS[SOURCE_TYPE_FILE]
        kwargs = {"file_path": uri.selector}
    elif uri.type in ("http", "https"):
        classes = SOURCE_TYPE_MAPPINGS[SOURCE_TYPE_API]
        kwargs = {}
    else:
        raise ValueError("Source not specified for the provided URI")

    handler_cls = classes["handler_cls"]
    parser_cls = classes["parser_cls"]
    source_cls = classes["source_cls"]

    # Explicit check instead of `assert`: assertions are stripped when
    # Python runs with the -O flag, which would skip this validation
    if not isinstance(handler, handler_cls):
        raise TypeError(
            f"Handler of type {type(handler).__name__} does not match "
            f"the expected {handler_cls.__name__} for {uri.type!r} sources"
        )

    return source_cls(handler=handler, parser=parser_cls(), **kwargs)
4 changes: 2 additions & 2 deletions src/job/input/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

from .base import BaseMetadataSource

from .api import ApiMetadataSource
from .file import FileMetadataSource
from .api import ArxivMetadataSource
from .file import JSONMetadataSource
12 changes: 6 additions & 6 deletions src/job/input/metadata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from typing import List
from dialect_map_io import ArxivInputAPI
from dialect_map_io import ArxivAPIHandler

from .base import BaseMetadataSource
from ...models import ArxivMetadata
Expand All @@ -12,17 +12,17 @@
logger = logging.getLogger()


class ApiMetadataSource(BaseMetadataSource):
class ArxivMetadataSource(BaseMetadataSource):
"""ArXiv API source for the metadata information"""

def __init__(self, api: ArxivInputAPI, parser: FeedMetadataParser):
def __init__(self, handler: ArxivAPIHandler, parser: FeedMetadataParser):
"""
Initializes the metadata operator with a given API and parser
:param api: object to retrieve the ArXiv metadata feed
:param handler: object to retrieve the ArXiv metadata feed
:param parser: object to parse the ArXiv metadata feed
"""

self.api = api
self.handler = handler
self.parser = parser

def get_metadata(self, paper_id: str) -> List[ArxivMetadata]:
Expand All @@ -35,7 +35,7 @@ def get_metadata(self, paper_id: str) -> List[ArxivMetadata]:
meta = []

try:
feed = self.api.request_paper(paper_id)
feed = self.handler.request_metadata(paper_id)
except ConnectionError:
logger.error(f"Paper {paper_id} not found in the ArXiv export API")
else:
Expand Down
21 changes: 11 additions & 10 deletions src/job/input/metadata/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from typing import List
from dialect_map_io import LocalDataFile
from dialect_map_io import JSONFileHandler

from .base import BaseMetadataSource
from ...models import ArxivMetadata
Expand All @@ -12,28 +12,29 @@
logger = logging.getLogger()


class FileMetadataSource(BaseMetadataSource):
class JSONMetadataSource(BaseMetadataSource):
"""JSON file source for the metadata information"""

def __init__(self, file: LocalDataFile, parser: JSONMetadataParser):
def __init__(self, handler: JSONFileHandler, parser: JSONMetadataParser, file_path: str):
"""
Initializes the metadata operator with a given JSON parser
:param file: local data file with the metadata to iterate on
:param handler: local data file with the metadata to iterate on
:param parser: object to parse the metadata entries
:param file_path: path to the metadata file
"""

self.handler = handler
self.parser = parser
self.entries = self._build_metadata_file(file)
self.entries = self._build_metadata_file(file_path)

@staticmethod
def _build_metadata_file(file: LocalDataFile) -> dict:
def _build_metadata_file(self, file_path: str) -> dict:
"""
Builds a ID - JSON dictionary after iterating the provided metadata file
:param file: local data file with the metadata to iterate on
Builds an ID - JSON dictionary after iterating the provided metadata file
:param file_path: path to the metadata file
:return: ID - JSON dictionary
"""

return {json["id"]: json for json in file.iter_items()}
return {json["id"]: json for json in self.handler.read_items(file_path)}

def get_metadata(self, paper_id: str) -> List[ArxivMetadata]:
"""
Expand Down
12 changes: 6 additions & 6 deletions src/job/output/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

from dialect_map_io import RestOutputAPI
from dialect_map_io import DialectMapAPIHandler
from dialect_map_schemas import APIRoute

logger = logging.getLogger()
Expand All @@ -11,13 +11,13 @@
class DialectMapOperator:
"""Class to operate on the Dialect map API"""

def __init__(self, api_object: RestOutputAPI):
def __init__(self, api_handler: DialectMapAPIHandler):
"""
Initializes the Dialect map API operator object
:param api_object: Dialect map API instantiated object
:param api_handler: Dialect map API instantiated object
"""

self.api_object = api_object
self.api_handler = api_handler

def _create(self, api_path: str, record: dict) -> None:
"""
Expand All @@ -27,7 +27,7 @@ def _create(self, api_path: str, record: dict) -> None:
"""

try:
self.api_object.create_record(api_path, record)
self.api_handler.create_record(api_path, record)
except Exception as error:
logger.error(f"Cannot create record: {record}")
logger.error(f"Error: {error}")
Expand All @@ -41,7 +41,7 @@ def _archive(self, api_path: str, record_id: str) -> None:
"""

try:
self.api_object.archive_record(f"{api_path}/{record_id}")
self.api_handler.archive_record(f"{api_path}/{record_id}")
except Exception as error:
logger.error(f"Cannot archive record with ID: {record_id}")
logger.error(f"Error: {error}")
Expand Down
10 changes: 5 additions & 5 deletions src/job/output/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@
import logging

from pathlib import Path
from dialect_map_io import BaseFileWriter
from dialect_map_io import BaseFileHandler

logger = logging.getLogger()


class LocalFileOperator:
"""Class to write on local file system files"""

def __init__(self, destination: str, file_writer: BaseFileWriter):
def __init__(self, destination: str, file_handler: BaseFileHandler):
"""
Initializes the local file system operator object
:param destination: folder to create the files
:param file_writer: writer to dump the files content
:param file_handler: file to dump the content
"""

self.destination = destination
self.file_writer = file_writer
self.file_handler = file_handler

def _build_path(self, file_name: str) -> Path:
"""
Expand All @@ -43,7 +43,7 @@ def write_text(self, file_name: str, text: str) -> None:
logging.warning(f"File {file_path} already exists")
return

self.file_writer.write_file(
self.file_handler.write_file(
file_path=str(file_path),
content=text,
)
53 changes: 17 additions & 36 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,12 @@
from click import Path

from dialect_map_gcp.auth import OpenIDAuthenticator
from dialect_map_io.data_input import ArxivInputAPI
from dialect_map_io.data_input import LocalDataFile
from dialect_map_io.data_output import RestOutputAPI
from dialect_map_io.parsers import JSONDataParser
from dialect_map_io.parsers import PDFTextParser
from dialect_map_io.handlers import DialectMapAPIHandler
from dialect_map_io.handlers import PDFFileHandler

from job.files import FileSystemIterator
from job.input import ApiMetadataSource
from job.input import FileMetadataSource
from job.input import PDFCorpusSource
from job.output import DialectMapOperator
from job.parsers import FeedMetadataParser
from job.parsers import JSONMetadataParser
from logs import setup_logger
from routines import LocalTextRoutine
from routines import MetadataRoutine
Expand Down Expand Up @@ -71,10 +64,10 @@ def text_job(input_files_path: str, output_files_path: str):
files_iterator = FileSystemIterator(input_files_path, ".pdf")

# Initialize PDF reader
pdf_parser = PDFTextParser()
pdf_reader = PDFCorpusSource(pdf_parser)
pdf_handler = PDFFileHandler()
pdf_source = PDFCorpusSource(pdf_handler)

routine = LocalTextRoutine(files_iterator, pdf_reader)
routine = LocalTextRoutine(files_iterator, pdf_source)
routine.run(output_files_path)


Expand All @@ -90,14 +83,12 @@ def text_job(input_files_path: str, output_files_path: str):
),
)
@click.option(
"--metadata-file-path",
help="JSON metadata file local path",
"--input-metadata-uris",
help="URIs to the paper metadata sources",
default=["https://export.arxiv.org/api"],
required=False,
type=Path(
exists=True,
file_okay=True,
dir_okay=False,
),
multiple=True,
type=str,
)
@click.option(
"--gcp-key-path",
Expand All @@ -110,40 +101,30 @@ def text_job(input_files_path: str, output_files_path: str):
),
)
@click.option(
"--api-url",
"--output-api-url",
help="Private API base URL",
required=True,
type=str,
)
def metadata_job(
input_files_path: str,
metadata_file_path: str,
input_metadata_uris: list,
gcp_key_path: str,
api_url: str,
output_api_url: str,
):
"""Iterates on all PDF papers and send their metadata to the specified API"""

# Initialize file iterator
file_iter = FileSystemIterator(input_files_path, ".pdf")

# Initialize API controller
api_auth = OpenIDAuthenticator(gcp_key_path, api_url)
api_conn = RestOutputAPI(api_url, api_auth)
api_auth = OpenIDAuthenticator(gcp_key_path, target_url=output_api_url)
api_conn = DialectMapAPIHandler(api_auth, base_url=output_api_url)
api_ctl = DialectMapOperator(api_conn)

# Initialize metadata sources
file_source = FileMetadataSource(
LocalDataFile(metadata_file_path, JSONDataParser()),
JSONMetadataParser(),
)
api_source = ApiMetadataSource(
ArxivInputAPI("https://export.arxiv.org/api"),
FeedMetadataParser(),
)

# Initialize and run routine
routine = MetadataRoutine(file_iter, api_ctl)
routine.add_source(file_source)
routine.add_source(api_source)
routine.add_sources(input_metadata_uris)
routine.run()


Expand Down
Loading

0 comments on commit 5e6075b

Please sign in to comment.