Skip to content
This repository has been archived by the owner on Sep 3, 2024. It is now read-only.

Commit

Permalink
Merge pull request #59 from dialect-map/dm-io-0.5.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Sinclert authored Aug 26, 2022
2 parents a3f3e38 + 58a9620 commit 5e6075b
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 88 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ This command starts a process that recursively traverses a file system tree of P
sending their metadata to the Dialect Map _private_ API along the way. The process assumes
that each PDF is an ArXiv paper, with their names as their IDs.

| ARGUMENT | ENV VARIABLE | REQUIRED | DESCRIPTION |
|---------------------|-----------------------|----------|------------------------------------------|
| --input-files-path | - | Yes | Path to the list of input PDF files |
| --meta-file-path | - | Yes | Path to the ArXiv metadata JSON file |
| --gcp-key-path | - | Yes | GCP Service account key path |
| --api-url | - | Yes | Private API base URL |
| ARGUMENT | ENV VARIABLE | REQUIRED | DESCRIPTION |
|-----------------------|---------------------|----------|------------------------------------------|
| --input-files-path | - | Yes | Path to the list of input PDF files |
| --input-metadata-uris | - | Yes | URIs to the paper metadata sources |
| --gcp-key-path | - | Yes | GCP Service account key path |
| --output-api-url | - | Yes | Private API base URL |


[ci-status-badge]: https://github.com/dialect-map/dialect-map-job-text/actions/workflows/ci.yml/badge.svg?branch=main
Expand Down
2 changes: 1 addition & 1 deletion reqs/requirements-prod.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ pytz==2021.3
feedparser==6.0.8

# Private packages
git+ssh://[email protected]/dialect-map/dialect-map-io.git@v0.4.0#egg=dialect-map-io[gcp]
git+ssh://[email protected]/dialect-map/dialect-map-io.git@v0.5.1#egg=dialect-map-io[gcp]
git+ssh://[email protected]/dialect-map/[email protected]#egg=dialect-map-schemas
2 changes: 2 additions & 0 deletions src/job/input/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

from .content import *
from .metadata import *

from .helpers import init_source_cls
10 changes: 5 additions & 5 deletions src/job/input/content/corpus.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
# -*- coding: utf-8 -*-

from dialect_map_io import PDFTextParser
from dialect_map_io import PDFFileHandler


class PDFCorpusSource:
"""File corpus source for the PDFs content"""

def __init__(self, parser: PDFTextParser):
def __init__(self, handler: PDFFileHandler):
"""
Initializes the corpus operator with a given parser
:param parser: object to parse the PDF files
:param handler: object to handle PDF files
"""

self.parser = parser
self.handler = handler

def extract_txt(self, file_path: str) -> str:
"""
Expand All @@ -21,4 +21,4 @@ def extract_txt(self, file_path: str) -> str:
:return: file text
"""

return self.parser.parse_file(file_path)
return self.handler.read_file(file_path)
53 changes: 53 additions & 0 deletions src/job/input/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-

from urllib.request import Request as URI

from dialect_map_io import BaseHandler
from dialect_map_io import ArxivAPIHandler
from dialect_map_io import JSONFileHandler

from .metadata import *
from ..parsers import FeedMetadataParser
from ..parsers import JSONMetadataParser


# Recognized metadata source types, keyed off the URI scheme in init_source_cls
SOURCE_TYPE_API = "api"
SOURCE_TYPE_FILE = "file"

# Maps each source type to the trio of classes needed to build a metadata
# source: the I/O handler class (validated against the caller-provided
# handler), the parser class (instantiated fresh), and the source wrapper.
SOURCE_TYPE_MAPPINGS = {
    SOURCE_TYPE_API: {
        "handler_cls": ArxivAPIHandler,
        "parser_cls": FeedMetadataParser,
        "source_cls": ArxivMetadataSource,
    },
    SOURCE_TYPE_FILE: {
        "handler_cls": JSONFileHandler,
        "parser_cls": JSONMetadataParser,
        "source_cls": JSONMetadataSource,
    },
}


def init_source_cls(uri: URI, handler: BaseHandler) -> BaseMetadataSource:
    """
    Builds the metadata source instance matching the provided URI scheme
    :param uri: URI whose scheme selects the source type
    :param handler: handler instance used to retrieve the metadata
    :return: initialized metadata source instance
    :raises ValueError: if the URI scheme maps to no known source type
    :raises TypeError: if the handler does not match the expected handler class
    """

    # NOTE: urllib's Request exposes the URL scheme as `type`
    # and the path component as `selector`
    if uri.type == "file":
        classes = SOURCE_TYPE_MAPPINGS[SOURCE_TYPE_FILE]
        kwargs = {"file_path": uri.selector}
    elif uri.type in ("http", "https"):
        classes = SOURCE_TYPE_MAPPINGS[SOURCE_TYPE_API]
        kwargs = {}
    else:
        raise ValueError("Source not specified for the provided URI")

    handler_cls = classes["handler_cls"]
    parser_cls = classes["parser_cls"]
    source_cls = classes["source_cls"]

    # Explicit check instead of `assert`: assertions are stripped when
    # Python runs with the -O flag, which would skip this validation
    if not isinstance(handler, handler_cls):
        raise TypeError(
            f"Handler of type {type(handler).__name__} does not match "
            f"the expected {handler_cls.__name__} for {uri.type!r} sources"
        )

    return source_cls(handler=handler, parser=parser_cls(), **kwargs)
4 changes: 2 additions & 2 deletions src/job/input/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

from .base import BaseMetadataSource

from .api import ApiMetadataSource
from .file import FileMetadataSource
from .api import ArxivMetadataSource
from .file import JSONMetadataSource
12 changes: 6 additions & 6 deletions src/job/input/metadata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from typing import List
from dialect_map_io import ArxivInputAPI
from dialect_map_io import ArxivAPIHandler

from .base import BaseMetadataSource
from ...models import ArxivMetadata
Expand All @@ -12,17 +12,17 @@
logger = logging.getLogger()


class ApiMetadataSource(BaseMetadataSource):
class ArxivMetadataSource(BaseMetadataSource):
"""ArXiv API source for the metadata information"""

def __init__(self, api: ArxivInputAPI, parser: FeedMetadataParser):
def __init__(self, handler: ArxivAPIHandler, parser: FeedMetadataParser):
"""
Initializes the metadata operator with a given API and parser
:param api: object to retrieve the ArXiv metadata feed
:param handler: object to retrieve the ArXiv metadata feed
:param parser: object to parse the ArXiv metadata feed
"""

self.api = api
self.handler = handler
self.parser = parser

def get_metadata(self, paper_id: str) -> List[ArxivMetadata]:
Expand All @@ -35,7 +35,7 @@ def get_metadata(self, paper_id: str) -> List[ArxivMetadata]:
meta = []

try:
feed = self.api.request_paper(paper_id)
feed = self.handler.request_metadata(paper_id)
except ConnectionError:
logger.error(f"Paper {paper_id} not found in the ArXiv export API")
else:
Expand Down
21 changes: 11 additions & 10 deletions src/job/input/metadata/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from typing import List
from dialect_map_io import LocalDataFile
from dialect_map_io import JSONFileHandler

from .base import BaseMetadataSource
from ...models import ArxivMetadata
Expand All @@ -12,28 +12,29 @@
logger = logging.getLogger()


class FileMetadataSource(BaseMetadataSource):
class JSONMetadataSource(BaseMetadataSource):
"""JSON file source for the metadata information"""

def __init__(self, file: LocalDataFile, parser: JSONMetadataParser):
def __init__(self, handler: JSONFileHandler, parser: JSONMetadataParser, file_path: str):
"""
Initializes the metadata operator with a given JSON parser
:param file: local data file with the metadata to iterate on
:param handler: local data file with the metadata to iterate on
:param parser: object to parse the metadata entries
:param file_path: path to the metadata file
"""

self.handler = handler
self.parser = parser
self.entries = self._build_metadata_file(file)
self.entries = self._build_metadata_file(file_path)

@staticmethod
def _build_metadata_file(file: LocalDataFile) -> dict:
def _build_metadata_file(self, file_path: str) -> dict:
"""
Builds a ID - JSON dictionary after iterating the provided metadata file
:param file: local data file with the metadata to iterate on
Builds an ID - JSON dictionary after iterating the provided metadata file
:param file_path: path to the metadata file
:return: ID - JSON dictionary
"""

return {json["id"]: json for json in file.iter_items()}
return {json["id"]: json for json in self.handler.read_items(file_path)}

def get_metadata(self, paper_id: str) -> List[ArxivMetadata]:
"""
Expand Down
12 changes: 6 additions & 6 deletions src/job/output/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

from dialect_map_io import RestOutputAPI
from dialect_map_io import DialectMapAPIHandler
from dialect_map_schemas import APIRoute

logger = logging.getLogger()
Expand All @@ -11,13 +11,13 @@
class DialectMapOperator:
"""Class to operate on the Dialect map API"""

def __init__(self, api_object: RestOutputAPI):
def __init__(self, api_handler: DialectMapAPIHandler):
"""
Initializes the Dialect map API operator object
:param api_object: Dialect map API instantiated object
:param api_handler: Dialect map API instantiated object
"""

self.api_object = api_object
self.api_handler = api_handler

def _create(self, api_path: str, record: dict) -> None:
"""
Expand All @@ -27,7 +27,7 @@ def _create(self, api_path: str, record: dict) -> None:
"""

try:
self.api_object.create_record(api_path, record)
self.api_handler.create_record(api_path, record)
except Exception as error:
logger.error(f"Cannot create record: {record}")
logger.error(f"Error: {error}")
Expand All @@ -41,7 +41,7 @@ def _archive(self, api_path: str, record_id: str) -> None:
"""

try:
self.api_object.archive_record(f"{api_path}/{record_id}")
self.api_handler.archive_record(f"{api_path}/{record_id}")
except Exception as error:
logger.error(f"Cannot archive record with ID: {record_id}")
logger.error(f"Error: {error}")
Expand Down
10 changes: 5 additions & 5 deletions src/job/output/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@
import logging

from pathlib import Path
from dialect_map_io import BaseFileWriter
from dialect_map_io import BaseFileHandler

logger = logging.getLogger()


class LocalFileOperator:
"""Class to write on local file system files"""

def __init__(self, destination: str, file_writer: BaseFileWriter):
def __init__(self, destination: str, file_handler: BaseFileHandler):
"""
Initializes the local file system operator object
:param destination: folder to create the files
:param file_writer: writer to dump the files content
:param file_handler: file to dump the content
"""

self.destination = destination
self.file_writer = file_writer
self.file_handler = file_handler

def _build_path(self, file_name: str) -> Path:
"""
Expand All @@ -43,7 +43,7 @@ def write_text(self, file_name: str, text: str) -> None:
logging.warning(f"File {file_path} already exists")
return

self.file_writer.write_file(
self.file_handler.write_file(
file_path=str(file_path),
content=text,
)
53 changes: 17 additions & 36 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,12 @@
from click import Path

from dialect_map_gcp.auth import OpenIDAuthenticator
from dialect_map_io.data_input import ArxivInputAPI
from dialect_map_io.data_input import LocalDataFile
from dialect_map_io.data_output import RestOutputAPI
from dialect_map_io.parsers import JSONDataParser
from dialect_map_io.parsers import PDFTextParser
from dialect_map_io.handlers import DialectMapAPIHandler
from dialect_map_io.handlers import PDFFileHandler

from job.files import FileSystemIterator
from job.input import ApiMetadataSource
from job.input import FileMetadataSource
from job.input import PDFCorpusSource
from job.output import DialectMapOperator
from job.parsers import FeedMetadataParser
from job.parsers import JSONMetadataParser
from logs import setup_logger
from routines import LocalTextRoutine
from routines import MetadataRoutine
Expand Down Expand Up @@ -71,10 +64,10 @@ def text_job(input_files_path: str, output_files_path: str):
files_iterator = FileSystemIterator(input_files_path, ".pdf")

# Initialize PDF reader
pdf_parser = PDFTextParser()
pdf_reader = PDFCorpusSource(pdf_parser)
pdf_handler = PDFFileHandler()
pdf_source = PDFCorpusSource(pdf_handler)

routine = LocalTextRoutine(files_iterator, pdf_reader)
routine = LocalTextRoutine(files_iterator, pdf_source)
routine.run(output_files_path)


Expand All @@ -90,14 +83,12 @@ def text_job(input_files_path: str, output_files_path: str):
),
)
@click.option(
"--metadata-file-path",
help="JSON metadata file local path",
"--input-metadata-uris",
help="URIs to the paper metadata sources",
default=["https://export.arxiv.org/api"],
required=False,
type=Path(
exists=True,
file_okay=True,
dir_okay=False,
),
multiple=True,
type=str,
)
@click.option(
"--gcp-key-path",
Expand All @@ -110,40 +101,30 @@ def text_job(input_files_path: str, output_files_path: str):
),
)
@click.option(
"--api-url",
"--output-api-url",
help="Private API base URL",
required=True,
type=str,
)
def metadata_job(
input_files_path: str,
metadata_file_path: str,
input_metadata_uris: list,
gcp_key_path: str,
api_url: str,
output_api_url: str,
):
"""Iterates on all PDF papers and send their metadata to the specified API"""

# Initialize file iterator
file_iter = FileSystemIterator(input_files_path, ".pdf")

# Initialize API controller
api_auth = OpenIDAuthenticator(gcp_key_path, api_url)
api_conn = RestOutputAPI(api_url, api_auth)
api_auth = OpenIDAuthenticator(gcp_key_path, target_url=output_api_url)
api_conn = DialectMapAPIHandler(api_auth, base_url=output_api_url)
api_ctl = DialectMapOperator(api_conn)

# Initialize metadata sources
file_source = FileMetadataSource(
LocalDataFile(metadata_file_path, JSONDataParser()),
JSONMetadataParser(),
)
api_source = ApiMetadataSource(
ArxivInputAPI("https://export.arxiv.org/api"),
FeedMetadataParser(),
)

# Initialize and run routine
routine = MetadataRoutine(file_iter, api_ctl)
routine.add_source(file_source)
routine.add_source(api_source)
routine.add_sources(input_metadata_uris)
routine.run()


Expand Down
Loading

0 comments on commit 5e6075b

Please sign in to comment.