From fdc6892dbe007a530700e028621f0ee59e5010f9 Mon Sep 17 00:00:00 2001 From: yshalenyk Date: Thu, 8 Aug 2024 22:08:13 +0300 Subject: [PATCH] feat: map codelists value from codelists mapping template file --- nightingale/cli.py | 58 +++++++++++--------- nightingale/config.py | 1 + nightingale/mapper.py | 27 +++++++-- nightingale/mapping_template/v09/__init__.py | 3 +- 4 files changed, 56 insertions(+), 33 deletions(-) diff --git a/nightingale/cli.py b/nightingale/cli.py index d21225f..36598cc 100644 --- a/nightingale/cli.py +++ b/nightingale/cli.py @@ -58,6 +58,7 @@ def load_config(config_file): ) @click.option("--datasource", type=str, help="Datasource connection string") @click.option("--mapping-file", type=click_pathlib.Path(exists=True), help="Mapping file path") +@click.option("--codelists-file", type=click_pathlib.Path(exists=True), help="Codelists mapping file path") @click.option("--ocid-prefix", type=str, help="OCID prefix") @click.option("--selector", type=click_pathlib.Path(exists=True), help="Path to selector SQL script") @click.option("--force-publish", is_flag=True, help="Force publish") @@ -76,6 +77,7 @@ def run( loglevel, datasource, mapping_file, + codelists_file, ocid_prefix, selector, force_publish, @@ -101,37 +103,39 @@ def run( try: logger.debug(f"Loading configuration from {config_file}") - config_data = load_config(config_file) + config_data = {} + if config_file: + config_data = load_config(config_file) # Apply CLI overrides if datasource: config_data["datasource"] = {"connection": datasource} - if mapping_file or ocid_prefix or selector or force_publish: - selector_content = config_data["mapping"]["selector"] - if selector: - try: - with open(selector, "r") as f: - selector_content = f.read() - except (OSError, IOError) as e: - raise click.ClickException(f"Error reading selector file {selector}: {e}") - config_data["mapping"] = { - "file": mapping_file or config_data["mapping"]["file"], - "ocid_prefix": ocid_prefix or config_data["mapping"]["ocid_prefix"], - "selector": selector_content, - "force_publish": force_publish - if force_publish is not None - else config_data["mapping"].get("force_publish", False), - } - if publisher or base_uri or version or publisher_uid or publisher_scheme or publisher_uri or extensions: - config_data["publishing"] = { - "publisher": publisher or config_data["publishing"]["publisher"], - "base_uri": base_uri or config_data["publishing"]["base_uri"], - "version": version or config_data["publishing"].get("version", ""), - "publisher_uid": publisher_uid or config_data["publishing"].get("publisher_uid", ""), - "publisher_scheme": publisher_scheme or config_data["publishing"].get("publisher_scheme", ""), - "publisher_uri": publisher_uri or config_data["publishing"].get("publisher_uri", ""), - "extensions": list(extensions) if extensions else config_data["publishing"].get("extensions", []), - } + selector_content = config_data["mapping"]["selector"] + if selector: + try: + with open(selector, "r") as f: + selector_content = f.read() + except (OSError, IOError) as e: + raise click.ClickException(f"Error reading selector file {selector}: {e}") + # TODO: simplify this + config_data["mapping"] = { + "file": mapping_file or config_data["mapping"]["file"], + "codelists": codelists_file or config_data["mapping"]["codelists"], + "ocid_prefix": ocid_prefix or config_data["mapping"]["ocid_prefix"], + "selector": selector_content, + "force_publish": force_publish + if force_publish is not None + else config_data["mapping"].get("force_publish", False), + } + config_data["publishing"] = { + "publisher": publisher or config_data["publishing"]["publisher"], + "base_uri": base_uri or config_data["publishing"]["base_uri"], + "version": version or config_data["publishing"].get("version", ""), + "publisher_uid": publisher_uid or config_data["publishing"].get("publisher_uid", ""), + "publisher_scheme": publisher_scheme or config_data["publishing"].get("publisher_scheme", ""), + "publisher_uri": publisher_uri or config_data["publishing"].get("publisher_uri", ""), + "extensions": list(extensions) if extensions else config_data["publishing"].get("extensions", []), + } if output_directory: config_data["output"] = {"directory": output_directory} diff --git a/nightingale/config.py b/nightingale/config.py index c246a55..3596bc9 100644 --- a/nightingale/config.py +++ b/nightingale/config.py @@ -36,6 +36,7 @@ class Mapping: ocid_prefix: str selector: str force_publish: Optional[bool] = False + codelists: Optional[Path] = None @dataclass(frozen=True) diff --git a/nightingale/mapper.py b/nightingale/mapper.py index ab42d29..f46debb 100644 --- a/nightingale/mapper.py +++ b/nightingale/mapper.py @@ -1,8 +1,9 @@ import logging -from typing import Any +from typing import Any, Optional import dict_hash +from .codelists import CodelistsMapping from .config import Config from .mapping_template.v09 import MappingTemplate, MappingTemplateValidator from .utils import get_iso_now, is_new_array, remove_dicts_without_id @@ -50,6 +51,9 @@ def map(self, loader: Any, validate_mapping: bool = False) -> list[dict[str, Any """ config = self.config.mapping mapping = MappingTemplate(config) + codelists = None + if config.codelists: + codelists = CodelistsMapping(config) logger.info("MappingTemplate data loaded") data = loader.load(config.selector) logger.info("Source data is loaded...") @@ -59,9 +63,11 @@ def map(self, loader: Any, validate_mapping: bool = False) -> list[dict[str, Any validator.validate_data_elements() validator.validate_selector(data[0]) logger.info("Start mapping data") - return self.transform_data(data, mapping) + return self.transform_data(data, mapping, codelists=codelists) - def transform_data(self, data: list[dict[Any, Any]], mapping: MappingTemplate) -> list[dict[str, Any]]: + def transform_data( + self, data: list[dict[Any, Any]], mapping: MappingTemplate, codelists: Optional[CodelistsMapping] = None + ) -> list[dict[str, Any]]: """ Transform the input data to the OCDS format. @@ -89,7 +95,7 @@ def transform_data(self, data: list[dict[Any, Any]], mapping: MappingTemplate) - curr_ocid = ocid curr_release = {} - curr_release = self.transform_row(row, mapping, mapping.get_schema(), curr_release) + curr_release = self.transform_row(row, mapping, mapping.get_schema(), curr_release, codelists=codelists) if curr_release: self.finish_release(curr_ocid, curr_release, mapped) @@ -111,6 +117,7 @@ def transform_row( mapping_config: MappingTemplate, flattened_schema: dict[str, Any], result: dict = None, + codelists: Optional[CodelistsMapping] = None, ) -> dict: """ Transform a single row of input data to the OCDS format. @@ -128,6 +135,7 @@ def transform_row( """ def set_nested_value(nested_dict, keys, value, schema, add_new=False): + value = self.map_codelist_value(keys, schema, codelists, value) for i, key in enumerate(keys[:-1]): if isinstance(nested_dict, list): if not nested_dict: @@ -200,7 +208,7 @@ def set_nested_value(nested_dict, keys, value, schema, add_new=False): current[key] = [] if flattened_schema.get(key_path, {}).get("type") == "array" else {} current = current[key] - + value = self.map_codelist_value(keys, flattened_schema, codelists, value) if isinstance(current, list): if not current: current.append({}) @@ -270,3 +278,12 @@ def remove_empty_id_arrays(self, data: Any) -> Any: """ return remove_dicts_without_id(data) + + def map_codelist_value(self, keys, schema, codelists, value): + path = "/" + "/".join(keys) + if codelist := schema.get(path, {}).get("codelist"): + codelist = codelists.get_mapping_for_codelist(codelist) + if codelist: + if new_value := codelist.get(value): + return new_value + return value diff --git a/nightingale/mapping_template/v09/__init__.py b/nightingale/mapping_template/v09/__init__.py index 19928e8..e069705 100644 --- a/nightingale/mapping_template/v09/__init__.py +++ b/nightingale/mapping_template/v09/__init__.py @@ -118,7 +118,7 @@ def read_schema_sheet(self): schema = {} for sheet in sheets: for row in sheet.iter_rows(min_row=2, values_only=True): - _, path, title, description, type, range, values, links, *_ = row + _, path, title, description, type, range, values, links, codelist, *_ = row if not path: continue path = "/" + path @@ -133,6 +133,7 @@ def read_schema_sheet(self): "range": range, "values": values, "links": links, + "codelist": codelist, } return schema