From 553b430d7b88132021a1e169239a176b98f87dd8 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 00:52:37 +0300 Subject: [PATCH 01/13] updates xpath validator --- requirements.txt | 3 +- ted_sws/core/model/manifestation.py | 6 - ted_sws/core/model/transform.py | 113 +----- .../adapters/mapping_suite_repository.py | 52 ++- .../services/notice_xml_indexer.py | 27 +- .../adapters/conceptual_mapping_reader.py | 294 ---------------- .../adapters/mapping_suite_reader.py | 79 +++++ .../mapping_suite_structure_checker.py | 78 +---- .../services/conceptual_mapping_differ.py | 322 ------------------ .../conceptual_mapping_files_injection.py | 2 +- .../conceptual_mapping_generate_metadata.py | 93 ----- ...ceptual_mapping_generate_sparql_queries.py | 193 ----------- .../services/conceptual_mapping_processor.py | 1 - .../services/conceptual_mapping_reader.py | 18 - .../services/mapping_suite_reader.py | 11 + .../services/notice_eligibility.py | 4 +- .../adapters/validation_summary_runner.py | 1 - .../adapters/xpath_coverage_runner.py | 113 +++--- .../validation_summary_report.jinja2 | 1 - .../templates/xpath_coverage_report.jinja2 | 83 +---- .../services/sparql_test_suite_runner.py | 40 +-- .../conceptual_mapping_differ/conftest.py | 13 - .../test_conceptual_mapping_differ.py | 67 ---- .../test_conceptual_mapping_reader.py | 24 -- ...sor.py => test_mapping_suite_processor.py} | 27 +- .../test_mapping_suite_structure_checker.py | 22 -- .../test_sparql_test_suite_runner.py | 7 +- 27 files changed, 220 insertions(+), 1474 deletions(-) delete mode 100644 ted_sws/mapping_suite_processor/adapters/conceptual_mapping_reader.py create mode 100644 ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py delete mode 100644 ted_sws/mapping_suite_processor/services/conceptual_mapping_differ.py delete mode 100644 ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_metadata.py delete mode 100644 ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_sparql_queries.py delete mode 100644 ted_sws/mapping_suite_processor/services/conceptual_mapping_reader.py create mode 100644 ted_sws/mapping_suite_processor/services/mapping_suite_reader.py delete mode 100644 tests/e2e/mapping_suite_processor/conceptual_mapping_differ/conftest.py delete mode 100644 tests/e2e/mapping_suite_processor/conceptual_mapping_differ/test_conceptual_mapping_differ.py delete mode 100644 tests/unit/mapping_suite_processor/test_conceptual_mapping_reader.py rename tests/unit/mapping_suite_processor/{test_conceptual_mapping_processor.py => test_mapping_suite_processor.py} (52%) diff --git a/requirements.txt b/requirements.txt index 5f7811cd9..83f11f56a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,4 +25,5 @@ json2html~=1.3.0 minio~=7.1.1 certifi~=2022.12.7 shortuuid~=1.0.11 -pendulum~=2.1.2 \ No newline at end of file +pendulum~=2.1.2 +saxonche~=12.4 diff --git a/ted_sws/core/model/manifestation.py b/ted_sws/core/model/manifestation.py index 20897f5d4..cf1216a24 100644 --- a/ted_sws/core/model/manifestation.py +++ b/ted_sws/core/model/manifestation.py @@ -65,7 +65,6 @@ class ValidationManifestation(Manifestation): class XPATHCoverageSummaryResult(PropertyBaseModel): xpath_covered: Optional[int] = 0 - xpath_not_covered: Optional[int] = 0 class XPATHCoverageSummaryReport(PropertyBaseModel): @@ -154,11 +153,6 @@ class XPATHCoverageValidationResultBase(PropertyBaseModel): """ xpath_assertions: Optional[List[XPATHCoverageValidationAssertion]] = [] xpath_covered: Optional[List[str]] = [] - xpath_not_covered: Optional[List[str]] = [] - xpath_extra: Optional[List[str]] = [] - remarked_xpaths: Optional[List[str]] = [] - coverage: Optional[float] - conceptual_coverage: Optional[float] class XPATHCoverageValidationResult(XPATHCoverageValidationResultBase): diff --git a/ted_sws/core/model/transform.py b/ted_sws/core/model/transform.py index 0051d9dcf..5bce49e88 100644 --- a/ted_sws/core/model/transform.py +++ b/ted_sws/core/model/transform.py @@ -96,121 +96,11 @@ class TransformationTestData(MappingSuiteComponent): test_data: List[FileResource] -class ConceptualMappingXPATH(MappingSuiteComponent): +class MappingXPATH(MappingSuiteComponent): xpath: str form_field: Optional[str] -class ConceptualMappingDiffMetadata(MappingSuiteComponent): - """""" - branches: Optional[List[str]] - mapping_suite_ids: Optional[List[str]] - files: Optional[List[Optional[str]]] - defaults: Optional[dict] - metadata: Optional[List[dict]] - - -class ConceptualMappingDiffData(MappingSuiteComponent): - """""" - html: Optional[str] - transformed: Optional[dict] - original: Optional[dict] - - -class ConceptualMappingDiff(MappingSuiteComponent): - """""" - created_at: str = datetime.now().isoformat() - metadata: Optional[ConceptualMappingDiffMetadata] - data: Optional[ConceptualMappingDiffData] - - -class ConceptualMappingMetadataConstraints(PropertyBaseModel): - """ - This class contains Mapping Suite Conceptual Mapping Metadata Constraints Object model structure - """ - eforms_subtype: Optional[List[str]] - start_date: Optional[str] - end_date: Optional[str] - min_xsd_version: Optional[str] - max_xsd_version: Optional[str] - - -class ConceptualMappingMetadata(MappingSuiteComponent): - """ - - """ - identifier: Optional[str] - title: Optional[str] - description: Optional[str] - mapping_version: Optional[str] - epo_version: Optional[str] - base_xpath: Optional[str] - metadata_constraints: Optional[ConceptualMappingMetadataConstraints] - - -class ConceptualMappingRule(MappingSuiteComponent): - """ - - """ - standard_form_field_id: Optional[str] - standard_form_field_name: Optional[str] - eform_bt_id: Optional[str] - eform_bt_name: Optional[str] - field_xpath: Optional[List[str]] - field_xpath_condition: Optional[List[str]] - class_path: Optional[List[str]] - property_path: Optional[List[str]] - triple_fingerprint: Optional[List[str]] - fragment_fingerprint: Optional[List[str]] - - -class ConceptualMappingResource(MappingSuiteComponent): - """ - - """ - file_name: Optional[str] - - -class ConceptualMappingRMLModule(MappingSuiteComponent): - """ - - """ - file_name: Optional[str] - - -class ConceptualMappingRemark(MappingSuiteComponent): - """ - - """ - standard_form_field_id: Optional[str] - standard_form_field_name: Optional[str] - field_xpath: Optional[List[str]] - - -class ConceptualMappingControlList(MappingSuiteComponent): - """ - - """ - field_value: Optional[str] - mapping_reference: Optional[str] - super_type: Optional[str] - xml_path_fragment: Optional[str] - - -class ConceptualMapping(MappingSuiteComponent): - """ - - """ - xpaths: List[ConceptualMappingXPATH] = [] - metadata: Optional[ConceptualMappingMetadata] - rules: List[ConceptualMappingRule] = [] - mapping_remarks: List[ConceptualMappingRemark] = [] - resources: List[ConceptualMappingResource] = [] - rml_modules: List[ConceptualMappingRMLModule] = [] - cl1_roles: List[ConceptualMappingControlList] = [] - cl2_organisations: List[ConceptualMappingControlList] = [] - - class MappingSuiteType(str, Enum): STANDARD_FORMS = "standard_forms" ELECTRONIC_FORMS = "eforms" @@ -236,7 +126,6 @@ class MappingSuite(MappingSuiteComponent): shacl_test_suites: List[SHACLTestSuite] sparql_test_suites: List[SPARQLTestSuite] transformation_test_data: TransformationTestData - conceptual_mapping: Optional[ConceptualMapping] def get_mongodb_id(self) -> str: return f"{self.identifier}_v{self.version}" diff --git a/ted_sws/data_manager/adapters/mapping_suite_repository.py b/ted_sws/data_manager/adapters/mapping_suite_repository.py index faffaa6a2..284f01ee2 100644 --- a/ted_sws/data_manager/adapters/mapping_suite_repository.py +++ b/ted_sws/data_manager/adapters/mapping_suite_repository.py @@ -9,11 +9,10 @@ from ted_sws import config from ted_sws.core.model.transform import MappingSuite, FileResource, TransformationRuleSet, SHACLTestSuite, \ - SPARQLTestSuite, MetadataConstraints, TransformationTestData, ConceptualMapping, MappingSuiteType, \ + SPARQLTestSuite, MetadataConstraints, TransformationTestData, MappingSuiteType, \ MetadataConstraintsStandardForm, MetadataConstraintsEform from ted_sws.data_manager.adapters import inject_date_string_fields, remove_date_string_fields from ted_sws.data_manager.adapters.repository_abc import MappingSuiteRepositoryABC -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader MS_METADATA_FILE_NAME = "metadata.json" MS_TRANSFORM_FOLDER_NAME = "transformation" @@ -184,11 +183,6 @@ def _read_sparql_test_suites(self, package_path: pathlib.Path) -> List[SPARQLTes sparql_tests=self._read_file_resources(path=sparql_test_suite_path)) for sparql_test_suite_path in sparql_test_suite_paths] - @classmethod - def _read_conceptual_mapping(cls, package_path: pathlib.Path) -> ConceptualMapping: - return ConceptualMappingReader.mapping_suite_read_conceptual_mapping( - package_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME) - def _write_package_metadata(self, mapping_suite: MappingSuite): """ This method creates the metadata of a package based on the metadata in the mapping_suite. @@ -354,27 +348,31 @@ def _read_mapping_suite_package(self, mapping_suite_identifier: str) -> Optional package_metadata = self._read_package_metadata(package_path) if MS_MAPPING_TYPE_KEY in package_metadata and package_metadata[ MS_MAPPING_TYPE_KEY] == MappingSuiteType.ELECTRONIC_FORMS: - package_metadata[MS_METADATA_CONSTRAINTS_KEY] = MetadataConstraints(constraints=MetadataConstraintsEform( - **package_metadata[MS_METADATA_CONSTRAINTS_KEY][MS_CONSTRAINTS_KEY])) + package_metadata[MS_METADATA_CONSTRAINTS_KEY] = MetadataConstraints( + constraints=MetadataConstraintsEform( + **package_metadata[MS_METADATA_CONSTRAINTS_KEY][MS_CONSTRAINTS_KEY])) else: - package_metadata[MS_METADATA_CONSTRAINTS_KEY] = MetadataConstraints(constraints=MetadataConstraintsStandardForm( - **package_metadata[MS_METADATA_CONSTRAINTS_KEY][MS_CONSTRAINTS_KEY])) - mapping_suite = MappingSuite(metadata_constraints=package_metadata[MS_METADATA_CONSTRAINTS_KEY], - created_at=package_metadata[MS_CREATED_AT_KEY], - title=package_metadata[MS_TITLE_KEY], - ontology_version=package_metadata[MS_ONTOLOGY_VERSION_KEY], - mapping_suite_hash_digest=package_metadata[MS_HASH_DIGEST_KEY], - mapping_type=package_metadata[MS_MAPPING_TYPE_KEY] if MS_MAPPING_TYPE_KEY in package_metadata else MappingSuiteType.STANDARD_FORMS, - version=package_metadata[ - MS_STANDARD_METADATA_VERSION_KEY] if MS_STANDARD_METADATA_VERSION_KEY in package_metadata else \ - package_metadata[MS_EFORMS_METADATA_VERSION_KEY], - identifier=package_metadata[ - MS_METADATA_IDENTIFIER_KEY] if MS_METADATA_IDENTIFIER_KEY in package_metadata else mapping_suite_identifier, - transformation_rule_set=self._read_transformation_rule_set(package_path), - shacl_test_suites=self._read_shacl_test_suites(package_path), - sparql_test_suites=self._read_sparql_test_suites(package_path), - transformation_test_data=self._read_test_data_package(package_path), - conceptual_mapping=self._read_conceptual_mapping(package_path)) #TODO remove conceptual_mapping value assignment when conceptual mapping reader is removed + package_metadata[MS_METADATA_CONSTRAINTS_KEY] = MetadataConstraints( + constraints=MetadataConstraintsStandardForm( + **package_metadata[MS_METADATA_CONSTRAINTS_KEY][MS_CONSTRAINTS_KEY])) + mapping_suite = MappingSuite( + metadata_constraints=package_metadata[MS_METADATA_CONSTRAINTS_KEY], + created_at=package_metadata[MS_CREATED_AT_KEY], + title=package_metadata[MS_TITLE_KEY], + ontology_version=package_metadata[MS_ONTOLOGY_VERSION_KEY], + mapping_suite_hash_digest=package_metadata[MS_HASH_DIGEST_KEY], + mapping_type=package_metadata[ + MS_MAPPING_TYPE_KEY] if MS_MAPPING_TYPE_KEY in package_metadata else MappingSuiteType.STANDARD_FORMS, + version=package_metadata[ + MS_STANDARD_METADATA_VERSION_KEY] if MS_STANDARD_METADATA_VERSION_KEY in package_metadata else \ + package_metadata[MS_EFORMS_METADATA_VERSION_KEY], + identifier=package_metadata[ + MS_METADATA_IDENTIFIER_KEY] if MS_METADATA_IDENTIFIER_KEY in package_metadata else mapping_suite_identifier, + transformation_rule_set=self._read_transformation_rule_set(package_path), + shacl_test_suites=self._read_shacl_test_suites(package_path), + sparql_test_suites=self._read_sparql_test_suites(package_path), + transformation_test_data=self._read_test_data_package(package_path) + ) return mapping_suite return None diff --git a/ted_sws/data_sampler/services/notice_xml_indexer.py b/ted_sws/data_sampler/services/notice_xml_indexer.py index 8adf31100..31bed854e 100644 --- a/ted_sws/data_sampler/services/notice_xml_indexer.py +++ b/ted_sws/data_sampler/services/notice_xml_indexer.py @@ -11,7 +11,6 @@ from ted_sws.core.model.metadata import XMLMetadata from ted_sws.core.model.notice import Notice from ted_sws.data_manager.adapters.notice_repository import NoticeRepository -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader from ted_sws.resources import XSLT_FILES_PATH UNIQUE_XPATHS_XSLT_FILE_PATH = "get_unique_xpaths.xsl" @@ -109,20 +108,9 @@ def index_eforms_notice(notice: Notice) -> Notice: return notice -def index_notice(notice: Notice, base_xpath="") -> Notice: - # To be removed later if will not be used - # def _notice_namespaces(xml_file) -> dict: - # _namespaces = dict([node for _, node in XMLElementTree.iterparse(xml_file, events=['start-ns'])]) - # return {v: k for k, v in _namespaces.items()} - +def index_notice(notice: Notice) -> Notice: def _ns_tag(ns_tag): tag = ns_tag[1] - # Use just the tag, ignoring the namespace - # ns = ns_tag[0] - # if ns: - # ns_alias = namespaces[ns] - # if ns_alias: - # return ns_alias + ":" + tag return tag def _xpath_generator(xml_file): @@ -135,20 +123,17 @@ def _xpath_generator(xml_file): xpath = "/" + '/'.join(path) - if xpath.startswith(ConceptualMappingReader.base_xpath_as_prefix(base_xpath)): - attributes = list(el.attrib.keys()) - if len(attributes) > 0: - for attr in attributes: - yield xpath + "/@" + attr - yield xpath + attributes = list(el.attrib.keys()) + if len(attributes) > 0: + for attr in attributes: + yield xpath + "/@" + attr + yield xpath else: path.pop() with tempfile.NamedTemporaryFile() as fp: fp.write(notice.xml_manifestation.object_data.encode("utf-8")) - # Not used for the moment (to be removed in the future if feature is not wanted back) - # namespaces = _notice_namespaces(fp.name) xpaths = list(set(_xpath_generator(fp.name))) xml_metadata = XMLMetadata() xml_metadata.unique_xpaths = xpaths diff --git a/ted_sws/mapping_suite_processor/adapters/conceptual_mapping_reader.py b/ted_sws/mapping_suite_processor/adapters/conceptual_mapping_reader.py deleted file mode 100644 index 62e2f5625..000000000 --- a/ted_sws/mapping_suite_processor/adapters/conceptual_mapping_reader.py +++ /dev/null @@ -1,294 +0,0 @@ -from pathlib import Path -from typing import Dict, Union, List - -import numpy as np -import pandas as pd - -from ted_sws.core.model.transform import ConceptualMapping, ConceptualMappingXPATH, ConceptualMappingMetadata, \ - ConceptualMappingResource, ConceptualMappingMetadataConstraints, ConceptualMappingRule, \ - ConceptualMappingRMLModule, ConceptualMappingRemark, ConceptualMappingControlList -from ted_sws.mapping_suite_processor import CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME, \ - CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, RULES_FIELD_XPATH, RULES_SF_FIELD_ID, RULES_SF_FIELD_NAME, \ - CONCEPTUAL_MAPPINGS_RESOURCES_SHEET_NAME, CONCEPTUAL_MAPPINGS_RML_MODULES_SHEET_NAME, RULES_E_FORM_BT_ID, \ - RULES_E_FORM_BT_NAME, RULES_FIELD_XPATH_CONDITION, CONCEPTUAL_MAPPINGS_REMARKS_SHEET_NAME, \ - CONCEPTUAL_MAPPINGS_CL2_ORGANISATIONS_SHEET_NAME, CONCEPTUAL_MAPPINGS_CL1_ROLES_SHEET_NAME, CL_MAPPING_REFERENCE, \ - CL_SUPERTYPE, CL_FIELD_VALUE, CL_XML_PATH_FRAGMENT -from ted_sws.notice_validator import BASE_XPATH_FIELD - -# This set of constants refers to fields in the Conceptual Mapping file -VERSION_FIELD = 'Mapping Version' -EPO_VERSION_FIELD = 'EPO version' -DESCRIPTION_FIELD = "Description" -TITLE_FIELD = 'Title' -IDENTIFIER_FIELD = 'Identifier' -E_FORMS_SUBTYPE_FIELD = "eForms Subtype" -START_DATE_FIELD = "Start Date" -END_DATE_FIELD = "End Date" -MIN_XSD_VERSION_FIELD = "Min XSD Version" -MAX_XSD_VERSION_FIELD = "Max XSD Version" - -RULES_CLASS_PATH = 'Class path (M)' -RULES_PROPERTY_PATH = 'Property path (M)' - -FILE_NAME_KEY = "File name" -REF_INTEGRATION_TESTS_KEY = "Reference to Integration Tests (O)" - - -class ConceptualMappingReader: - """ - This adapter can be used to read a MappingSuite Conceptual Mapping - """ - - @classmethod - def base_xpath_as_prefix(cls, base_xpath: str) -> str: - return base_xpath + ("/" if not base_xpath.endswith("/") else "") - - @classmethod - def xpath_with_base(cls, xpath: str, base_xpath: str = "") -> str: - # xpath is absolute - if xpath.startswith("/"): - return xpath - base_xpath = cls.base_xpath_as_prefix(base_xpath) if xpath else base_xpath - return base_xpath + xpath - - @classmethod - def _read_pd_value(cls, value, default=""): - if pd.isna(value): - return default - return value - - @classmethod - def read_list_from_pd_value(cls, value) -> list: - if value and pd.notna(value): - return [x.strip() for x in str(value).split(',')] - return [] - - @classmethod - def _read_list_from_pd_multiline_value(cls, value: str) -> list: - if value and pd.notna(value): - return [x.strip() for x in str(value).split('\n')] - return [] - - @classmethod - def _df_to_dict(cls, df: pd.DataFrame, key: str) -> dict: - return df.copy().set_index(key).T.to_dict('list') - - @classmethod - def _df_to_list(cls, df: pd.DataFrame) -> list: - return df.copy().tolist() - - @classmethod - def mapping_suite_read_metadata(cls, conceptual_mappings_file_path: Path) -> Dict: - """ - This feature allows you to read the conceptual mapping metadata. - :param conceptual_mappings_file_path: - :return: - """ - with open(conceptual_mappings_file_path, 'rb') as excel_file: - metadata_df = pd.read_excel(excel_file, sheet_name=CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME) - metadata = cls._df_to_dict(metadata_df, 'Field') - - return metadata - - @classmethod - def _read_conceptual_mapping_metadata(cls, df: pd.DataFrame) -> ConceptualMappingMetadata: - """ - :param df: - :return: - """ - - raw_metadata = cls._df_to_dict(df, 'Field') - - metadata: ConceptualMappingMetadata = ConceptualMappingMetadata() - - metadata.identifier = cls._read_pd_value(raw_metadata[IDENTIFIER_FIELD][0]) - metadata.title = cls._read_pd_value(raw_metadata[TITLE_FIELD][0]) - metadata.description = cls._read_pd_value(raw_metadata[DESCRIPTION_FIELD][0]) - metadata.mapping_version = cls._read_pd_value(raw_metadata[VERSION_FIELD][0]) - metadata.epo_version = cls._read_pd_value(raw_metadata[EPO_VERSION_FIELD][0]) - metadata.base_xpath = cls._read_pd_value(raw_metadata[BASE_XPATH_FIELD][0]) - - metadata_constraints: ConceptualMappingMetadataConstraints = ConceptualMappingMetadataConstraints() - metadata_constraints.eforms_subtype = cls.read_list_from_pd_value(raw_metadata[E_FORMS_SUBTYPE_FIELD][0]) - metadata_constraints.start_date = str(cls._read_pd_value(raw_metadata[START_DATE_FIELD][0])) - metadata_constraints.end_date = str(cls._read_pd_value(raw_metadata[END_DATE_FIELD][0])) - metadata_constraints.min_xsd_version = cls._read_pd_value(raw_metadata[MIN_XSD_VERSION_FIELD][0]) - metadata_constraints.max_xsd_version = cls._read_pd_value(raw_metadata[MAX_XSD_VERSION_FIELD][0]) - metadata.metadata_constraints = metadata_constraints - - return metadata - - @classmethod - def _read_conceptual_mapping_rules(cls, df: pd.DataFrame) -> List[ConceptualMappingRule]: - """ - - :param df: - :return: - """ - - df.columns = df.iloc[0] - rules_df = df[1:].copy() - rules_df[RULES_SF_FIELD_ID].ffill(axis="index", inplace=True) - rules_df[RULES_SF_FIELD_NAME].ffill(axis="index", inplace=True) - - rules = [] - rule: ConceptualMappingRule - for idx, row in rules_df.iterrows(): - rule = ConceptualMappingRule() - rule.standard_form_field_id = cls._read_pd_value(row[RULES_SF_FIELD_ID]) - rule.standard_form_field_name = cls._read_pd_value(row[RULES_SF_FIELD_NAME]) - rule.eform_bt_id = cls._read_pd_value(row[RULES_E_FORM_BT_ID]) - rule.eform_bt_name = cls._read_pd_value(row[RULES_E_FORM_BT_NAME]) - rule.field_xpath = cls._read_list_from_pd_multiline_value(row[RULES_FIELD_XPATH]) - rule.field_xpath_condition = cls._read_list_from_pd_multiline_value(row[RULES_FIELD_XPATH_CONDITION]) - rule.class_path = cls._read_list_from_pd_multiline_value(row[RULES_CLASS_PATH]) - rule.property_path = cls._read_list_from_pd_multiline_value(row[RULES_PROPERTY_PATH]) - rules.append(rule) - return rules - - @classmethod - def _read_conceptual_mapping_remarks(cls, df: pd.DataFrame, base_xpath: str) -> List[ConceptualMappingRemark]: - """ - - :param df: - :return: - """ - - remarks_df = df[0:].copy() - remarks = [] - remark: ConceptualMappingRemark - for idx, row in remarks_df.iterrows(): - remark = ConceptualMappingRemark() - remark.standard_form_field_id = cls._read_pd_value(row[RULES_SF_FIELD_ID]) - remark.standard_form_field_name = cls._read_pd_value(row[RULES_SF_FIELD_NAME]) - remarked_xpaths = cls._read_list_from_pd_multiline_value(row[RULES_FIELD_XPATH]) - if remarked_xpaths: - remark.field_xpath = [] - for remarked_xpath in remarked_xpaths: - remark.field_xpath.append(cls.xpath_with_base(remarked_xpath, base_xpath)) - remarks.append(remark) - return remarks - - @classmethod - def _read_conceptual_mapping_resources(cls, df: pd.DataFrame) -> List[ConceptualMappingResource]: - """ - - :param df: - :return: - """ - - resources = [] - resource: ConceptualMappingResource - for value in list(df[FILE_NAME_KEY].values): - resource = ConceptualMappingResource() - resource.file_name = cls._read_pd_value(value) - resources.append(resource) - return resources - - @classmethod - def _read_conceptual_mapping_rml_modules(cls, df: pd.DataFrame) -> List[ConceptualMappingRMLModule]: - """ - - :param df: - :return: - """ - - rml_modules = [] - rml_module: ConceptualMappingRMLModule - for value in list(df[FILE_NAME_KEY].values): - rml_module = ConceptualMappingRMLModule() - rml_module.file_name = cls._read_pd_value(value) - rml_modules.append(rml_module) - return rml_modules - - @classmethod - def _read_conceptual_mapping_control_list(cls, df: pd.DataFrame) -> List[ConceptualMappingControlList]: - """ - - :param df: - :return: - """ - - df.columns = df.iloc[0] - control_list_df = df[1:].copy() - - control_list = [] - item: ConceptualMappingControlList - for idx, row in control_list_df.iterrows(): - item = ConceptualMappingControlList() - item.field_value = cls._read_pd_value(row[CL_FIELD_VALUE]) - item.mapping_reference = cls._read_pd_value(row[CL_MAPPING_REFERENCE]) - item.super_type = cls._read_pd_value(row[CL_SUPERTYPE]) - item.xml_path_fragment = cls._read_pd_value(row[CL_XML_PATH_FRAGMENT]) - control_list.append(item) - return control_list - - @classmethod - def _read_conceptual_mapping_xpaths(cls, rules_df: pd.DataFrame, base_xpath: str) -> List[ConceptualMappingXPATH]: - """ - - :param rules_df: - :param base_xpath: - :return: - """ - - xpaths = [] - rules_df[RULES_SF_FIELD_ID].ffill(axis="index", inplace=True) - rules_df[RULES_SF_FIELD_NAME].ffill(axis="index", inplace=True) - df_xpaths = cls._df_to_list(rules_df[RULES_FIELD_XPATH]) - df_sform_field_names = rules_df[RULES_SF_FIELD_NAME].tolist() - df_sform_field_ids = rules_df[RULES_SF_FIELD_ID].tolist() - processed_xpaths = set() - for idx, xpath_row in enumerate(df_xpaths): - if xpath_row is not np.nan: - row_xpaths = xpath_row.split('\n') - for xpath in row_xpaths: - if xpath: - xpath = cls.xpath_with_base(xpath, base_xpath) - if xpath not in processed_xpaths: - form_fields = [df_sform_field_ids[idx], df_sform_field_names[idx]] - cm_xpath: ConceptualMappingXPATH = ConceptualMappingXPATH( - xpath=xpath, - form_field=" - ".join([item for item in form_fields if not pd.isnull(item)]) - ) - xpaths.append(cm_xpath) - processed_xpaths.add(xpath) - - return xpaths - - @classmethod - def mapping_suite_read_conceptual_mapping(cls, conceptual_mappings_file_path: Path) -> \ - Union[ConceptualMapping, None]: - """ - This feature allows you to read the conceptual mapping in a package. - :param conceptual_mappings_file_path: - :return: - """ - - if not conceptual_mappings_file_path.exists(): - return None - - conceptual_mapping: ConceptualMapping = ConceptualMapping() - - with open(conceptual_mappings_file_path, 'rb') as excel_file: - dfs = pd.read_excel(excel_file, sheet_name=None) - - metadata = cls._read_conceptual_mapping_metadata(dfs[CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME]) - conceptual_mapping.metadata = metadata - conceptual_mapping.rules = cls._read_conceptual_mapping_rules(dfs[CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME]) - conceptual_mapping.mapping_remarks = cls._read_conceptual_mapping_remarks( - dfs[CONCEPTUAL_MAPPINGS_REMARKS_SHEET_NAME], base_xpath=metadata.base_xpath) - conceptual_mapping.resources = cls._read_conceptual_mapping_resources( - dfs[CONCEPTUAL_MAPPINGS_RESOURCES_SHEET_NAME]) - conceptual_mapping.rml_modules = cls._read_conceptual_mapping_rml_modules( - dfs[CONCEPTUAL_MAPPINGS_RML_MODULES_SHEET_NAME]) - conceptual_mapping.cl1_roles = cls._read_conceptual_mapping_control_list( - dfs[CONCEPTUAL_MAPPINGS_CL1_ROLES_SHEET_NAME]) - conceptual_mapping.cl2_organisations = cls._read_conceptual_mapping_control_list( - dfs[CONCEPTUAL_MAPPINGS_CL2_ORGANISATIONS_SHEET_NAME]) - conceptual_mapping.xpaths = cls._read_conceptual_mapping_xpaths( - rules_df=dfs[CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME][1:].copy(), - base_xpath=metadata.base_xpath - ) - - return conceptual_mapping diff --git a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py new file mode 100644 index 000000000..c635cee6e --- /dev/null +++ b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py @@ -0,0 +1,79 @@ +import json +from pathlib import Path +from typing import Dict, List, Tuple + +from ted_sws.core.model.transform import MappingXPATH, MappingSuite +from ted_sws.data_manager.adapters.mapping_suite_repository import MS_METADATA_FILE_NAME, MS_VALIDATE_FOLDER_NAME, \ + MS_SPARQL_FOLDER_NAME + +# This set of constants refers to fields in the Conceptual Mapping file +VERSION_FIELD = 'Mapping Version' + +CONCEPTUAL_MAPPINGS_ASSERTIONS = "cm_assertions" + +SPARQL_QUERY_METADATA_TITLE = "title" +SPARQL_QUERY_METADATA_DESCRIPTION = "description" +SPARQL_QUERY_METADATA_XPATH = "xpath" + + +class MappingSuiteReader: + """ + This adapter can be used to read different MappingSuite data + """ + + @classmethod + def mapping_suite_read_metadata(cls, mapping_suite_path: Path) -> Dict: + """ + This feature allows you to read the conceptual mapping metadata. + :param mapping_suite_path: + :return: + """ + with open(mapping_suite_path / MS_METADATA_FILE_NAME) as metadata_file: + metadata = json.load(metadata_file) + + return metadata + + @classmethod + def extract_metadata_from_sparql_query(cls, content) -> dict: + """ + Extracts a dictionary of metadata from a SPARQL query + """ + + def _process_line(line) -> Tuple[str, str]: + if ":" in line: + key_part, value_part = line.split(":", 1) + key_part = key_part.replace("#", "").strip() + value_part = value_part.strip() + return key_part, value_part + + content_lines_with_comments = filter(lambda x: x.strip().startswith("#"), content.splitlines()) + return dict([_process_line(line) for line in content_lines_with_comments]) + + @classmethod + def read_mapping_suite_xpaths(cls, mapping_suite: MappingSuite) -> List[MappingXPATH]: + """ + + :param mapping_suite: + :return: + """ + + xpaths = [] + processed_xpaths = set() + + for test_suite in mapping_suite.sparql_test_suites: + if test_suite != CONCEPTUAL_MAPPINGS_ASSERTIONS: + continue + + for sparql_test in test_suite.sparql_tests: + metadata = cls.extract_metadata_from_sparql_query(sparql_test.file_content) + xpath = metadata[SPARQL_QUERY_METADATA_XPATH] + if xpath not in processed_xpaths: + cm_xpath: MappingXPATH = MappingXPATH( + xpath=xpath, + form_field=metadata[SPARQL_QUERY_METADATA_TITLE] + ) + xpaths.append(cm_xpath) + processed_xpaths.add(xpath) + break + + return xpaths diff --git a/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py b/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py index 63af85d78..d2dfb6580 100644 --- a/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py +++ b/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py @@ -9,9 +9,8 @@ from ted_sws.event_manager.model.event_message import EventMessage, EventMessageLogSettings from ted_sws.event_manager.services.logger_from_context import get_console_logger from ted_sws.mapping_suite_processor.adapters.mapping_suite_hasher import MappingSuiteHasher -from ted_sws.mapping_suite_processor.services.conceptual_mapping_generate_metadata import VERSION_FIELD, \ +from ted_sws.mapping_suite_processor.services.mapping_suite_reader import mapping_suite_read_metadata, \ MAPPING_SUITE_HASH, VERSION_KEY -from ted_sws.mapping_suite_processor.services.conceptual_mapping_reader import mapping_suite_read_metadata SHACL_KEYWORD = "shacl" SPARQL_KEYWORD = "sparql" @@ -127,48 +126,11 @@ def validate_output_structure(self) -> bool: return success - def check_metadata_consistency(self, package_metadata_path=None) -> bool: - - """ - Read the conceptual mapping XSLX and the metadata.json and compare the contents, - in particular paying attention to the mapping suite version and the ontology version. - """ - self.logger.info( - event_message=EventMessage( - message="Read the conceptual mapping XSLX and the metadata.json and compare the contents."), - settings=self.log_settings) - success = True - - conceptual_mappings_document = mapping_suite_read_metadata( - conceptual_mappings_file_path=self.mapping_suite_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME) - conceptual_mappings_version = [val for val in conceptual_mappings_document.values()][4][0] - conceptual_mappings_epo_version = [val for val in conceptual_mappings_document.values()][5][0] - - if package_metadata_path is None: - package_metadata_path = self.mapping_suite_path / MS_METADATA_FILE_NAME - package_metadata_content = package_metadata_path.read_text(encoding="utf-8") - package_metadata = json.loads(package_metadata_content) - package_metadata['metadata_constraints'] = MetadataConstraints(**package_metadata['metadata_constraints']) - metadata_version = [val for val in package_metadata.values()][3] - metadata_epo_version = [val for val in package_metadata.values()][4] - - if not ( - conceptual_mappings_version >= metadata_version - and conceptual_mappings_epo_version >= metadata_epo_version - ): - event_message = EventMessage( - message=f'Not the same value between metadata.json [version {metadata_version}, epo_version {metadata_epo_version}] and conceptual_mapping_file [version {conceptual_mappings_version}, epo_version {conceptual_mappings_epo_version}]') - self.logger.error(event_message=event_message, settings=self.log_settings) - success = False - - return success - def check_for_changes_by_version(self) -> bool: """ This function check whether the mapping suite is well versioned and no changes detected. We want to ensure that: - - the version in the metadata.json is the same as the version in the conceptual mappings - the version in always incremented - the changes in the mapping suite are detected by comparison to the hash in the metadata.json - the hash is bound to a version of the mapping suite written in the conceptual mappings @@ -181,23 +143,21 @@ def check_for_changes_by_version(self) -> bool: settings=self.log_settings) success = True - conceptual_mapping_metadata = mapping_suite_read_metadata( - conceptual_mappings_file_path=self.mapping_suite_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME) - - metadata_json = json.loads((self.mapping_suite_path / MS_METADATA_FILE_NAME).read_text()) + metadata = mapping_suite_read_metadata(mapping_suite_path=self.mapping_suite_path) - version_in_cm = conceptual_mapping_metadata[VERSION_FIELD][0] + version = metadata.get(VERSION_KEY) mapping_suite_versioned_hash = MappingSuiteHasher(self.mapping_suite_path).hash_mapping_suite( - with_version=version_in_cm) - - if mapping_suite_versioned_hash != metadata_json.get(MAPPING_SUITE_HASH): - self.logger.error(event_message=EventMessage( - message=f'The Mapping Suite hash digest ({mapping_suite_versioned_hash}) and the Version from the ' - f'Conceptual Mappings ({version_in_cm}) ' - f'does not correspond to the ones in the metadata.json file ' - f'({metadata_json.get(MAPPING_SUITE_HASH)}, {metadata_json.get(VERSION_KEY)}). ' - f'Consider increasing the version and regenerating the metadata.json'), - settings=self.log_settings) + with_version=version) + + if mapping_suite_versioned_hash != metadata.get(MAPPING_SUITE_HASH): + self.logger.error( + event_message=EventMessage( + message=f'The Mapping Suite hash digest ({mapping_suite_versioned_hash}) ' + f'does not correspond to the one in the metadata.json file ' + f'({metadata.get(MAPPING_SUITE_HASH)}.' + ), + settings=self.log_settings + ) success = False return success @@ -206,12 +166,10 @@ def is_valid(self) -> bool: validate_core_structure: bool = self.validate_core_structure() validate_expanded_structure: bool = self.validate_expanded_structure() validate_output_structure: bool = self.validate_output_structure() - check_metadata_consistency: bool = self.check_metadata_consistency() check_for_changes_by_version: bool = self.check_for_changes_by_version() return \ - validate_core_structure \ - and validate_expanded_structure \ - and validate_output_structure \ - and check_metadata_consistency \ - and check_for_changes_by_version + validate_core_structure \ + and validate_expanded_structure \ + and validate_output_structure \ + and check_for_changes_by_version diff --git a/ted_sws/mapping_suite_processor/services/conceptual_mapping_differ.py b/ted_sws/mapping_suite_processor/services/conceptual_mapping_differ.py deleted file mode 100644 index 3fc5b0be2..000000000 --- a/ted_sws/mapping_suite_processor/services/conceptual_mapping_differ.py +++ /dev/null @@ -1,322 +0,0 @@ -import tempfile -from pathlib import Path -from typing import List -from urllib.request import urlopen - -from deepdiff import DeepDiff -from jinja2 import Environment, PackageLoader -from json2html import json2html -from pydantic.utils import deep_update - -from ted_sws import config -from ted_sws.core.model.transform import ConceptualMapping, ConceptualMappingRule, ConceptualMappingRemark, \ - ConceptualMappingResource, ConceptualMappingRMLModule -from ted_sws.core.model.transform import ConceptualMappingDiff, ConceptualMappingDiffMetadata, ConceptualMappingDiffData -from ted_sws.data_manager.adapters.mapping_suite_repository import MS_TRANSFORM_FOLDER_NAME, \ - MS_CONCEPTUAL_MAPPING_FILE_NAME -from ted_sws.mapping_suite_processor.services.conceptual_mapping_reader import mapping_suite_read_conceptual_mapping - -TEMPLATES = Environment(loader=PackageLoader("ted_sws.mapping_suite_processor.resources", "templates")) -CONCEPTUAL_MAPPINGS_DIFF_HTML_REPORT_TEMPLATE = "conceptual_mappings_diff_report.jinja2" - -GITHUB_CONCEPTUAL_MAPPINGS_PATH = "{GITHUB_BASE}/raw/{GIT_BRANCH}/mappings/{MAPPING_SUITE_ID}/" + \ - MS_TRANSFORM_FOLDER_NAME + "/" + MS_CONCEPTUAL_MAPPING_FILE_NAME - -DEFAULT_REPORT_FILE_NAME = "cm_diff" -DIFF_VALUE_CONTEXT_KEY = "__CONTEXT__" - -DIFF_METADATA_TAB = "metadata" -DIFF_RULES_TAB = "rules" -DIFF_MAPPING_REMARKS_TAB = "mapping_remarks" -DIFF_RESOURCES_TAB = "resources" -DIFF_RML_MODULES_TAB = "rml_modules" -DIFF_CL1_ROLES_TAB = "cl1_roles" -DIFF_CL2_ORGANISATIONS_TAB = "cl2_organisations" - - -class ConceptualMappingDiffDataTransformer: - data: dict - tabs: dict = { - DIFF_METADATA_TAB: {}, - DIFF_RULES_TAB: {}, - DIFF_MAPPING_REMARKS_TAB: {}, - DIFF_RESOURCES_TAB: {}, - DIFF_RML_MODULES_TAB: {}, - DIFF_CL1_ROLES_TAB: {}, - DIFF_CL2_ORGANISATIONS_TAB: {} - } - labels: dict - mapping1: ConceptualMapping - mapping2: ConceptualMapping - context_mapping: ConceptualMapping - - item_key_flattenizer: str = "|" - - def __init__(self, data, mapping1: ConceptualMapping, mapping2: ConceptualMapping): - self.data = data - self.mapping1 = mapping1 - self.mapping2 = mapping2 - self.context_mapping = self.mapping2 - self.init_labels() - self.init_tabs() - self.process_tabs_data() - - @classmethod - def init_labels(cls): - cls.labels = { - "tabs": { - DIFF_METADATA_TAB: "Metadata", - DIFF_RULES_TAB: "Rules", - DIFF_MAPPING_REMARKS_TAB: "Remarks", - DIFF_RESOURCES_TAB: "Resources", - DIFF_RML_MODULES_TAB: "RML Modules", - DIFF_CL1_ROLES_TAB: "CL1 Roles", - DIFF_CL2_ORGANISATIONS_TAB: "CL2 Organisations" - }, - "actions": { - "set_item_added": "Set Added", - "set_item_removed": "Set Removed", - "iterable_item_removed": "Removed", - "iterable_item_added": "Added", - "iterable_item_moved": "Moved", - "values_changed": "Changed" - }, - "fields": { - "identifier": "Identifier", - "title": "Title", - "description": "Description", - "mapping_version": "Mapping Version", - "epo_version": "EPO version", - "base_xpath": "Base XPath", - "metadata_constraints": "Metadata constraints", - "eforms_subtype": "eForms Subtype", - "start_date": "Start Date", - "end_date": "End Date", - "min_xsd_version": "Min XSD Version", - "max_xsd_version": "Max XSD Version", - "standard_form_field_id": "Standard Form Field ID (M)", - "standard_form_field_name": "Standard Form Field Name (M)", - "eform_bt_id": "eForm BT-ID (O)", - "eform_bt_name": "eForm BT Name (O)", - "field_xpath": "Field XPath (M)", - "field_xpath_condition": "Field XPath condition (M)", - "class_path": "Class path (M)", - "property_path": "Property path (M)", - "triple_fingerprint": "Triple Fingerprint", - "fragment_fingerprint": "Fragment Fingerprint", - "file_name": "File name", - "old_value": "Old value", - "new_value": "New value", - "field_value": "Field Value (in XML)", - "mapping_reference": "Mapping Reference (in ePO)", - "super_type": "SuperType", - "xml_path_fragment": "XML PATH Fragment" - } - } - - def init_tabs(self): - for action in self.data: - action_items = self.unflatten(self.data[action]) - for tab in action_items: - if tab not in self.tabs: - continue - if action not in self.tabs[tab]: - self.tabs[tab][action] = {} - self.tabs[tab][action] = deep_update(self.tabs[tab][action], action_items[tab]) - - def process_tabs_data(self): - self.process_rules_tab() - self.process_mapping_remarks_tab() - self.process_resources_tab() - self.process_rml_modules_tab() - - def process_rules_tab(self): - cm_rules_len = len(self.context_mapping.rules) - for action in self.tabs[DIFF_RULES_TAB]: - for row_idx in self.tabs[DIFF_RULES_TAB][action]: - idx = int(row_idx) - if idx < cm_rules_len: - cm_row: ConceptualMappingRule = self.context_mapping.rules[idx] - context = [cm_row.standard_form_field_id, cm_row.standard_form_field_name] - self.tabs[DIFF_RULES_TAB][action][row_idx][DIFF_VALUE_CONTEXT_KEY] = context - - def process_mapping_remarks_tab(self): - cm_mapping_remarks_len = len(self.context_mapping.mapping_remarks) - for action in self.tabs[DIFF_MAPPING_REMARKS_TAB]: - for row_idx in self.tabs[DIFF_MAPPING_REMARKS_TAB][action]: - idx = int(row_idx) - if idx < cm_mapping_remarks_len: - cm_row: ConceptualMappingRemark = self.context_mapping.mapping_remarks[idx] - context = [cm_row.standard_form_field_id, cm_row.standard_form_field_name] - self.tabs[DIFF_MAPPING_REMARKS_TAB][action][row_idx][DIFF_VALUE_CONTEXT_KEY] = context - - def process_resources_tab(self): - cm_resources_len = len(self.context_mapping.resources) - for action in self.tabs[DIFF_RESOURCES_TAB]: - for row_idx in self.tabs[DIFF_RESOURCES_TAB][action]: - idx = int(row_idx) - if idx < cm_resources_len: - cm_row: ConceptualMappingResource = self.context_mapping.resources[idx] - context = [cm_row.file_name] - self.tabs[DIFF_RESOURCES_TAB][action][row_idx][DIFF_VALUE_CONTEXT_KEY] = context - - def process_rml_modules_tab(self): - cm_rml_modules_len = len(self.context_mapping.rml_modules) - for action in self.tabs[DIFF_RML_MODULES_TAB]: - for row_idx in self.tabs[DIFF_RML_MODULES_TAB][action]: - idx = int(row_idx) - if idx < cm_rml_modules_len: - cm_row: ConceptualMappingRMLModule = self.context_mapping.rml_modules[idx] - context = [cm_row.file_name] - self.tabs[DIFF_RML_MODULES_TAB][action][row_idx][DIFF_VALUE_CONTEXT_KEY] = context - - @classmethod - def normalize_item_key(cls, k): - return cls.item_key_flattenizer.join(k.replace("'", "").split("root[", 1)[1].rsplit("]", 1)[0].split("][")) - - @classmethod - def unflatten(cls, d): - ud = {} - for k, v in d.items(): - context = ud - k = cls.normalize_item_key(k) - for sub_key in k.split(cls.item_key_flattenizer)[:-1]: - if sub_key not in context: - context[sub_key] = {} - context = context[sub_key] - context[k.split(cls.item_key_flattenizer)[-1]] = v - return ud - - -def mapping_suite_diff_conceptual_mappings(mappings: List[ConceptualMapping]) -> dict: - """ - This service return the difference between 2 Mapping Suite's conceptual mapping objects - :param mappings: - :return: - """ - assert mappings and len(mappings) == 2 - diff: ConceptualMappingDiff = ConceptualMappingDiff() - diff.metadata = ConceptualMappingDiffMetadata( - defaults={ - "branch": "local", - "conceptual_mapping": MS_TRANSFORM_FOLDER_NAME + "/" + MS_CONCEPTUAL_MAPPING_FILE_NAME - }, - metadata=[ - mappings[0].metadata.dict(), - mappings[1].metadata.dict() - ] - ) - mapping1: dict = mappings[0].dict() - mapping2: dict = mappings[1].dict() - - diff.data = transform_conceptual_mappings_diff_data(ConceptualMappingDiffData( - original=DeepDiff(mapping1, mapping2, ignore_order=False) - ), mapping1=mappings[0], mapping2=mappings[1]) - return diff.dict() - - -def mapping_suite_diff_files_conceptual_mappings(filepaths: List[Path]) -> dict: - """ - This service return the difference between 2 Mapping Suite's conceptual mapping objects - based on their filepaths - :param filepaths: - :return: - """ - assert filepaths and len(filepaths) == 2 - assert filepaths[0].is_file() - assert filepaths[1].is_file() - return mapping_suite_diff_conceptual_mappings([ - mapping_suite_read_conceptual_mapping(filepaths[0]), - mapping_suite_read_conceptual_mapping(filepaths[1]) - ]) - - -def mapping_suite_diff_repo_conceptual_mappings(branch_or_tag_name: List[str], mapping_suite_id: List[str], - filepath: Path = None) -> dict: - """ - This service return the difference between 2 Mapping Suite's conceptual mapping objects - based on their repository branch - - 1) repo vs file - 2) repo vs repo - - :param mapping_suite_id: - :param branch_or_tag_name: - :param filepath: - :return: - """ - - assert branch_or_tag_name and len(branch_or_tag_name) > 0 - assert mapping_suite_id and len(mapping_suite_id) > 0 - - git_extension = ".git" - github_base = config.GITHUB_TED_SWS_ARTEFACTS_URL - if github_base.endswith(git_extension): - github_base = github_base[:-(len(git_extension))] - - url_resource = urlopen(GITHUB_CONCEPTUAL_MAPPINGS_PATH.format( - GITHUB_BASE=github_base, - GIT_BRANCH=branch_or_tag_name[0], - MAPPING_SUITE_ID=mapping_suite_id[0] - )) - temp_file1 = tempfile.NamedTemporaryFile() - temp_file1.write(url_resource.read()) - filepath1 = Path(temp_file1.name) - - if filepath: - assert filepath.is_file() - filepath2 = filepath - else: - if len(branch_or_tag_name) < 2: - branch_or_tag_name.append(branch_or_tag_name[0]) - - if len(mapping_suite_id) < 2: - mapping_suite_id.append(mapping_suite_id[0]) - - url_resource = urlopen(GITHUB_CONCEPTUAL_MAPPINGS_PATH.format( - GITHUB_BASE=github_base, - GIT_BRANCH=branch_or_tag_name[1], - MAPPING_SUITE_ID=mapping_suite_id[1] - )) - temp_file2 = tempfile.NamedTemporaryFile() - temp_file2.write(url_resource.read()) - filepath2 = Path(temp_file2.name) - - return mapping_suite_diff_files_conceptual_mappings([filepath1, filepath2]) - - -def transform_conceptual_mappings_diff_data(diff_data: ConceptualMappingDiffData, mapping1: ConceptualMapping, - mapping2: ConceptualMapping): - diff_transformer = ConceptualMappingDiffDataTransformer(data=diff_data.original, mapping1=mapping1, - mapping2=mapping2) - diff_data.transformed = { - "labels": diff_transformer.labels, - "tabs": diff_transformer.tabs - } - return diff_data - - -def generate_conceptual_mappings_diff_html_report(diff: ConceptualMappingDiff): - diff.data.html = json2html.convert( - json=diff.data.original, - table_attributes='class="display dataTable heading"', - clubbing=True - ) - html_report = TEMPLATES.get_template(CONCEPTUAL_MAPPINGS_DIFF_HTML_REPORT_TEMPLATE).render(diff) - return html_report - - -def generate_conceptual_mappings_diff_filename(diff: ConceptualMappingDiff, prefix: str = DEFAULT_REPORT_FILE_NAME, - ext: str = None) -> str: - filename: str = prefix - cm1_metadata: dict = diff.metadata.metadata[0] - if cm1_metadata: - filename += f"_{cm1_metadata['identifier']}_v{cm1_metadata['mapping_version']}" - cm2_metadata: dict = diff.metadata.metadata[1] - if cm2_metadata: - if cm1_metadata: - filename += "_vs" - filename += f"_{cm2_metadata['identifier']}_v{cm2_metadata['mapping_version']}" - if ext: - filename += ext - return filename diff --git a/ted_sws/mapping_suite_processor/services/conceptual_mapping_files_injection.py b/ted_sws/mapping_suite_processor/services/conceptual_mapping_files_injection.py index 1d9034aaf..234ed0968 100644 --- a/ted_sws/mapping_suite_processor/services/conceptual_mapping_files_injection.py +++ b/ted_sws/mapping_suite_processor/services/conceptual_mapping_files_injection.py @@ -5,7 +5,7 @@ from ted_sws.mapping_suite_processor import CONCEPTUAL_MAPPINGS_RESOURCES_SHEET_NAME, \ CONCEPTUAL_MAPPINGS_RML_MODULES_SHEET_NAME, CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import FILE_NAME_KEY, REF_INTEGRATION_TESTS_KEY +from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import FILE_NAME_KEY, REF_INTEGRATION_TESTS_KEY def mapping_suite_processor_inject_resources(conceptual_mappings_file_path: pathlib.Path, diff --git a/ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_metadata.py b/ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_metadata.py deleted file mode 100644 index 1705365a4..000000000 --- a/ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_metadata.py +++ /dev/null @@ -1,93 +0,0 @@ -import json -import pathlib -from datetime import datetime - -import pandas as pd - -from ted_sws.data_manager.adapters.mapping_suite_repository import MS_TRANSFORM_FOLDER_NAME, MS_METADATA_FILE_NAME, \ - MS_CONCEPTUAL_MAPPING_FILE_NAME -from ted_sws.mapping_suite_processor.adapters.mapping_suite_hasher import MappingSuiteHasher -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import IDENTIFIER_FIELD, TITLE_FIELD, \ - DESCRIPTION_FIELD, VERSION_FIELD, EPO_VERSION_FIELD, E_FORMS_SUBTYPE_FIELD, START_DATE_FIELD, END_DATE_FIELD, \ - MIN_XSD_VERSION_FIELD, MAX_XSD_VERSION_FIELD -from ted_sws.mapping_suite_processor.services.conceptual_mapping_reader import mapping_suite_read_metadata - -# This set of constants refers to keys in metadata.json corresponding to the fields Conceptual Mapping file -E_FORMS_SUBTYPE_KEY = "eforms_subtype" -START_DATE_KEY = "start_date" -END_DATE_KEY = "end_date" -MIN_XSD_VERSION_KEY = "min_xsd_version" -MAX_XSD_VERSION_KEY = "max_xsd_version" -EFORMS_SDK_VERSIONS_KEY = "eforms_sdk_versions" -TITLE_KEY = "title" -CREATED_KEY = "created_at" -IDENTIFIER_KEY = "identifier" -VERSION_KEY = "version" -DESCRIPTION_KEY = "description" -ONTOLOGY_VERSION_KEY = "ontology_version" -METADATA_CONSTRAINTS_KEY = "metadata_constraints" -CONSTRAINTS_KEY = "constraints" -MAPPING_SUITE_HASH = "mapping_suite_hash_digest" - - -def generate_metadata(raw_metadata: dict) -> dict: - """ - This feature restructures the metadata into a default format. - Metadata is formed from 2 parts: metadata for mapping suite and constraints on the mapping suite - :param raw_metadata: - :return: - """ - - def get_list_from_raw_metadata(field_key: str) -> list: - data = raw_metadata[field_key][0] - if pd.notna(data): - return [x.strip() for x in str(data).split(',')] - else: - return [] - - constraints = { - E_FORMS_SUBTYPE_KEY: get_list_from_raw_metadata(E_FORMS_SUBTYPE_FIELD), - START_DATE_KEY: get_list_from_raw_metadata(START_DATE_FIELD), - END_DATE_KEY: get_list_from_raw_metadata(END_DATE_FIELD), - MIN_XSD_VERSION_KEY: get_list_from_raw_metadata(MIN_XSD_VERSION_FIELD), - MAX_XSD_VERSION_KEY: get_list_from_raw_metadata(MAX_XSD_VERSION_FIELD)} - - metadata = {TITLE_KEY: raw_metadata[TITLE_FIELD][0], IDENTIFIER_KEY: raw_metadata[IDENTIFIER_FIELD][0], - CREATED_KEY: datetime.now().isoformat(), VERSION_KEY: raw_metadata[VERSION_FIELD][0], - ONTOLOGY_VERSION_KEY: raw_metadata[EPO_VERSION_FIELD][0], - DESCRIPTION_KEY: raw_metadata[DESCRIPTION_FIELD][0], - METADATA_CONSTRAINTS_KEY: {CONSTRAINTS_KEY: constraints}, - } - return metadata - - -def mapping_suite_processor_generate_metadata(mapping_suite_path: pathlib.Path, - output_metadata_file_path: pathlib.Path = None, - conceptual_mappings_file_path: pathlib.Path = None): - """ - This function reads metadata from conceptual_mapping_file and generates metadata for a mapping suite package. - The result is written to the output_metadata_file file. - :param mapping_suite_path: - :param output_metadata_file_path: - :param conceptual_mappings_file_path: - :return: - """ - - if output_metadata_file_path is None: - output_metadata_file_path = mapping_suite_path / MS_METADATA_FILE_NAME - - if conceptual_mappings_file_path is None: - conceptual_mappings_file_path = mapping_suite_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME - - metadata = {} - raw_metadata = mapping_suite_read_metadata(conceptual_mappings_file_path) - conceptual_mapping_metadata = generate_metadata(raw_metadata=raw_metadata) - metadata.update(conceptual_mapping_metadata) - - hashing_metadata = {MAPPING_SUITE_HASH: MappingSuiteHasher(mapping_suite_path).hash_mapping_suite( - with_version=metadata[VERSION_KEY] - )} - metadata.update(hashing_metadata) - - with open(output_metadata_file_path, 'w') as metadata_file: - metadata_file.write(json.dumps(metadata, indent=2)) diff --git a/ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_sparql_queries.py b/ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_sparql_queries.py deleted file mode 100644 index 3b2838aa3..000000000 --- a/ted_sws/mapping_suite_processor/services/conceptual_mapping_generate_sparql_queries.py +++ /dev/null @@ -1,193 +0,0 @@ -import pathlib -import re -from typing import Iterator - -import pandas as pd - -from ted_sws.event_manager.services.log import log_cli_brief_error -from ted_sws.mapping_suite_processor import CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME, \ - CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, RULES_FIELD_XPATH, RULES_E_FORM_BT_NAME, RULES_SF_FIELD_ID, \ - RULES_E_FORM_BT_ID, RULES_SF_FIELD_NAME -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader -from ted_sws.notice_validator import BASE_XPATH_FIELD -from ted_sws.resources.prefixes import PREFIXES_DEFINITIONS - -RULES_CLASS_PATH = 'Class path (M)' -RULES_PROPERTY_PATH = 'Property path (M)' - -CL_FIELD_VALUE = 'Field Value (in XML)' -CL_MAPPING_REFERENCE = 'Mapping Reference (in ePO)' -CL_SUPER_TYPE = 'SuperType' -CL_XPATH_FRAGMENT = 'XML PATH Fragment' - -DEFAULT_RQ_NAME = 'sparql_query_' - -SPARQL_PREFIX_PATTERN = re.compile('(?:\\s+|^)([\\w\\-]+)?:') -SPARQL_PREFIX_LINE = 'PREFIX {prefix}: <{value}>' -SPARQL_LOGGER_NAME = "SPARQL" - -SPARQL_XPATH_SEPARATOR = " ;; " - - -def get_sparql_prefixes(sparql_q: str) -> list: - finds: list = re.findall(SPARQL_PREFIX_PATTERN, sparql_q) - return sorted(set(finds)) - - -def concat_field_xpath(base_xpath: str, field_xpath: str, separator: str = SPARQL_XPATH_SEPARATOR) -> str: - base_xpath = base_xpath if not pd.isna(base_xpath) else '' - field_xpath = field_xpath if not pd.isna(field_xpath) else '' - return separator.join( - [ConceptualMappingReader.xpath_with_base(xpath, base_xpath) for xpath in field_xpath.splitlines()] - ) - - -def _get_elem_reference(class_value: str, cl_dfs: dict, field_xpath: list) -> str: - if '(from ' in class_value: - - # Find CL sheet - cl_id = class_value.split()[-1][:-1] - cl_sheet: pd.DataFrame() = pd.DataFrame() - for sheet_name in cl_dfs: - if sheet_name.startswith(cl_id): - cl_sheet = cl_dfs[sheet_name] - - # Find elem type - if not cl_sheet.empty: - class_value = class_value.split()[0] - for index, row in cl_sheet.iterrows(): - class_super_type = row[CL_SUPER_TYPE] - xpath_fragment = row[CL_XPATH_FRAGMENT] - for field_xpath_fragment in reversed(field_xpath): - if class_value == class_super_type and field_xpath_fragment == xpath_fragment: - return row[CL_MAPPING_REFERENCE] - else: - return class_value - - return '' - - -def _generate_subject_type(class_path: str, cl_dfs: dict, field_xpath: str) -> str: - subject_reference = _get_elem_reference(class_path.split(' / ')[0], cl_dfs, - field_xpath.split('/') if not pd.isna(field_xpath) else '') - return f"?this rdf:type {subject_reference} ." if subject_reference else '' - - -# Could be used later -# def _generate_object_type(class_path: str, cl_dfs: dict, field_xpath: str) -> str: -# """ -# This method determines SPARQL query object type base on some rules -# :param class_path: -# :param cl_dfs: -# :param field_xpath: -# :return: -# """ -# # Temporary solution (could be used in the future) -# class_path = class_path.split(' / ')[-1] -# if 'at-voc:' in class_path: -# return '' -# -# object_reference = _get_elem_reference(class_path, cl_dfs, -# field_xpath.split('/') if not pd.isna(field_xpath) else '') -# return f"?value rdf:type {object_reference} ." if object_reference else '' - - -def sparql_validation_generator(data: pd.DataFrame, base_xpath: str, controlled_list_dfs: dict, - prefixes_definitions: dict) -> Iterator[str]: - """ - This function generates SPARQL queries based on data in the dataframe. - :param prefixes_definitions: - :param data: - :param base_xpath: - :param controlled_list_dfs: - :return: - """ - - for index, row in data.iterrows(): - sf_field_id = row[RULES_SF_FIELD_ID] - sf_field_name = row[RULES_SF_FIELD_NAME] - e_form_bt_id = row[RULES_E_FORM_BT_ID] - e_form_bt_name = row[RULES_E_FORM_BT_NAME] - field_xpath = row[RULES_FIELD_XPATH] - class_path = row[RULES_CLASS_PATH] - property_path = row[RULES_PROPERTY_PATH] - - subject_type = _generate_subject_type(class_path, controlled_list_dfs, field_xpath) \ - if '?this' in property_path else '' - - prefixes_string = property_path - if subject_type: - prefixes_string += subject_type - - sparql_title_parts = [sf_field_id, sf_field_name] - sparql_title = " - ".join([item for item in sparql_title_parts if not pd.isnull(item)]) - - prefixes = [] - for prefix in get_sparql_prefixes(prefixes_string): - if prefix in prefixes_definitions: - prefix_value = prefixes_definitions.get(prefix) - else: - # the prefix value is set to "^" on purpose, to generate a syntactically incorrect SPARQL query - prefix_value = "^" - log_cli_brief_error(f"'{sf_field_id}': PREFIX '{prefix}' is not defined.", name=SPARQL_LOGGER_NAME) - - prefixes.append(SPARQL_PREFIX_LINE.format(prefix=prefix, value=prefix_value)) - - subject_type_display = ('\n\t\t' + subject_type) if subject_type else '' - yield f"#title: {sparql_title}\n" \ - f"#description: “{sparql_title}” in SF corresponds to “{e_form_bt_id} " \ - f"{e_form_bt_name}” in eForms. The corresponding XML element is " \ - f"{concat_field_xpath(base_xpath, field_xpath)}. " \ - f"The expected ontology instances are epo: {class_path} .\n" \ - f"#xpath: {concat_field_xpath(base_xpath, field_xpath, separator=SPARQL_XPATH_SEPARATOR)}" \ - "\n" + "\n" + "\n".join(prefixes) + "\n\n" \ - f"ASK WHERE {{ " \ - f"{subject_type_display}" \ - f"\n\t\t{property_path} }}" - - -def _process_concept_mapping_sheet(sheet: pd.DataFrame) -> pd.DataFrame: - sheet.columns = sheet.iloc[0] - return sheet[1:].copy() - - -def mapping_suite_processor_generate_sparql_queries(conceptual_mappings_file_path: pathlib.Path, - output_sparql_queries_folder_path: pathlib.Path, - rq_name: str = DEFAULT_RQ_NAME, - prefixes_definitions=None): - """ - This function reads data from conceptual_mappings.xlsx and generates SPARQL validation queries in - provided package. - :param prefixes_definitions: - :param conceptual_mappings_file_path: - :param output_sparql_queries_folder_path: - :param rq_name: - :return: - """ - if prefixes_definitions is None: - prefixes_definitions = PREFIXES_DEFINITIONS - - with open(conceptual_mappings_file_path, 'rb') as excel_file: - conceptual_mappings_df = pd.read_excel(excel_file, sheet_name=None) - controlled_list_dfs = {} - for sheet_name in conceptual_mappings_df: - if sheet_name.startswith('CL'): - controlled_list_dfs[sheet_name] = _process_concept_mapping_sheet(conceptual_mappings_df[sheet_name]) - conceptual_mappings_rules_df = _process_concept_mapping_sheet( - conceptual_mappings_df[CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME]) - conceptual_mappings_rules_df[RULES_SF_FIELD_ID].ffill(axis="index", inplace=True) - conceptual_mappings_rules_df[RULES_SF_FIELD_NAME].ffill(axis="index", inplace=True) - conceptual_mappings_rules_df = conceptual_mappings_rules_df[ - conceptual_mappings_rules_df[RULES_PROPERTY_PATH].notnull()] - metadata_df = conceptual_mappings_df[CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME] - metadata = metadata_df.set_index('Field').T.to_dict('list') - base_xpath = metadata[BASE_XPATH_FIELD][0] - - sparql_queries = sparql_validation_generator(conceptual_mappings_rules_df, base_xpath, controlled_list_dfs, - prefixes_definitions) - - output_sparql_queries_folder_path.mkdir(parents=True, exist_ok=True) - for index, sparql_query in enumerate(sparql_queries): - output_file_path = output_sparql_queries_folder_path / f"{rq_name}{index}.rq" - with open(output_file_path, "w", encoding="utf-8") as output_file: - output_file.write(sparql_query) diff --git a/ted_sws/mapping_suite_processor/services/conceptual_mapping_processor.py b/ted_sws/mapping_suite_processor/services/conceptual_mapping_processor.py index 120f1193a..7b17c3a75 100644 --- a/ted_sws/mapping_suite_processor/services/conceptual_mapping_processor.py +++ b/ted_sws/mapping_suite_processor/services/conceptual_mapping_processor.py @@ -17,7 +17,6 @@ from ted_sws.mapping_suite_processor.services.mapping_suite_validation_service import validate_mapping_suite, \ get_mapping_suite_id_from_file_system -CONCEPTUAL_MAPPINGS_ASSERTIONS = "cm_assertions" SHACL_SHAPE_INJECTION_FOLDER = "ap_data_shape" SHACL_SHAPE_RESOURCES_FOLDER = "shacl_shapes" SHACL_SHAPE_FILE_NAME = "ePO_shacl_shapes.ttl" diff --git a/ted_sws/mapping_suite_processor/services/conceptual_mapping_reader.py b/ted_sws/mapping_suite_processor/services/conceptual_mapping_reader.py deleted file mode 100644 index bf12bc23f..000000000 --- a/ted_sws/mapping_suite_processor/services/conceptual_mapping_reader.py +++ /dev/null @@ -1,18 +0,0 @@ -from pathlib import Path -from typing import Dict, Union - -from ted_sws.core.model.transform import ConceptualMapping -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader - - -def conceptual_mapping_read_list_from_pd_value(value): - return ConceptualMappingReader.read_list_from_pd_value(value) - - -def mapping_suite_read_metadata(conceptual_mappings_file_path: Path) -> Dict: - return ConceptualMappingReader.mapping_suite_read_metadata(conceptual_mappings_file_path) - - -def mapping_suite_read_conceptual_mapping(conceptual_mappings_file_path: Path) -> \ - Union[ConceptualMapping, None]: - return ConceptualMappingReader.mapping_suite_read_conceptual_mapping(conceptual_mappings_file_path) diff --git a/ted_sws/mapping_suite_processor/services/mapping_suite_reader.py b/ted_sws/mapping_suite_processor/services/mapping_suite_reader.py new file mode 100644 index 000000000..6e90b47f1 --- /dev/null +++ b/ted_sws/mapping_suite_processor/services/mapping_suite_reader.py @@ -0,0 +1,11 @@ +from pathlib import Path +from typing import Dict + +from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import MappingSuiteReader + +VERSION_KEY = "version" +MAPPING_SUITE_HASH = "mapping_suite_hash_digest" + + +def mapping_suite_read_metadata(mapping_suite_path: Path) -> Dict: + return MappingSuiteReader.mapping_suite_read_metadata(mapping_suite_path) diff --git a/ted_sws/notice_metadata_processor/services/notice_eligibility.py b/ted_sws/notice_metadata_processor/services/notice_eligibility.py index 3d65b5d62..7034b9c18 100644 --- a/ted_sws/notice_metadata_processor/services/notice_eligibility.py +++ b/ted_sws/notice_metadata_processor/services/notice_eligibility.py @@ -1,5 +1,5 @@ import datetime -from typing import Tuple, List, Optional +from typing import Tuple import semantic_version @@ -7,8 +7,6 @@ from ted_sws.core.model.notice import Notice from ted_sws.core.model.transform import MappingSuite, MappingSuiteType from ted_sws.data_manager.adapters.repository_abc import MappingSuiteRepositoryABC, NoticeRepositoryABC -from ted_sws.mapping_suite_processor.services.conceptual_mapping_generate_metadata import START_DATE_KEY, END_DATE_KEY, \ - MIN_XSD_VERSION_KEY, MAX_XSD_VERSION_KEY, E_FORMS_SUBTYPE_KEY, EFORMS_SDK_VERSIONS_KEY def format_version_with_zero_patch(version_string: str) -> semantic_version.Version: diff --git a/ted_sws/notice_validator/adapters/validation_summary_runner.py b/ted_sws/notice_validator/adapters/validation_summary_runner.py index 2af0e5b13..84929c785 100644 --- a/ted_sws/notice_validator/adapters/validation_summary_runner.py +++ b/ted_sws/notice_validator/adapters/validation_summary_runner.py @@ -171,7 +171,6 @@ def validation_summary(self) -> XMLManifestationValidationSummaryReport: if xpath_coverage_validation: notice_validation_result = xpath_coverage_validation.validation_result validation_result.xpath_covered += len(notice_validation_result.xpath_covered) - validation_result.xpath_not_covered += len(notice_validation_result.xpath_not_covered) return report diff --git a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py index 6ce93e551..fc8178d0d 100644 --- a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py +++ b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py @@ -1,15 +1,17 @@ +import io +import xml.etree.ElementTree as ET from typing import List, Set, Dict -import numpy as np from jinja2 import Environment, PackageLoader +from saxonche import PySaxonProcessor, PySaxonApiError from ted_sws.core.model.manifestation import XPATHCoverageValidationReport, XPATHCoverageValidationAssertion, \ XPATHCoverageValidationResult from ted_sws.core.model.notice import Notice -from ted_sws.core.model.transform import ConceptualMapping, ConceptualMappingXPATH, MappingSuite +from ted_sws.core.model.transform import MappingXPATH, MappingSuite from ted_sws.core.model.validation_report import ReportNotice from ted_sws.data_sampler.services.notice_xml_indexer import index_notice -from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader +from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import MappingSuiteReader from ted_sws.notice_transformer.services.notice_transformer import transform_report_notices from ted_sws.notice_validator.resources.templates import TEMPLATE_METADATA_KEY @@ -24,16 +26,13 @@ class CoverageRunner: mapping_suite: MappingSuite mapping_suite_id: str conceptual_xpaths: Set[str] = set() - conceptual_remarked_xpaths: Set[str] = set() - conceptual_xpath_data: Dict[str, ConceptualMappingXPATH] = {} - base_xpath: str + conceptual_xpath_data: Dict[str, MappingXPATH] = {} def __init__(self, mapping_suite: MappingSuite): """""" self.mapping_suite = mapping_suite self.mapping_suite_id = mapping_suite.get_mongodb_id() - conceptual_mapping: ConceptualMapping = mapping_suite.conceptual_mapping - self.init_xpath_data(conceptual_mapping=conceptual_mapping) + self.init_xpath_data(mapping_suite=mapping_suite) @classmethod def notice_xpaths(cls, notice: Notice) -> List[str]: @@ -41,28 +40,22 @@ def notice_xpaths(cls, notice: Notice) -> List[str]: notice = index_notice(notice) return notice.xml_metadata.unique_xpaths - def init_xpath_data(self, conceptual_mapping: ConceptualMapping): - for cm_xpath in conceptual_mapping.mapping_remarks: - for xpath in cm_xpath.field_xpath: - self.conceptual_remarked_xpaths.add(xpath) - self.conceptual_xpath_data[xpath] = ConceptualMappingXPATH( - xpath=xpath, - form_field=f"{cm_xpath.standard_form_field_id} - {cm_xpath.standard_form_field_name}" - ) - for cm_xpath in conceptual_mapping.xpaths: + def init_xpath_data(self, mapping_suite: MappingSuite): + for cm_xpath in MappingSuiteReader.read_mapping_suite_xpaths(mapping_suite): self.conceptual_xpaths.add(cm_xpath.xpath) self.conceptual_xpath_data[cm_xpath.xpath] = cm_xpath - self.base_xpath = conceptual_mapping.metadata.base_xpath def xpath_coverage_validation_report(self, notice: Notice) -> XPATHCoverageValidationReport: report: XPATHCoverageValidationReport = XPATHCoverageValidationReport( object_data="XPATHCoverageValidationReport", mapping_suite_identifier=self.mapping_suite_id) - xpaths: List[str] = self.notice_xpaths(notice=notice) - based_xpaths = self.based_xpaths(xpaths, self.base_xpath) - notice_xpaths: XPathDict = {notice.ted_id: based_xpaths} - self.validate_xpath_coverage_report(report, notice_xpaths, based_xpaths) + xpaths: List[str] = [] + for xpath in self.get_all_conceptual_xpaths(): + if self.check_xpath_expression_with_xml(notice.xml_manifestation.object_data.encode("utf-8"), xpath): + xpaths.append(xpath) + notice_xpaths: XPathDict = {notice.ted_id: xpaths} + self.validate_xpath_coverage_report(report, notice_xpaths, xpaths) return report @@ -72,16 +65,41 @@ def find_notice_by_xpath(cls, notice_xpaths: XPathDict, xpath: str) -> Dict[str, return notice_hit def get_all_conceptual_xpaths(self) -> Set[str]: - return self.conceptual_remarked_xpaths | self.conceptual_xpaths + return self.conceptual_xpaths + + @classmethod + def extract_namespaces(cls, xml_content): + xml_file = io.StringIO(xml_content) + namespaces = dict() + for event, elem in ET.iterparse(xml_file, events=('start-ns',)): + ns, url = elem + namespaces[ns] = url + return namespaces - def xpath_assertions(self, notice_xpaths: XPathDict, - xpaths_list: List[str]) -> List[XPATHCoverageValidationAssertion]: + @classmethod + def check_xpath_expression_with_xml(cls, xml_content, xpath_expression) -> bool: + namespaces = cls.extract_namespaces(xml_content) + with PySaxonProcessor(license=False) as proc: + xp = proc.new_xpath_processor() + for prefix, ns_uri in namespaces.items(): + xp.declare_namespace(prefix, ns_uri) + document = proc.parse_xml(xml_text=xml_content) + xp.set_context(xdm_item=document) + try: + item = xp.evaluate_single(xpath_expression) + return True if item else False + except PySaxonApiError: + return False + + def xpath_assertions( + self, notice_xpaths: XPathDict, xpaths_list: List[str] + ) -> List[XPATHCoverageValidationAssertion]: xpath_assertions = [] for xpath in self.get_all_conceptual_xpaths(): xpath_assertion = XPATHCoverageValidationAssertion() xpath_data = self.conceptual_xpath_data[xpath] form_field = xpath_data.form_field - xpath_assertion.form_field = form_field if form_field is not np.nan else '' + xpath_assertion.form_field = form_field if form_field else '' xpath_assertion.xpath = xpath xpath_assertion.count = xpaths_list.count(xpath) xpath_assertion.notice_hit = self.find_notice_by_xpath(notice_xpaths, xpath) @@ -89,40 +107,20 @@ def xpath_assertions(self, notice_xpaths: XPathDict, xpath_assertions.append(xpath_assertion) return sorted(xpath_assertions, key=lambda x: x.form_field) - def validate_xpath_coverage_report(self, report: XPATHCoverageValidationReport, notice_xpaths: XPathDict, - xpaths_list: List[str]): + def validate_xpath_coverage_report( + self, report: XPATHCoverageValidationReport, notice_xpaths: XPathDict, xpaths_list: List[str] + ): unique_notice_xpaths: Set[str] = set(xpaths_list) validation_result: XPATHCoverageValidationResult = XPATHCoverageValidationResult() validation_result.xpath_assertions = self.xpath_assertions(notice_xpaths, xpaths_list) validation_result.xpath_covered = sorted(list(self.conceptual_xpaths & unique_notice_xpaths)) - all_conceptual_xpaths = self.get_all_conceptual_xpaths() - validation_result.xpath_not_covered = sorted(list(unique_notice_xpaths - all_conceptual_xpaths)) - validation_result.xpath_extra = sorted(list(all_conceptual_xpaths - unique_notice_xpaths)) - validation_result.remarked_xpaths = sorted(list(self.conceptual_remarked_xpaths)) - unique_notice_xpaths_len = len(unique_notice_xpaths) - xpath_covered_len = len(validation_result.xpath_covered) - conceptual_xpaths_len = len(self.conceptual_xpaths) - if unique_notice_xpaths_len: - validation_result.coverage = xpath_covered_len / unique_notice_xpaths_len - if conceptual_xpaths_len: - validation_result.conceptual_coverage = xpath_covered_len / conceptual_xpaths_len report.validation_result = validation_result - @classmethod - def based_xpaths(cls, xpaths: List[str], base_xpath: str) -> List[str]: - """ - :param xpaths: - :param base_xpath: - :return: - """ - base_xpath = ConceptualMappingReader.base_xpath_as_prefix(base_xpath) - return list(filter(lambda xpath: xpath.startswith(base_xpath), xpaths)) - - def xpath_coverage_validation_summary_report(self, - notices: List[ReportNotice] - ) -> XPATHCoverageValidationReport: + def xpath_coverage_validation_summary_report( + self, notices: List[ReportNotice] + ) -> XPATHCoverageValidationReport: report: XPATHCoverageValidationReport = XPATHCoverageValidationReport( object_data="XPATHCoverageValidationReport", mapping_suite_identifier=self.mapping_suite_id) @@ -131,9 +129,14 @@ def xpath_coverage_validation_summary_report(self, xpaths_list: List[str] = [] for report_notice in notices: notice = report_notice.notice - xpaths: List[str] = self.notice_xpaths(notice=notice) - - notice_xpaths[notice.ted_id] = self.based_xpaths(xpaths, self.base_xpath) + xpaths: List[str] = [] + for xpath in self.get_all_conceptual_xpaths(): + if self.check_xpath_expression_with_xml( + report_notice.notice.xml_manifestation.object_data.encode("utf-8"), xpath + ): + xpaths.append(xpath) + + notice_xpaths[notice.ted_id] = xpaths xpaths_list += notice_xpaths[notice.ted_id] self.validate_xpath_coverage_report(report, notice_xpaths, xpaths_list) diff --git a/ted_sws/notice_validator/resources/templates/validation_summary_report.jinja2 b/ted_sws/notice_validator/resources/templates/validation_summary_report.jinja2 index 84b58216b..e5b3e45c2 100644 --- a/ted_sws/notice_validator/resources/templates/validation_summary_report.jinja2 +++ b/ted_sws/notice_validator/resources/templates/validation_summary_report.jinja2 @@ -109,7 +109,6 @@ <ul> <li>Mapping suite identifier: {{ xml_manifestation.xpath_coverage_summary.mapping_suite_identifier }}</li> <li>XPATHs covered: {{ xml_manifestation.xpath_coverage_summary.validation_result.xpath_covered }}</li> - <li>XPATHs not covered: {{ xml_manifestation.xpath_coverage_summary.validation_result.xpath_not_covered }}</li> </ul> <hr> diff --git a/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 b/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 index 8f74a98d0..250142bc2 100644 --- a/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 +++ b/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 @@ -88,16 +88,6 @@ {% if template_metadata.grouping %} <li><b>Grouping:</b> {{ template_metadata.grouping }}</li> {% endif %} - <li> - <div><hr></div> - <h4>Overall coverage</h4> - <h3>{{ "%.2f"|format((validation_result.coverage|float * 100)) }}%</h3> - <p>(Number of Conceptual XPATHs found in notices) / (Total number of unique XPATHs in provided notices)</p> - <li><div><hr></div> - <h4>Conceptual coverage</h4> - <h3>{{ "%.2f"|format((validation_result.conceptual_coverage|float * 100)) }}%</h3> - <p>(Number of Conceptual XPATHs found in notices) / (Total number of unique Conceptual XPATHs)</p> - </li> {% set nr_notices = notices|length %} <li><div><hr></div> {% if nr_notices > 1 %} @@ -150,78 +140,7 @@ </tbody> </table> {% endif %} -{% if validation_result.xpath_covered|length > 0 %} -<hr> -<h2>XPATHs covered in the "Rules" of Conceptual Mapping</h2> -<table class="display summary" data-order='[[0, "asc"]]'> - <thead> - <tr> - <th>XPATH</th> - </tr> - </thead> - <tbody> - {% for xpath in validation_result.xpath_covered %} - <tr> - <td class="break-word">{{ xpath }}</td> - </tr> - {% endfor %} - </tbody> -</table> -{% endif %} -{% if validation_result.remarked_xpaths|length > 0 %} - <hr> - <h2>XPATHs covered in the "Mapping Remarks" of Conceptual Mapping</h2> - <table class="display summary" data-order='[[0, "asc"]]'> - <thead> - <tr> - <th>XPATH</th> - </tr> - </thead> - <tbody> - {% for xpath in validation_result.remarked_xpaths %} - <tr> - <td class="break-word">{{ xpath }}</td> - </tr> - {% endfor %} - </tbody> - </table> -{% endif %} -{% if validation_result.xpath_not_covered|length > 0 %} -<hr> -<h2>XPATHs not covered by Conceptual Mapping</h2> -<table class="display summary" data-order='[[0, "asc"]]'> - <thead> - <tr> - <th>XPATH</th> - </tr> - </thead> - <tbody> - {% for xpath in validation_result.xpath_not_covered %} - <tr> - <td class="break-word">{{ xpath }}</td> - </tr> - {% endfor %} - </tbody> -</table> -{% endif %} -{% if validation_result.xpath_extra|length > 0 %} -<hr> -<h2>Extra XPATHs in Conceptual Mapping</h2> -<table class="display summary" data-order='[[0, "asc"]]'> - <thead> - <tr> - <th>XPATH</th> - </tr> - </thead> - <tbody> - {% for xpath in validation_result.xpath_extra %} - <tr> - <td class="break-word">{{ xpath }}</td> - </tr> - {% endfor %} - </tbody> -</table> -{% endif %} + <hr> </body> </html> diff --git a/ted_sws/notice_validator/services/sparql_test_suite_runner.py b/ted_sws/notice_validator/services/sparql_test_suite_runner.py index d3124746a..a7909d878 100644 --- a/ted_sws/notice_validator/services/sparql_test_suite_runner.py +++ b/ted_sws/notice_validator/services/sparql_test_suite_runner.py @@ -1,6 +1,6 @@ import re from pathlib import Path -from typing import Tuple, List +from typing import List from jinja2 import Environment, PackageLoader @@ -12,7 +12,8 @@ ReportNotice from ted_sws.core.model.validation_report_data import ReportPackageNoticeData from ted_sws.data_manager.adapters.repository_abc import NoticeRepositoryABC, MappingSuiteRepositoryABC -from ted_sws.mapping_suite_processor.services.conceptual_mapping_generate_sparql_queries import SPARQL_XPATH_SEPARATOR +from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import MappingSuiteReader, \ + SPARQL_QUERY_METADATA_TITLE, SPARQL_QUERY_METADATA_DESCRIPTION, SPARQL_QUERY_METADATA_XPATH from ted_sws.notice_transformer.adapters.notice_transformer import NoticeTransformer from ted_sws.notice_validator.adapters.sparql_runner import SPARQLRunner from ted_sws.notice_validator.resources.templates import TEMPLATE_METADATA_KEY @@ -22,13 +23,12 @@ SPARQL_TEST_SUITE_EXECUTION_HTML_REPORT_TEMPLATE = "sparql_query_results_report.jinja2" SPARQL_SUMMARY_HTML_REPORT_TEMPLATE = "sparql_summary_report.jinja2" -QUERY_METADATA_TITLE = "title" -QUERY_METADATA_DESCRIPTION = "description" -QUERY_METADATA_XPATH = "xpath" DEFAULT_QUERY_TITLE = "untitled query" DEFAULT_QUERY_DESCRIPTION = "un-described query" DEFAULT_QUERY_XPATH = [] +SPARQL_XPATH_SEPARATOR = " ;; " + class SPARQLTestSuiteRunner: """ @@ -54,14 +54,14 @@ def _sparql_query_from_file_resource(cls, file_resource: FileResource) -> SPARQL :param file_resource: :return: """ - metadata = extract_metadata_from_sparql_query(file_resource.file_content) - title = metadata[QUERY_METADATA_TITLE] \ - if QUERY_METADATA_TITLE in metadata else DEFAULT_QUERY_TITLE - description = metadata[QUERY_METADATA_DESCRIPTION] \ - if QUERY_METADATA_DESCRIPTION in metadata else DEFAULT_QUERY_DESCRIPTION - xpath = metadata[QUERY_METADATA_XPATH].split( + metadata = MappingSuiteReader.extract_metadata_from_sparql_query(file_resource.file_content) + title = metadata[SPARQL_QUERY_METADATA_TITLE] \ + if SPARQL_QUERY_METADATA_TITLE in metadata else DEFAULT_QUERY_TITLE + description = metadata[SPARQL_QUERY_METADATA_DESCRIPTION] \ + if SPARQL_QUERY_METADATA_DESCRIPTION in metadata else DEFAULT_QUERY_DESCRIPTION + xpath = metadata[SPARQL_QUERY_METADATA_XPATH].split( SPARQL_XPATH_SEPARATOR - ) if QUERY_METADATA_XPATH in metadata and metadata[QUERY_METADATA_XPATH] else DEFAULT_QUERY_XPATH + ) if SPARQL_QUERY_METADATA_XPATH in metadata and metadata[SPARQL_QUERY_METADATA_XPATH] else DEFAULT_QUERY_XPATH query = cls._sanitize_query(file_resource.file_content) return SPARQLQuery(title=title, description=description, xpath=xpath, query=query) @@ -349,19 +349,3 @@ def validate_notice_by_id_with_sparql_suite(notice_id: str, mapping_suite_identi raise ValueError(f'Mapping suite package, with {mapping_suite_identifier} id, was not found') validate_notice_with_sparql_suite(notice=notice, mapping_suite_package=mapping_suite_package, with_html=with_html) notice_repository.update(notice=notice) - - -def extract_metadata_from_sparql_query(content) -> dict: - """ - Extracts a dictionary of metadata from a SPARQL query - """ - - def _process_line(line) -> Tuple[str, str]: - if ":" in line: - key_part, value_part = line.split(":", 1) - key_part = key_part.replace("#", "").strip() - value_part = value_part.strip() - return key_part, value_part - - content_lines_with_comments = filter(lambda x: x.strip().startswith("#"), content.splitlines()) - return dict([_process_line(line) for line in content_lines_with_comments]) diff --git a/tests/e2e/mapping_suite_processor/conceptual_mapping_differ/conftest.py b/tests/e2e/mapping_suite_processor/conceptual_mapping_differ/conftest.py deleted file mode 100644 index 8af3ed218..000000000 --- a/tests/e2e/mapping_suite_processor/conceptual_mapping_differ/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -import pytest - -from tests import TEST_DATA_PATH - - -@pytest.fixture -def file_system_repository_path(): - return TEST_DATA_PATH / "notice_transformer" / "mapping_suite_processor_repository" - - -@pytest.fixture -def fake_test_mapping_suite_id() -> str: - return "test_package" diff --git a/tests/e2e/mapping_suite_processor/conceptual_mapping_differ/test_conceptual_mapping_differ.py b/tests/e2e/mapping_suite_processor/conceptual_mapping_differ/test_conceptual_mapping_differ.py deleted file mode 100644 index cc8d4d270..000000000 --- a/tests/e2e/mapping_suite_processor/conceptual_mapping_differ/test_conceptual_mapping_differ.py +++ /dev/null @@ -1,67 +0,0 @@ -from ted_sws.core.model.transform import ConceptualMapping, ConceptualMappingMetadata, ConceptualMappingDiff -from ted_sws.data_manager.adapters.mapping_suite_repository import MS_TRANSFORM_FOLDER_NAME, \ - MS_CONCEPTUAL_MAPPING_FILE_NAME -from ted_sws.mapping_suite_processor.services.conceptual_mapping_differ import mapping_suite_diff_conceptual_mappings, \ - mapping_suite_diff_files_conceptual_mappings, mapping_suite_diff_repo_conceptual_mappings, \ - generate_conceptual_mappings_diff_filename - - -def test_mapping_suite_diff_conceptual_mappings(): - mapping1: ConceptualMapping = ConceptualMapping() - metadata1: ConceptualMappingMetadata = ConceptualMappingMetadata() - metadata1.base_xpath = "BASE1" - mapping1.metadata = metadata1 - mapping2: ConceptualMapping = ConceptualMapping() - metadata2: ConceptualMappingMetadata = ConceptualMappingMetadata() - metadata2.base_xpath = "BASE2" - mapping2.metadata = metadata2 - - assert mapping_suite_diff_conceptual_mappings([mapping1, mapping2]) - - mapping2.metadata = metadata1 - - assert not mapping_suite_diff_conceptual_mappings([mapping1, mapping2])['data']['html'] - - -def test_mapping_suite_diff_file_conceptual_mappings(package_folder_path, package_F03_folder_path): - """""" - filepath1 = package_folder_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME - filepath2 = package_F03_folder_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME - - diff = mapping_suite_diff_files_conceptual_mappings([filepath1, filepath2]) - assert diff['metadata'] - assert diff['data']['original'] - - -def test_mapping_suite_diff_repo_conceptual_mappings(github_mapping_suite_id, package_folder_path): - """""" - - diff = mapping_suite_diff_repo_conceptual_mappings( - branch_or_tag_name=["main"], - mapping_suite_id=[github_mapping_suite_id], - ) - assert not diff['data']['original'] - - diff = mapping_suite_diff_repo_conceptual_mappings( - branch_or_tag_name=["main"], - mapping_suite_id=[github_mapping_suite_id], - filepath=package_folder_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME - ) - assert diff['data']['original'] - - diff = mapping_suite_diff_repo_conceptual_mappings( - branch_or_tag_name=["main"], - mapping_suite_id=[github_mapping_suite_id, "package_F06"] - ) - assert diff['data']['original'] - - -def test_generate_conceptual_mappings_diff_filename(package_folder_path, package_F03_folder_path): - """""" - filepath1 = package_folder_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME - filepath2 = package_F03_folder_path / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME - - diff = mapping_suite_diff_files_conceptual_mappings([filepath1, filepath2]) - filename = generate_conceptual_mappings_diff_filename(ConceptualMappingDiff(**diff), "conceptual_mappings_diff", - ".json") - assert filename == "conceptual_mappings_diff_F03_v0.0.1_vs_package_F03_v2.2.0.json" diff --git a/tests/unit/mapping_suite_processor/test_conceptual_mapping_reader.py b/tests/unit/mapping_suite_processor/test_conceptual_mapping_reader.py deleted file mode 100644 index aad9e333f..000000000 --- a/tests/unit/mapping_suite_processor/test_conceptual_mapping_reader.py +++ /dev/null @@ -1,24 +0,0 @@ -from pathlib import Path - -from ted_sws.mapping_suite_processor.services.conceptual_mapping_reader import mapping_suite_read_conceptual_mapping, \ - conceptual_mapping_read_list_from_pd_value -from tests import temporary_copy - - -def test__read_list_from_pd_value(): - assert conceptual_mapping_read_list_from_pd_value(None) == [] - - -def test_mapping_suite_read_conceptual_mapping(file_system_repository_path, mongodb_client): - with temporary_copy(file_system_repository_path / "test_package") as tmp_mapping_suite_package_path: - conceptual_mappings_folder = Path(tmp_mapping_suite_package_path) / "transformation" - - conceptual_mapping = mapping_suite_read_conceptual_mapping( - conceptual_mappings_folder / "conceptual_mappings.xlsx") - - assert conceptual_mapping - - conceptual_mapping = mapping_suite_read_conceptual_mapping( - conceptual_mappings_folder / "non_existing_conceptual_mappings.xlsx") - - assert not conceptual_mapping diff --git a/tests/unit/mapping_suite_processor/test_conceptual_mapping_processor.py b/tests/unit/mapping_suite_processor/test_mapping_suite_processor.py similarity index 52% rename from tests/unit/mapping_suite_processor/test_conceptual_mapping_processor.py rename to tests/unit/mapping_suite_processor/test_mapping_suite_processor.py index b20e8c6ff..10a885103 100644 --- a/tests/unit/mapping_suite_processor/test_conceptual_mapping_processor.py +++ b/tests/unit/mapping_suite_processor/test_mapping_suite_processor.py @@ -1,10 +1,5 @@ -import json -import os - from ted_sws.data_manager.adapters.mapping_suite_repository import MappingSuiteRepositoryInFileSystem, \ - MappingSuiteRepositoryMongoDB, MS_METADATA_FILE_NAME -from ted_sws.mapping_suite_processor.services.conceptual_mapping_generate_metadata import \ - mapping_suite_processor_generate_metadata, MAPPING_SUITE_HASH + MappingSuiteRepositoryMongoDB from ted_sws.mapping_suite_processor.services.conceptual_mapping_processor import \ mapping_suite_processor_load_package_in_mongo_db from tests import temporary_copy @@ -25,23 +20,3 @@ def test_mapping_suite_processor_upload_in_mongodb(file_system_repository_path, assert mapping_suite mongodb_client.drop_database(aggregates_database_name) - - -def test_mapping_suite_processor_generate_metadata(file_system_repository_path): - with temporary_copy(file_system_repository_path) as tmp_mapping_suite_package_path: - mapping_suite_package_path = tmp_mapping_suite_package_path / "test_package" - metadata_file_path = (mapping_suite_package_path / MS_METADATA_FILE_NAME) - os.remove(metadata_file_path) - assert not metadata_file_path.is_file() - mapping_suite_processor_generate_metadata(mapping_suite_path=mapping_suite_package_path) - assert metadata_file_path.is_file() - metadata = json.loads(metadata_file_path.read_text()) - assert MAPPING_SUITE_HASH in metadata - - output_metadata_file_path = mapping_suite_package_path / "other_metadata.json" - assert not output_metadata_file_path.is_file() - mapping_suite_processor_generate_metadata( - mapping_suite_path=mapping_suite_package_path, - output_metadata_file_path=output_metadata_file_path - ) - assert output_metadata_file_path.is_file() diff --git a/tests/unit/mapping_suite_processor/test_mapping_suite_structure_checker.py b/tests/unit/mapping_suite_processor/test_mapping_suite_structure_checker.py index b4d490636..2feed89a7 100644 --- a/tests/unit/mapping_suite_processor/test_mapping_suite_structure_checker.py +++ b/tests/unit/mapping_suite_processor/test_mapping_suite_structure_checker.py @@ -9,7 +9,6 @@ MS_CONCEPTUAL_MAPPING_FILE_NAME, MS_TEST_SUITE_REPORT, MS_MAPPINGS_FOLDER_NAME from ted_sws.mapping_suite_processor.adapters.mapping_suite_structure_checker import \ MS_METADATA_FILE_NAME, MappingSuiteStructureValidator -from ted_sws.mapping_suite_processor.services.conceptual_mapping_reader import mapping_suite_read_metadata KEY_VERSION = "Mapping Version" KEY_EPO = "EPO version" @@ -74,27 +73,6 @@ def test_validate_output_structure(caplog, package_folder_path_for_validator): assert MS_OUTPUT_FOLDER_NAME in caplog.text -def test_check_metadata_consistency(caplog, package_folder_path_for_validator): - with tempfile.TemporaryDirectory() as temp_folder: - shutil.copytree(package_folder_path_for_validator, temp_folder, dirs_exist_ok=True) - mapping_suite_validator = MappingSuiteStructureValidator(temp_folder) - assert mapping_suite_validator.check_metadata_consistency() - assert not mapping_suite_validator.check_metadata_consistency( - package_metadata_path=(mapping_suite_validator.mapping_suite_path / "metadata_invalid.json") - ) - assert "ERROR" in caplog.text - assert "Not the same value between metadata.json" in caplog.text - conceptual_mappings_file_path = ( - pathlib.Path(temp_folder) / MS_TRANSFORM_FOLDER_NAME / MS_CONCEPTUAL_MAPPING_FILE_NAME) - conceptual_mappings_file = pathlib.Path(conceptual_mappings_file_path) - assert conceptual_mappings_file.exists() - metadata_file = pathlib.Path(package_folder_path_for_validator / MS_METADATA_FILE_NAME) - assert metadata_file.exists() - mapping_version = mapping_suite_read_metadata(conceptual_mappings_file_path=conceptual_mappings_file_path) - assert KEY_VERSION in mapping_version - assert KEY_EPO in mapping_version - - def test_check_for_changes_by_version(caplog, package_folder_path_for_validator): with tempfile.TemporaryDirectory() as temp_folder: shutil.copytree(package_folder_path_for_validator, temp_folder, dirs_exist_ok=True) diff --git a/tests/unit/notice_validator/test_sparql_test_suite_runner.py b/tests/unit/notice_validator/test_sparql_test_suite_runner.py index b62d8d9ab..28a61a134 100644 --- a/tests/unit/notice_validator/test_sparql_test_suite_runner.py +++ b/tests/unit/notice_validator/test_sparql_test_suite_runner.py @@ -5,6 +5,7 @@ from ted_sws.core.model.notice import NoticeStatus from ted_sws.core.model.validation_report import ReportNotice, SPARQLValidationSummaryReport from ted_sws.data_manager.adapters.mapping_suite_repository import MappingSuiteRepositoryInFileSystem +from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import MappingSuiteReader from ted_sws.notice_validator.services.sparql_test_suite_runner import SPARQLTestSuiteRunner, SPARQLReportBuilder, \ validate_notice_with_sparql_suite, validate_notice_by_id_with_sparql_suite, extract_metadata_from_sparql_query, \ generate_sparql_validation_summary_report @@ -144,18 +145,18 @@ def test_validate_notice_by_id_with_sparql_suite(notice_with_distilled_status, r def test_get_metadata_from_freaking_sparql_queries(query_content, query_content_without_description, query_content_with_xpath): - metadata = extract_metadata_from_sparql_query(query_content) + metadata = MappingSuiteReader.extract_metadata_from_sparql_query(query_content) assert metadata["title"] assert metadata["description"] assert "SELECT" not in metadata - metadata = extract_metadata_from_sparql_query(query_content_with_xpath) + metadata = MappingSuiteReader.extract_metadata_from_sparql_query(query_content_with_xpath) assert metadata["title"] assert metadata["description"] assert metadata["xpath"] assert "PREFIX" not in metadata - metadata = extract_metadata_from_sparql_query(query_content_without_description) + metadata = MappingSuiteReader.extract_metadata_from_sparql_query(query_content_without_description) assert metadata["title"] assert "description" not in metadata From 9e66d6b5bea994a9388283517304f8b3b008944d Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 01:00:48 +0300 Subject: [PATCH 02/13] updates xpath validator --- .../adapters/mapping_suite_reader.py | 5 +- ...ceptual_mapping_generate_sparql_queries.py | 50 ------------------- .../test_sparql_test_suite_runner.py | 2 +- 3 files changed, 4 insertions(+), 53 deletions(-) delete mode 100644 tests/unit/mapping_suite_processor/services/test_conceptual_mapping_generate_sparql_queries.py diff --git a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py index c635cee6e..c1080b0c5 100644 --- a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py +++ b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py @@ -3,8 +3,7 @@ from typing import Dict, List, Tuple from ted_sws.core.model.transform import MappingXPATH, MappingSuite -from ted_sws.data_manager.adapters.mapping_suite_repository import MS_METADATA_FILE_NAME, MS_VALIDATE_FOLDER_NAME, \ - MS_SPARQL_FOLDER_NAME +from ted_sws.data_manager.adapters.mapping_suite_repository import MS_METADATA_FILE_NAME # This set of constants refers to fields in the Conceptual Mapping file VERSION_FIELD = 'Mapping Version' @@ -15,6 +14,8 @@ SPARQL_QUERY_METADATA_DESCRIPTION = "description" SPARQL_QUERY_METADATA_XPATH = "xpath" +FILE_NAME_KEY = "File name" +REF_INTEGRATION_TESTS_KEY = "Reference to Integration Tests (O)" class MappingSuiteReader: """ diff --git a/tests/unit/mapping_suite_processor/services/test_conceptual_mapping_generate_sparql_queries.py b/tests/unit/mapping_suite_processor/services/test_conceptual_mapping_generate_sparql_queries.py deleted file mode 100644 index 9b348edd9..000000000 --- a/tests/unit/mapping_suite_processor/services/test_conceptual_mapping_generate_sparql_queries.py +++ /dev/null @@ -1,50 +0,0 @@ -import shutil -import tempfile -from pathlib import Path - -from ted_sws.data_manager.adapters.mapping_suite_repository import MS_TRANSFORM_FOLDER_NAME, \ - MS_CONCEPTUAL_MAPPING_FILE_NAME -from ted_sws.data_manager.adapters.mapping_suite_repository import MS_VALIDATE_FOLDER_NAME, \ - MS_SPARQL_FOLDER_NAME -from ted_sws.mapping_suite_processor.services.conceptual_mapping_generate_sparql_queries import \ - mapping_suite_processor_generate_sparql_queries - -CONCEPTUAL_MAPPINGS_FILE_TEMPLATE = '{mappings_path}/{mapping_suite_id}/' + MS_TRANSFORM_FOLDER_NAME + '/' \ - + MS_CONCEPTUAL_MAPPING_FILE_NAME -MAPPING_SUITE_FILE_TEMPLATE = '{mappings_path}/{mapping_suite_id}' -DEFAULT_OUTPUT_SPARQL_QUERIES_FOLDER = '{mappings_path}/{mapping_suite_id}/' + MS_VALIDATE_FOLDER_NAME + '/' + \ - MS_SPARQL_FOLDER_NAME + '/cm_assertions' - - -def test_mapping_suite_processor_generate_sparql_queries(caplog, fake_mapping_suite_id, file_system_repository_path): - with tempfile.TemporaryDirectory() as temp_folder: - temp_mapping_suite_path = Path(temp_folder) - shutil.copytree(file_system_repository_path, temp_mapping_suite_path, dirs_exist_ok=True) - - conceptual_mappings_file_path = Path(CONCEPTUAL_MAPPINGS_FILE_TEMPLATE.format( - mappings_path=temp_mapping_suite_path, - mapping_suite_id=fake_mapping_suite_id - )) - output_sparql_queries_folder_path = Path(DEFAULT_OUTPUT_SPARQL_QUERIES_FOLDER.format( - mappings_path=temp_mapping_suite_path, - mapping_suite_id=fake_mapping_suite_id - )) - mapping_suite_processor_generate_sparql_queries( - conceptual_mappings_file_path=conceptual_mappings_file_path, - output_sparql_queries_folder_path=output_sparql_queries_folder_path - ) - assert output_sparql_queries_folder_path.is_dir() - assert any(output_sparql_queries_folder_path.iterdir()) - assert "ERROR" not in caplog.text - - mapping_suite_processor_generate_sparql_queries( - conceptual_mappings_file_path=conceptual_mappings_file_path, - output_sparql_queries_folder_path=output_sparql_queries_folder_path, - prefixes_definitions={ - "test": "https://test" - } - ) - assert output_sparql_queries_folder_path.is_dir() - assert any(output_sparql_queries_folder_path.iterdir()) - assert "ERROR" in caplog.text - assert "is not defined" in caplog.text \ No newline at end of file diff --git a/tests/unit/notice_validator/test_sparql_test_suite_runner.py b/tests/unit/notice_validator/test_sparql_test_suite_runner.py index 28a61a134..3f76b4bc5 100644 --- a/tests/unit/notice_validator/test_sparql_test_suite_runner.py +++ b/tests/unit/notice_validator/test_sparql_test_suite_runner.py @@ -7,7 +7,7 @@ from ted_sws.data_manager.adapters.mapping_suite_repository import MappingSuiteRepositoryInFileSystem from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import MappingSuiteReader from ted_sws.notice_validator.services.sparql_test_suite_runner import SPARQLTestSuiteRunner, SPARQLReportBuilder, \ - validate_notice_with_sparql_suite, validate_notice_by_id_with_sparql_suite, extract_metadata_from_sparql_query, \ + validate_notice_with_sparql_suite, validate_notice_by_id_with_sparql_suite, \ generate_sparql_validation_summary_report From 3d528c50bd4a53107ec10d0d2df4f50aee7a997a Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 01:12:07 +0300 Subject: [PATCH 03/13] updates xpath validator --- tests/test_data/package_F03_demo/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data/package_F03_demo/metadata.json b/tests/test_data/package_F03_demo/metadata.json index 5a0c4cd71..bade0c73b 100644 --- a/tests/test_data/package_F03_demo/metadata.json +++ b/tests/test_data/package_F03_demo/metadata.json @@ -22,5 +22,5 @@ ] } }, - "mapping_suite_hash_digest": "9a525927676d556f43d4bc053f9f9ac042f5d4ff1a80a0f83b30451ed7013e7d" + "mapping_suite_hash_digest": "2d649620fd7a85978a94e4834505f31fb0e678b5a21873450565eea1e5d3e313" } \ No newline at end of file From b9cd03973e6e174df1e225b4b8c6021a957969b3 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 01:18:04 +0300 Subject: [PATCH 04/13] updates xpath validator --- .../adapters/mapping_suite_structure_checker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py b/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py index d2dfb6580..c028983c7 100644 --- a/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py +++ b/ted_sws/mapping_suite_processor/adapters/mapping_suite_structure_checker.py @@ -146,6 +146,7 @@ def check_for_changes_by_version(self) -> bool: metadata = mapping_suite_read_metadata(mapping_suite_path=self.mapping_suite_path) version = metadata.get(VERSION_KEY) + mapping_suite_versioned_hash = MappingSuiteHasher(self.mapping_suite_path).hash_mapping_suite( with_version=version) From 91fe65def9487690dec38cbb1bfda3dd5aebdd43 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 01:39:00 +0300 Subject: [PATCH 05/13] updates xpath validator --- .../adapters/mapping_suite_reader.py | 2 +- .../adapters/xpath_coverage_runner.py | 4 ++-- .../validation/sparql/cm_assertions/sparql_query_0.rq | 10 ++++++++++ .../validation/sparql/cm_assertions/sparql_query_1.rq | 11 +++++++++++ .../validation/sparql/cm_assertions/sparql_query_3.rq | 10 ++++++++++ .../notice_validator/test_xpath_coverage_runner.py | 1 + 6 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_0.rq create mode 100644 tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_1.rq create mode 100644 tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_3.rq diff --git a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py index c1080b0c5..0b26f84b5 100644 --- a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py +++ b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py @@ -62,7 +62,7 @@ def read_mapping_suite_xpaths(cls, mapping_suite: MappingSuite) -> List[MappingX processed_xpaths = set() for test_suite in mapping_suite.sparql_test_suites: - if test_suite != CONCEPTUAL_MAPPINGS_ASSERTIONS: + if test_suite.identifier != CONCEPTUAL_MAPPINGS_ASSERTIONS: continue for sparql_test in test_suite.sparql_tests: diff --git a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py index fc8178d0d..fb3905454 100644 --- a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py +++ b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py @@ -52,7 +52,7 @@ def xpath_coverage_validation_report(self, notice: Notice) -> XPATHCoverageValid xpaths: List[str] = [] for xpath in self.get_all_conceptual_xpaths(): - if self.check_xpath_expression_with_xml(notice.xml_manifestation.object_data.encode("utf-8"), xpath): + if self.check_xpath_expression_with_xml(notice.xml_manifestation.object_data, xpath): xpaths.append(xpath) notice_xpaths: XPathDict = {notice.ted_id: xpaths} self.validate_xpath_coverage_report(report, notice_xpaths, xpaths) @@ -132,7 +132,7 @@ def xpath_coverage_validation_summary_report( xpaths: List[str] = [] for xpath in self.get_all_conceptual_xpaths(): if self.check_xpath_expression_with_xml( - report_notice.notice.xml_manifestation.object_data.encode("utf-8"), xpath + report_notice.notice.xml_manifestation.object_data, xpath ): xpaths.append(xpath) diff --git a/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_0.rq b/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_0.rq new file mode 100644 index 000000000..bc3f0c017 --- /dev/null +++ b/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_0.rq @@ -0,0 +1,10 @@ +#title: BT-701-notice BT-701-notice - BT-701-notice +#description: “BT-701-notice BT-701-notice - BT-701-notice” in SF corresponds to “BT-701 Notice Identifier” in eForms. The corresponding XML element is /*/cbc:ID[@schemeName='notice-id']. The expected ontology instances are epo: epo:Notice / epo:Indentifier / rdf:langString . +#xpath: /*/cbc:ID[@schemeName='notice-id'] + +PREFIX epo: <http://data.europa.eu/a4g/ontology#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + +ASK WHERE { + ?this rdf:type epo:Notice . + ?this epo:hasID / epo:hasIdentifierValue ?value } \ No newline at end of file diff --git a/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_1.rq b/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_1.rq new file mode 100644 index 000000000..00457353c --- /dev/null +++ b/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_1.rq @@ -0,0 +1,11 @@ +#title: ND-ContractingParty ND-ContractingParty - ND-ContractingParty +#description: “ND-ContractingParty ND-ContractingParty - ND-ContractingParty” in SF corresponds to “nan nan” in eForms. The corresponding XML element is /*/cac:ContractingParty. The expected ontology instances are epo: epo:Notice / epo:AgentInRole (from CL1) . +#xpath: /*/cac:ContractingParty + +PREFIX epo: <http://data.europa.eu/a4g/ontology#> +PREFIX epo-not: <http://data.europa.eu/a4g/ontology#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + +ASK WHERE { + ?this rdf:type epo:Notice . + ?this epo-not:announcesRole ?value } \ No newline at end of file diff --git a/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_3.rq b/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_3.rq new file mode 100644 index 000000000..6138e7ce5 --- /dev/null +++ b/tests/test_data/notice_validator/test_repository/test_package_F03/validation/sparql/cm_assertions/sparql_query_3.rq @@ -0,0 +1,10 @@ +#title: ND-ServiceProvider ND-ContractingParty.ND-ServiceProvider - ND-ServiceProvider +#description: “ND-ServiceProvider ND-ContractingParty.ND-ServiceProvider - ND-ServiceProvider” in SF corresponds to “nan nan” in eForms. The corresponding XML element is /*/cac:ContractingParty/cac:Party. The expected ontology instances are epo: org:Organization . +#xpath: /TED_EXPORT/FORM_SECTION/F03_2014/CONTRACTING_BODY/ADDRESS_CONTRACTING_BODY/OFFICIALNAME + +PREFIX org: <http://www.w3.org/ns/org#> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + +ASK WHERE { + ?this rdf:type org:Organization . + ?this a org:Organization. } \ No newline at end of file diff --git a/tests/unit/notice_validator/test_xpath_coverage_runner.py b/tests/unit/notice_validator/test_xpath_coverage_runner.py index 04a8f1fb4..40e98e2d0 100644 --- a/tests/unit/notice_validator/test_xpath_coverage_runner.py +++ b/tests/unit/notice_validator/test_xpath_coverage_runner.py @@ -18,6 +18,7 @@ def test_xpath_coverage_runner(fake_notice_F03, fake_mapping_suite_F03_id, fake_ assert "mapping_suite_identifier" in json_report assert "validation_result" in json_report assert "xpath_assertions" in json_report["validation_result"] + assert "xpath_covered" in json_report["validation_result"] assert xpath_coverage_html_report(report) From be2814fb8db56982807fa9dd25aafee12f156ded Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 01:52:04 +0300 Subject: [PATCH 06/13] updates xpath validator --- .../adapters/xpath_coverage_runner.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py index fb3905454..92b2f1a6a 100644 --- a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py +++ b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py @@ -3,7 +3,7 @@ from typing import List, Set, Dict from jinja2 import Environment, PackageLoader -from saxonche import PySaxonProcessor, PySaxonApiError +from saxonche import PySaxonProcessor, PySaxonApiError, PyXPathProcessor from ted_sws.core.model.manifestation import XPATHCoverageValidationReport, XPATHCoverageValidationAssertion, \ XPATHCoverageValidationResult @@ -51,8 +51,9 @@ def xpath_coverage_validation_report(self, notice: Notice) -> XPATHCoverageValid mapping_suite_identifier=self.mapping_suite_id) xpaths: List[str] = [] + xp = self.init_xp_processor(notice) for xpath in self.get_all_conceptual_xpaths(): - if self.check_xpath_expression_with_xml(notice.xml_manifestation.object_data, xpath): + if self.check_xpath_expression(xpath, xp): xpaths.append(xpath) notice_xpaths: XPathDict = {notice.ted_id: xpaths} self.validate_xpath_coverage_report(report, notice_xpaths, xpaths) @@ -77,19 +78,25 @@ def extract_namespaces(cls, xml_content): return namespaces @classmethod - def check_xpath_expression_with_xml(cls, xml_content, xpath_expression) -> bool: + def init_xp_processor(cls, notice: Notice) -> PyXPathProcessor: + xml_content = notice.xml_manifestation.object_data namespaces = cls.extract_namespaces(xml_content) - with PySaxonProcessor(license=False) as proc: - xp = proc.new_xpath_processor() - for prefix, ns_uri in namespaces.items(): - xp.declare_namespace(prefix, ns_uri) - document = proc.parse_xml(xml_text=xml_content) - xp.set_context(xdm_item=document) - try: - item = xp.evaluate_single(xpath_expression) - return True if item else False - except PySaxonApiError: - return False + proc = PySaxonProcessor(license=False) + xp = proc.new_xpath_processor() + for prefix, ns_uri in namespaces.items(): + xp.declare_namespace(prefix, ns_uri) + document = proc.parse_xml(xml_text=xml_content) + xp.set_context(xdm_item=document) + + return xp + + @classmethod + def check_xpath_expression(cls, xpath_expression: str, xp: PyXPathProcessor) -> bool: + try: + item = xp.evaluate_single(xpath_expression) + return True if item else False + except PySaxonApiError: + return False def xpath_assertions( self, notice_xpaths: XPathDict, xpaths_list: List[str] @@ -130,10 +137,9 @@ def xpath_coverage_validation_summary_report( for report_notice in notices: notice = report_notice.notice xpaths: List[str] = [] + xp = self.init_xp_processor(report_notice.notice) for xpath in self.get_all_conceptual_xpaths(): - if self.check_xpath_expression_with_xml( - report_notice.notice.xml_manifestation.object_data, xpath - ): + if self.check_xpath_expression(xpath, xp): xpaths.append(xpath) notice_xpaths[notice.ted_id] = xpaths From b0f5ff7f7b5bc049df5a0ad2ecc23270968b3ddb Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 02:01:13 +0300 Subject: [PATCH 07/13] updates xpath validator --- ted_sws/notice_validator/adapters/xpath_coverage_runner.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py index 92b2f1a6a..e473b3fd6 100644 --- a/ted_sws/notice_validator/adapters/xpath_coverage_runner.py +++ b/ted_sws/notice_validator/adapters/xpath_coverage_runner.py @@ -10,7 +10,6 @@ from ted_sws.core.model.notice import Notice from ted_sws.core.model.transform import MappingXPATH, MappingSuite from ted_sws.core.model.validation_report import ReportNotice -from ted_sws.data_sampler.services.notice_xml_indexer import index_notice from ted_sws.mapping_suite_processor.adapters.mapping_suite_reader import MappingSuiteReader from ted_sws.notice_transformer.services.notice_transformer import transform_report_notices from ted_sws.notice_validator.resources.templates import TEMPLATE_METADATA_KEY @@ -34,12 +33,6 @@ def __init__(self, mapping_suite: MappingSuite): self.mapping_suite_id = mapping_suite.get_mongodb_id() self.init_xpath_data(mapping_suite=mapping_suite) - @classmethod - def notice_xpaths(cls, notice: Notice) -> List[str]: - if not notice.xml_metadata or not notice.xml_metadata.unique_xpaths: - notice = index_notice(notice) - return notice.xml_metadata.unique_xpaths - def init_xpath_data(self, mapping_suite: MappingSuite): for cm_xpath in MappingSuiteReader.read_mapping_suite_xpaths(mapping_suite): self.conceptual_xpaths.add(cm_xpath.xpath) From 1d49e15d96db9c355db6d885b266e9ba1a64905b Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 10:31:43 +0300 Subject: [PATCH 08/13] updates xpath validator --- .../templates/xpath_coverage_report.jinja2 | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 b/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 index 250142bc2..d7b5b8676 100644 --- a/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 +++ b/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 @@ -88,6 +88,16 @@ {% if template_metadata.grouping %} <li><b>Grouping:</b> {{ template_metadata.grouping }}</li> {% endif %} + <li> + <div><hr></div> + <h4>Overall coverage</h4> + <h3>{{ "%.2f"|format((validation_result.coverage|float * 100)) }}%</h3> + <p>(Number of Conceptual XPATHs found in notices) / (Total number of unique XPATHs in provided notices)</p> + <li><div><hr></div> + <h4>Conceptual coverage</h4> + <h3>{{ "%.2f"|format((validation_result.conceptual_coverage|float * 100)) }}%</h3> + <p>(Number of Conceptual XPATHs found in notices) / (Total number of unique Conceptual XPATHs)</p> + </li> {% set nr_notices = notices|length %} <li><div><hr></div> {% if nr_notices > 1 %} @@ -140,7 +150,24 @@ </tbody> </table> {% endif %} - +{% if validation_result.xpath_covered|length > 0 %} +<hr> +<h2>XPATHs covered in the "Rules" of Conceptual Mapping</h2> +<table class="display summary" data-order='[[0, "asc"]]'> + <thead> + <tr> + <th>XPATH</th> + </tr> + </thead> + <tbody> + {% for xpath in validation_result.xpath_covered %} + <tr> + <td class="break-word">{{ xpath }}</td> + </tr> + {% endfor %} + </tbody> +</table> +{% endif %} <hr> </body> </html> From a43cf6058b99adc950ae638337fb17cbbdb62524 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 10:52:15 +0300 Subject: [PATCH 09/13] updates xpath validator --- Makefile | 6 +++--- infra/airflow-cluster/Dockerfile | 4 ++-- infra/airflow-cluster/docker-compose-worker.yaml | 2 +- infra/airflow-cluster/docker-compose.yaml | 2 +- infra/airflow/Dockerfile | 4 ++-- infra/airflow/docker-compose.yaml | 2 +- infra/aws/airflow.yml | 10 +++++----- infra/aws/worker.yml | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 440654d68..62913e4ce 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ ENV_FILE := .env PROJECT_PATH = $(shell pwd) AIRFLOW_INFRA_FOLDER ?= ${PROJECT_PATH}/.airflow RML_MAPPER_PATH = ${PROJECT_PATH}/.rmlmapper/rmlmapper.jar -XML_PROCESSOR_PATH = ${PROJECT_PATH}/.saxon/saxon-he-10.6.jar +XML_PROCESSOR_PATH = ${PROJECT_PATH}/.saxon/saxon-he-10.9.jar LIMES_ALIGNMENT_PATH = $(PROJECT_PATH)/.limes/limes.jar HOSTNAME = $(shell hostname) CAROOT = $(shell pwd)/infra/traefik/certs @@ -222,8 +222,8 @@ init-limes: init-saxon: @ echo -e "$(BUILD_PRINT)Saxon folder initialization $(END_BUILD_PRINT)" - @ wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-6J.zip -P .saxon/ - @ cd .saxon && unzip SaxonHE10-6J.zip && rm -rf SaxonHE10-6J.zip + @ wget -c https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE10-9/SaxonHE10-9J.zip -P .saxon/ + @ cd .saxon && unzip SaxonHE10-9J.zip && rm -rf SaxonHE10-9J.zip start-project-services: | start-airflow start-mongo init-rml-mapper init-limes start-allegro-graph start-metabase stop-project-services: | stop-airflow stop-mongo stop-allegro-graph stop-metabase diff --git a/infra/airflow-cluster/Dockerfile b/infra/airflow-cluster/Dockerfile index 90cd6323e..f572786a5 100644 --- a/infra/airflow-cluster/Dockerfile +++ b/infra/airflow-cluster/Dockerfile @@ -18,8 +18,8 @@ RUN mkdir -p ./.rmlmapper RUN wget -c https://github.com/RMLio/rmlmapper-java/releases/download/v6.2.2/rmlmapper-6.2.2-r371-all.jar -O ./.rmlmapper/rmlmapper.jar -RUN wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-6J.zip -P .saxon/ -RUN cd .saxon && unzip SaxonHE10-6J.zip && rm -rf SaxonHE10-6J.zip +RUN wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-9J.zip -P .saxon/ +RUN cd .saxon && unzip SaxonHE10-9J.zip && rm -rf SaxonHE10-9J.zip RUN mkdir -p ./.limes diff --git a/infra/airflow-cluster/docker-compose-worker.yaml b/infra/airflow-cluster/docker-compose-worker.yaml index f394e54d7..070c5c6b9 100644 --- a/infra/airflow-cluster/docker-compose-worker.yaml +++ b/infra/airflow-cluster/docker-compose-worker.yaml @@ -76,7 +76,7 @@ x-airflow-common: PYTHONPATH: /opt/airflow/ AIRFLOW_HOME: /opt/airflow RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar DAG_LOGGER_CONFIG_HANDLERS: ${DAG_LOGGER_CONFIG_HANDLERS} volumes: diff --git a/infra/airflow-cluster/docker-compose.yaml b/infra/airflow-cluster/docker-compose.yaml index 8b134a72d..cc0c7e942 100644 --- a/infra/airflow-cluster/docker-compose.yaml +++ b/infra/airflow-cluster/docker-compose.yaml @@ -78,7 +78,7 @@ x-airflow-common: PYTHONPATH: /opt/airflow/ AIRFLOW_HOME: /opt/airflow RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar DAG_LOGGER_CONFIG_HANDLERS: ${DAG_LOGGER_CONFIG_HANDLERS} volumes: diff --git a/infra/airflow/Dockerfile b/infra/airflow/Dockerfile index 076f48713..409c94dcf 100644 --- a/infra/airflow/Dockerfile +++ b/infra/airflow/Dockerfile @@ -18,8 +18,8 @@ RUN mkdir -p ./.rmlmapper ./dags ./ted_sws ./temp RUN wget -c https://github.com/RMLio/rmlmapper-java/releases/download/v6.2.2/rmlmapper-6.2.2-r371-all.jar -O ./.rmlmapper/rmlmapper.jar -RUN wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-6J.zip -P .saxon/ -RUN cd .saxon && unzip SaxonHE10-6J.zip && rm -rf SaxonHE10-6J.zip +RUN wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-9J.zip -P .saxon/ +RUN cd .saxon && unzip SaxonHE10-9J.zip && rm -rf SaxonHE10-9J.zip RUN mkdir -p ./.limes RUN wget -c https://github.com/dice-group/LIMES/releases/download/1.7.9/limes.jar -P ./.limes diff --git a/infra/airflow/docker-compose.yaml b/infra/airflow/docker-compose.yaml index e196b4078..026e4bd98 100644 --- a/infra/airflow/docker-compose.yaml +++ b/infra/airflow/docker-compose.yaml @@ -68,7 +68,7 @@ x-airflow-common: PYTHONPATH: /opt/airflow/ AIRFLOW_HOME: /opt/airflow RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar DAG_LOGGER_CONFIG_HANDLERS: ${DAG_LOGGER_CONFIG_HANDLERS} volumes: diff --git a/infra/aws/airflow.yml b/infra/aws/airflow.yml index e7e4f5b43..d7db49d2c 100644 --- a/infra/aws/airflow.yml +++ b/infra/aws/airflow.yml @@ -32,7 +32,7 @@ services: PYTHONPATH: /opt/airflow/ LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar AIRFLOW_UID: ${AIRFLOW_UID} image: ${AIRFLOW_IMAGE_URI} logging: @@ -77,7 +77,7 @@ services: PYTHONPATH: /opt/airflow/ LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar healthcheck: test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] interval: 10s @@ -125,7 +125,7 @@ services: PYTHONPATH: /opt/airflow/ LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar image: ${AIRFLOW_IMAGE_URI} logging: driver: awslogs @@ -168,7 +168,7 @@ services: PYTHONPATH: /opt/airflow/ LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar image: ${AIRFLOW_IMAGE_URI} logging: driver: awslogs @@ -215,7 +215,7 @@ services: PYTHONPATH: /opt/airflow/ LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar image: ${AIRFLOW_IMAGE_URI} labels: ecs-local.task-definition-input.type: remote diff --git a/infra/aws/worker.yml b/infra/aws/worker.yml index c9a8a865b..28b4b39f4 100644 --- a/infra/aws/worker.yml +++ b/infra/aws/worker.yml @@ -28,7 +28,7 @@ services: PYTHONPATH: /opt/airflow/ LIMES_ALIGNMENT_PATH: /opt/airflow/.limes/limes.jar RML_MAPPER_PATH: /opt/airflow/.rmlmapper/rmlmapper.jar - XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.6.jar + XML_PROCESSOR_PATH: /opt/airflow/.saxon/saxon-he-10.9.jar image: ${AIRFLOW_IMAGE_URI} # hostname: "worker1" ports: From f1564299d3418df394d5879f1487e050771e7292 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 10:57:33 +0300 Subject: [PATCH 10/13] updates xpath validator --- .../resources/templates/xpath_coverage_report.jinja2 | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 b/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 index d7b5b8676..8d16a2ea6 100644 --- a/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 +++ b/ted_sws/notice_validator/resources/templates/xpath_coverage_report.jinja2 @@ -88,16 +88,6 @@ {% if template_metadata.grouping %} <li><b>Grouping:</b> {{ template_metadata.grouping }}</li> {% endif %} - <li> - <div><hr></div> - <h4>Overall coverage</h4> - <h3>{{ "%.2f"|format((validation_result.coverage|float * 100)) }}%</h3> - <p>(Number of Conceptual XPATHs found in notices) / (Total number of unique XPATHs in provided notices)</p> - <li><div><hr></div> - <h4>Conceptual coverage</h4> - <h3>{{ "%.2f"|format((validation_result.conceptual_coverage|float * 100)) }}%</h3> - <p>(Number of Conceptual XPATHs found in notices) / (Total number of unique Conceptual XPATHs)</p> - </li> {% set nr_notices = notices|length %} <li><div><hr></div> {% if nr_notices > 1 %} From 5d2fca31f974a97d5cac555377a9dbf68f4667b4 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 11:02:13 +0300 Subject: [PATCH 11/13] updates xpath validator --- infra/airflow-cluster/Dockerfile | 2 +- infra/airflow/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/airflow-cluster/Dockerfile b/infra/airflow-cluster/Dockerfile index f572786a5..e252ba3b0 100644 --- a/infra/airflow-cluster/Dockerfile +++ b/infra/airflow-cluster/Dockerfile @@ -18,7 +18,7 @@ RUN mkdir -p ./.rmlmapper RUN wget -c https://github.com/RMLio/rmlmapper-java/releases/download/v6.2.2/rmlmapper-6.2.2-r371-all.jar -O ./.rmlmapper/rmlmapper.jar -RUN wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-9J.zip -P .saxon/ +RUN wget -c https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE10-9/SaxonHE10-9J.zip -P .saxon/ RUN cd .saxon && unzip SaxonHE10-9J.zip && rm -rf SaxonHE10-9J.zip diff --git a/infra/airflow/Dockerfile b/infra/airflow/Dockerfile index 409c94dcf..7fe125867 100644 --- a/infra/airflow/Dockerfile +++ b/infra/airflow/Dockerfile @@ -18,7 +18,7 @@ RUN mkdir -p ./.rmlmapper ./dags ./ted_sws ./temp RUN wget -c https://github.com/RMLio/rmlmapper-java/releases/download/v6.2.2/rmlmapper-6.2.2-r371-all.jar -O ./.rmlmapper/rmlmapper.jar -RUN wget -c https://kumisystems.dl.sourceforge.net/project/saxon/Saxon-HE/10/Java/SaxonHE10-9J.zip -P .saxon/ +RUN wget -c https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE10-9/SaxonHE10-9J.zip -P .saxon/ RUN cd .saxon && unzip SaxonHE10-9J.zip && rm -rf SaxonHE10-9J.zip RUN mkdir -p ./.limes From 6446a16a6697c3195910bad9060bb2411d60a442 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 12:22:23 +0300 Subject: [PATCH 12/13] updates xpath validator --- ted_sws/notice_packager/services/metadata_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ted_sws/notice_packager/services/metadata_transformer.py b/ted_sws/notice_packager/services/metadata_transformer.py index cf08db01f..59d87f525 100644 --- a/ted_sws/notice_packager/services/metadata_transformer.py +++ b/ted_sws/notice_packager/services/metadata_transformer.py @@ -144,4 +144,4 @@ def publication_work_oj_identifier(notice_id, notice_metadata): def filled_ojs_issue_number(ojs_issue_number: str) -> str: # just return the number without any preceding 0 (leaved the formula as it is in case of revert) - return ojs_issue_number.zfill(0) + return ojs_issue_number.split('/')[0].zfill(0) From 0dbac92e06804afc2fa711fd56bb5fc1b23fe739 Mon Sep 17 00:00:00 2001 From: Kolea PLESCO <kaleanych@UNIVERSE.local> Date: Tue, 9 Apr 2024 12:52:09 +0300 Subject: [PATCH 13/13] updates xpath validator --- .../adapters/mapping_suite_reader.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py index 0b26f84b5..6d47528c2 100644 --- a/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py +++ b/ted_sws/mapping_suite_processor/adapters/mapping_suite_reader.py @@ -62,19 +62,16 @@ def read_mapping_suite_xpaths(cls, mapping_suite: MappingSuite) -> List[MappingX processed_xpaths = set() for test_suite in mapping_suite.sparql_test_suites: - if test_suite.identifier != CONCEPTUAL_MAPPINGS_ASSERTIONS: - continue - for sparql_test in test_suite.sparql_tests: metadata = cls.extract_metadata_from_sparql_query(sparql_test.file_content) - xpath = metadata[SPARQL_QUERY_METADATA_XPATH] - if xpath not in processed_xpaths: - cm_xpath: MappingXPATH = MappingXPATH( - xpath=xpath, - form_field=metadata[SPARQL_QUERY_METADATA_TITLE] - ) - xpaths.append(cm_xpath) - processed_xpaths.add(xpath) - break + if SPARQL_QUERY_METADATA_XPATH in metadata: + xpath = metadata[SPARQL_QUERY_METADATA_XPATH] + if xpath not in processed_xpaths: + cm_xpath: MappingXPATH = MappingXPATH( + xpath=xpath, + form_field=metadata[SPARQL_QUERY_METADATA_TITLE] + ) + xpaths.append(cm_xpath) + processed_xpaths.add(xpath) return xpaths