OP-TED · CaptainOfHacks · Feb 20, 2024 · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024
diff --git a/ted_sws/data_sampler/entrypoints/notebooks/eform_xml_saxon_indexer.ipynb b/ted_sws/data_sampler/entrypoints/notebooks/eform_xml_saxon_indexer.ipynb
diff --git a/ted_sws/data_sampler/services/notice_xml_indexer.py b/ted_sws/data_sampler/services/notice_xml_indexer.py
@@ -1,6 +1,9 @@
 import pathlib
+import re
 import tempfile
-from typing import List
+import xml.etree.ElementTree as XMLElementTree
+from io import StringIO
+from typing import List, Set, Generator, Optional
 
 from pymongo import MongoClient
 
@@ -10,12 +13,13 @@
 from ted_sws.data_manager.adapters.notice_repository import NoticeRepository
 from ted_sws.mapping_suite_processor.adapters.conceptual_mapping_reader import ConceptualMappingReader
 from ted_sws.resources import XSLT_FILES_PATH
-import xml.etree.ElementTree as XMLElementTree
-import re
 
 UNIQUE_XPATHS_XSLT_FILE_PATH = "get_unique_xpaths.xsl"
 XSLT_PREFIX_RESULT = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
 
+INCLUDE_VALUES_BY_ATTRIBUTES_NAMES = {"schemeName", "unitCode", "listName"}  # attributes indexed as "@name=value"
+EXCLUDE_ATTRIBUTES_VALUES = {"nuts", "country", "cpv"}  # values never appended, even for the attributes above
+
 
 def index_notice_by_id(notice_id: str, mongodb_client: MongoClient):
     """
@@ -58,8 +62,54 @@ def index_notice_xslt(notice: Notice, xslt_transformer=None) -> Notice:
     return notice
 
 
-def index_notice(notice: Notice, base_xpath="") -> Notice:
+def get_all_xpath_generator(xml_content: str,
+                            remove_namespaces: bool = True,
+                            include_values_by_attribute_names: Optional[Set[str]] = None,
+                            exclude_attribute_values: Optional[Set[str]] = None
+                            ) -> Generator[str, None, None]:
+    """
+        Generate the XPath of every element and attribute found in the given XML content
+    :param xml_content: XML document as a string
+    :param remove_namespaces: if True, the "{namespace}" prefix is dropped from element names
+    :param include_values_by_attribute_names: names of the attributes yielded as "@name=value" (None: none)
+    :param exclude_attribute_values: attribute values that are never appended to an XPath (None: none)
+    :return: generator of XPaths in document order, duplicates included; for each element its attributes
+             come first, then the element itself
+    """
+    # the two filters default to None, which must behave as "empty" instead of breaking the "in" tests
+    include_names = include_values_by_attribute_names or frozenset()
+    excluded_values = exclude_attribute_values or frozenset()
+    path = []  # names of the currently open elements, root first
+    for evt, el in XMLElementTree.iterparse(StringIO(xml_content), events=('start', 'end')):
+        if evt == 'start':
+            # ElementTree reports qualified names as "{namespace}local"; keep only the local part on request
+            path.append(el.tag.rpartition('}')[2] if remove_namespaces else el.tag)
+            xpath = "/" + '/'.join(path)
+            # attribute names are used exactly as ElementTree reports them
+            for attribute_key, attribute_value in el.attrib.items():
+                if attribute_key in include_names and attribute_value not in excluded_values:
+                    yield f"{xpath}@{attribute_key}={attribute_value}"
+                else:
+                    yield f"{xpath}@{attribute_key}"
+            yield xpath
+        else:
+            # 'end' event: the element is closed, step back to its parent
+            path.pop()
+
+
+def index_eforms_notice(notice: Notice) -> Notice:
+    """Attach to the notice an XMLMetadata holding the unique XPaths of its XML manifestation; return the notice."""
+    xpaths_stream = get_all_xpath_generator(xml_content=notice.xml_manifestation.object_data, remove_namespaces=True,
+                                            include_values_by_attribute_names=INCLUDE_VALUES_BY_ATTRIBUTES_NAMES,
+                                            exclude_attribute_values=EXCLUDE_ATTRIBUTES_VALUES)
+    distinct_xpaths = list(set(xpaths_stream))  # the set drops the XPaths met more than once
+    notice_xml_metadata = XMLMetadata()
+    notice_xml_metadata.unique_xpaths = distinct_xpaths
+    notice.set_xml_metadata(xml_metadata=notice_xml_metadata)
+    return notice
+
 
+def index_notice(notice: Notice, base_xpath="") -> Notice:
     # To be removed later if will not be used
     # def _notice_namespaces(xml_file) -> dict:
     #     _namespaces = dict([node for _, node in XMLElementTree.iterparse(xml_file, events=['start-ns'])])
@@ -229,7 +279,8 @@ def get_unique_xpaths_covered_by_notices(notice_ids: List[str], mongodb_client:
     """
     notice_repository = NoticeRepository(mongodb_client=mongodb_client)
     results = notice_repository.xml_metadata_repository.collection.aggregate([{"$match": {"ted_id": {"$in": notice_ids},
-                                                                                          "metadata_type": {"$eq":"xml"}
+                                                                                          "metadata_type": {
+                                                                                              "$eq": "xml"}
                                                                                           }
                                                                                }], allowDiskUse=True)
     unique_xpaths = set()

diff --git a/ted_sws/notice_fetcher/adapters/ted_api.py b/ted_sws/notice_fetcher/adapters/ted_api.py
@@ -7,7 +7,7 @@
 import requests
 
 from ted_sws import config
-from ted_sws.event_manager.services.log import log_warning
+from ted_sws.event_manager.services.log import log_error
 from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI
 
 DOCUMENTS_PER_PAGE = 100
@@ -103,18 +103,11 @@ def _retrieve_document_content(self, document_content: dict) -> str:
         :return:str '
         """
         xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
-        language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
-        if language_key not in xml_links.keys():
-            if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links.keys():
-                language_key = ENGLISH_LANGUAGE_CONTENT_KEY
-            else:
-                language_key = xml_links.keys()[0]
-
-            log_warning(
-                f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {document_content[DOCUMENT_NOTICE_ID_KEY]},"
-                f" and will be used language key {language_key}!")
-
-        xml_document_content_link = xml_links[language_key]
+        if MULTIPLE_LANGUAGE_CONTENT_KEY not in xml_links.keys():
+            exception_message = f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {document_content[DOCUMENT_NOTICE_ID_KEY]}"
+            log_error(exception_message)
+            raise Exception(exception_message)
+        xml_document_content_link = xml_links[MULTIPLE_LANGUAGE_CONTENT_KEY]
         response = requests.get(xml_document_content_link)
         try_again_request_count = 0
         while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:

diff --git a/tests/e2e/data_sampler/test_unique_xpaths_from_xml.py b/tests/e2e/data_sampler/test_unique_xpaths_from_xml.py
@@ -4,8 +4,26 @@
 from ted_sws.data_sampler.services.notice_xml_indexer import index_notice, index_notice_xslt, index_notice_by_id, \
     get_unique_xpaths_from_notice_repository, get_unique_notice_id_from_notice_repository, \
     get_minimal_set_of_notices_for_coverage_xpaths, get_minimal_set_of_xpaths_for_coverage_notices, \
-    get_unique_notices_id_covered_by_xpaths, get_unique_xpaths_covered_by_notices, get_most_representative_notices
-
+    get_unique_notices_id_covered_by_xpaths, get_unique_xpaths_covered_by_notices, get_most_representative_notices, \
+    index_eforms_notice
+
+
+def test_index_eforms_notice(eform_notice_622690):
+    assert eform_notice_622690.xml_metadata is None
+    indexed_notice = index_eforms_notice(eform_notice_622690)
+    assert indexed_notice is not None
+    assert indexed_notice.xml_metadata is not None
+    unique_xpaths = indexed_notice.xml_metadata.unique_xpaths
+    assert unique_xpaths is not None
+    assert len(unique_xpaths) == 218
+    all_xpaths_as_str = "\n".join(unique_xpaths)
+    # values of the selected attributes must show up in the indexed XPaths ...
+    for expected_fragment in ("@listName=esubmission", "@listName=einvoicing", "@listName=eu-funded",
+                              "@schemeName=organization", "@unitCode=DAY"):
+        assert expected_fragment in all_xpaths_as_str
+    # ... while the excluded values must not
+    for unexpected_fragment in ("@listName=cpv", "@listName=country", "@listName=nuts"):
+        assert unexpected_fragment not in all_xpaths_as_str
 
 def test_index_notice(notice_2016):
     result_notice = index_notice(notice=notice_2016)

diff --git a/tests/test_data/__init__.py b/tests/test_data/__init__.py
@@ -0,0 +1,4 @@
+import pathlib
+
+EFORMS_SAMPLE_FILE_PATH = pathlib.Path(  # sample XML file under this package's eforms_samples directory
+    __file__).parent.resolve() / "eforms_samples" / "eforms_sdk_v1.3" / "eform_subtype_12" / "295399-2023.xml"