integrate TED-API v3 and add eForms sample data

OP-TED · Feb 9, 2024 · 5636c54 · 5636c54
1 parent 44b8a0b
commit 5636c54
Show file tree

Hide file tree

Showing 79 changed files with 24,148 additions and 90 deletions.
diff --git a/ted_sws/core/model/metadata.py b/ted_sws/core/model/metadata.py
@@ -7,9 +7,9 @@
 
 """ """
 from enum import Enum
-from typing import List, Optional
+from typing import List, Optional, Union
 
-from pydantic import Field
+from pydantic import Field, validator
 from pydantic.annotated_types import NamedTuple
 
 from ted_sws.core.model import PropertyBaseModel
@@ -124,39 +124,14 @@ class NormalisedMetadataView(Metadata):
     eform_sdk_version: Optional[str]
 
 
-
 class TEDMetadata(Metadata):
     """
         Stores notice original metadata
     """
-    AA: List[str] = None
-    AC: str = None
-    CY: List[str] = None
-    DD: str = None
-    DI: str = None
-    DS: str = None
-    DT: List[str] = None
-    MA: List[str] = None
-    NC: List[str] = None
-    ND: str = None
-    NL: str = None
-    OC: List[str] = None
-    OJ: str = None
-    OL: str = None
-    OY: List[str] = None
-    PC: List[str] = None
-    PD: str = None
-    PR: str = None
-    RC: List[str] = None
-    RN: List[str] = None
-    RP: str = None
-    TD: str = None
-    TVH: str = None
-    TVL: str = None
-    TY: str = None
-    award_criterion_type: str = Field(default=None, alias='award-criterion-type')
-    corporate_body: List[str] = Field(default=None, alias='corporate-body')
-    funding: List[str] = None
-    notice_identifier: str = Field(default=None, alias='notice-identifier')
-    notice_type: str = Field(default=None, alias='notice-type')
-    notice_version: str = Field(default=None, alias='notice-version')
+    ND: Optional[str] = None
+    PD: Optional[str] = None
+    # ------------------------------------------------------------------
+    # Note: TED-API v3 returns this field as a str; earlier API versions returned a list of str.
+    # ------------------------------------------------------------------
+    RN: Optional[Union[List[str], str]] = None
+    # ------------------------------------------------------------------
diff --git a/ted_sws/master_data_registry/services/entity_deduplication.py b/ted_sws/master_data_registry/services/entity_deduplication.py
@@ -239,7 +239,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
     notice_families = defaultdict(list)
     for notice in notices:
         if notice.original_metadata and notice.original_metadata.RN:
-            parent_notice_id = notice.original_metadata.RN[0]
+            parent_notice_id_field = notice.original_metadata.RN
+            # ------------------------------------------------------------------
+            # Note: This logic keeps backward compatibility with the old TED-API data format, where RN was a list.
+            # ------------------------------------------------------------------
+            if isinstance(parent_notice_id_field, list):
+                parent_notice_id_field = parent_notice_id_field[0]
+            # ------------------------------------------------------------------
+            parent_notice_id = parent_notice_id_field
             parent_notice_id = f"{parent_notice_id[4:]}-{parent_notice_id[:4]}"
             notice_families[parent_notice_id].append(notice)
 

diff --git a/ted_sws/notice_fetcher/adapters/ted_api.py b/ted_sws/notice_fetcher/adapters/ted_api.py
@@ -1,33 +1,31 @@
-import base64
 import json
+import pathlib
 from datetime import date
-from typing import List
+from typing import List, Generator
 
 import requests
 
 from ted_sws import config
+from ted_sws.event_manager.services.log import log_warning
 from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI
 
-DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize": 100,
-                                     "pageNum": 1,
-                                     "scope": 3
+# Paging and scope parameters merged into every search query sent by the adapter.
+DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit": 100,
+                                     "page": 1,
+                                     "scope": "ALL",
                                      }
 
-DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
-                                                  "DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
-                                                  "PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
-                                                  "CONTENT",
-                                                  # INFO: This query result fields is not supported correctly by TED-API.
-                                                  #"notice-type", "award-criterion-type", "corporate-body",
-                                                  #"funding", "notice-identifier", "notice-version"
-                                                  ]}
-
-TOTAL_DOCUMENTS_NUMBER = "total"
-RESPONSE_RESULTS = "results"
+# Notice fields requested when the caller does not supply its own result_fields.
+DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["ND", "PD", "RN"]}
+
+# Keys read from the search response body, plus the key ("content") under which
+# the adapter stores the downloaded XML text in each notice.
+TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
+RESPONSE_RESULTS = "notices"
 DOCUMENT_CONTENT = "content"
-RESULT_PAGE_NUMBER = "pageNum"
+RESULT_PAGE_NUMBER = "page"
 TED_API_FIELDS = "fields"
-DOCUMENT_CONTENT_FIELD = "CONTENT"
+# Keys used to locate the XML download link of a notice, by language, and the
+# notice field quoted in the warning when the preferred language is missing.
+LINKS_TO_CONTENT_KEY = "links"
+XML_CONTENT_KEY = "xml"
+MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
+ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
+DOCUMENT_NOTICE_ID_KEY = "ND"
 
 
 class TedRequestAPI(RequestAPI):
@@ -40,13 +38,12 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
             :return: dict
         """
 
-        response = requests.get(api_url, params=api_query)
+        response = requests.post(api_url, json=api_query)
         if response.ok:
             response_content = json.loads(response.text)
             return response_content
         else:
-            raise Exception(f"The TED-API call failed with: {response}")
-
+            raise Exception(f"The TED-API call failed with: {response}, {response.content}, {api_url}")
 
 
 class TedAPIAdapter(TedAPIAdapterABC):
@@ -71,7 +68,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
         :return: List[str]
         """
 
-        query = {"q": f"PD=[{wildcard_date}]"}
+        query = {"query": f"PD={wildcard_date}"}
 
         return self.get_by_query(query=query)
 
@@ -83,48 +80,82 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
         :return:List[str]
         """
 
-        date_filter = f">={start_date.strftime('%Y%m%d')} AND <={end_date.strftime('%Y%m%d')}"
+        date_filter = f"PD>={start_date.strftime('%Y%m%d')} AND PD<={end_date.strftime('%Y%m%d')}"
 
-        query = {"q": f"PD=[{date_filter}]"}
+        query = {"query": date_filter}
 
         return self.get_by_query(query=query)
 
-    def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
+    def _retrieve_document_content(self, document_content: dict) -> str:
+        """
+        Method to download the XML content of a notice returned by the TED-API
+        :param document_content: notice entry of the search response; it must hold the
+            "links" -> "xml" mapping from language key to download URL
+        :return: str, the text of the downloaded XML document
+        :raises Exception: when the notice has no XML link or the download fails
+        """
+        xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
+        # The notice ID is only used in messages, so a missing ND field must not raise.
+        notice_id = document_content.get(DOCUMENT_NOTICE_ID_KEY)
+        language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
+        if language_key not in xml_links:
+            if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
+                language_key = ENGLISH_LANGUAGE_CONTENT_KEY
+            elif xml_links:
+                # dict views are not subscriptable in Python 3: take the first key via the iterator.
+                language_key = next(iter(xml_links))
+            else:
+                raise Exception(f"The notice content can't be loaded!: no XML links found in {notice_id}")
+
+            log_warning(
+                f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {notice_id},"
+                f" and will be used language key {language_key}!")
+
+        xml_document_content_link = xml_links[language_key]
+        response = requests.get(xml_document_content_link)
+
+        if response.ok:
+            return response.text
+        else:
+            raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")
+
+    def get_generator_by_query(self, query: dict, result_fields: dict = None) -> Generator[dict, None, None]:
         """
         Method to get a documents content by passing a query to the API (json)
+        Result pages are requested one at a time; every document is yielded once, with its
+        XML text stored under "content" and its "links" entry removed.
+        Note: the paging and fields parameters are written into the given query dict.
         :param query:
         :param result_fields:
-        :return:List[str]
+        :return:Generator[dict]
         """
         query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
         query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)
         response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
-
         documents_number = response_body[TOTAL_DOCUMENTS_NUMBER]
         result_pages = 1 + int(documents_number) // 100
         documents_content = response_body[RESPONSE_RESULTS]
-
-        for page_number in range(2, result_pages + 1):
-            query[RESULT_PAGE_NUMBER] = page_number
-            response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
-            documents_content += response_body[RESPONSE_RESULTS]
-        if DOCUMENT_CONTENT_FIELD in query[TED_API_FIELDS]:
-            decoded_documents_content = []
-            for document_content in documents_content:
-                document_content[DOCUMENT_CONTENT] = base64.b64decode(document_content[DOCUMENT_CONTENT]).decode(
-                    encoding="utf-8")
-                decoded_documents_content.append(document_content)
-            return decoded_documents_content
-        else:
-            return documents_content
+        for page_number in range(1, result_pages + 1):
+            if page_number > 1:
+                # The first page is already loaded above; only later pages need a new request.
+                query[RESULT_PAGE_NUMBER] = page_number
+                response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
+                # Replace (do not extend) the working list: documents already yielded have
+                # lost their "links" entry and would raise KeyError if processed again.
+                documents_content = response_body[RESPONSE_RESULTS]
+            for document_content in documents_content:
+                document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
+                del document_content[LINKS_TO_CONTENT_KEY]
+                yield document_content
+
+    def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
+        """
+        Method to collect into a list every document produced by get_generator_by_query
+        :param query:
+        :param result_fields:
+        :return:List[dict]
+        """
+        documents_generator = self.get_generator_by_query(query=query, result_fields=result_fields)
+        return list(documents_generator)
 
     def get_by_id(self, document_id: str) -> dict:
         """
         Method to get a document content by passing an ID
+        The ID is matched against the ND field and the first document of the result is returned
         :param document_id:
-        :return: str
+        :return: dict
+        :raises IndexError: when the query returns no documents
         """
 
-        query = {"q": f"ND=[{document_id}]"}
+        query = {"query": f"ND={document_id}"}
 
         return self.get_by_query(query=query)[0]
diff --git a/tests/e2e/notice_fetcher/_test_generate_eforms_sample_dataset.py b/tests/e2e/notice_fetcher/_test_generate_eforms_sample_dataset.py
@@ -0,0 +1,31 @@
+import pathlib
+
+# Query template; its two placeholders are filled once per combination of
+# notice subtype and SDK version.
+TED_API_EFORMS_QUERY = """
+TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND
+notice-subtype IN ({eforms_subtype}) AND
+FT~"eforms-sdk-{eforms_sdk_version}"
+"""
+
+# Notice subtypes for which a sample is requested.
+EFORMS_SUBTYPES = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+# SDK versions for which a sample is requested: "1.3" through "1.10".
+EFORMS_SDK_VERSIONS = [f"1.{version}" for version in range(3, 11)]
+
+
+def _test_generate_eforms_sample_dataset(ted_document_search):
+    """
+    Store one sample notice XML per SDK version and notice subtype under "eforms_samples"
+    next to this file; combinations for which the query yields no notice are left out.
+    :param ted_document_search: adapter exposing get_generator_by_query
+    """
+    samples_root = pathlib.Path(__file__).parent / "eforms_samples"
+
+    for sdk_version in EFORMS_SDK_VERSIONS:
+        sdk_dir = samples_root / f"eforms_sdk_v{sdk_version}"
+        for subtype in EFORMS_SUBTYPES:
+            target_dir = sdk_dir / f"eform_subtype_{subtype}"
+            print(f"Load for {target_dir}")
+
+            query_text = TED_API_EFORMS_QUERY.format(eforms_sdk_version=sdk_version, eforms_subtype=subtype)
+            query = {"query": query_text}
+            print(query)
+
+            notice_stream = ted_document_search.get_generator_by_query(query=query)
+            # The range bounds set how many samples are stored per combination.
+            for _ in range(1, 2):
+                notice = next(notice_stream, None)
+                if notice is None:
+                    break
+                # The directory is only created once there is something to store in it.
+                target_dir.mkdir(parents=True, exist_ok=True)
+                sample_path = target_dir / f"{notice['ND']}.xml"
+                sample_path.write_text(notice["content"], encoding="utf-8")
diff --git a/tests/e2e/notice_fetcher/test_notice_fetcher.py b/tests/e2e/notice_fetcher/test_notice_fetcher.py
@@ -4,10 +4,11 @@
 
 
 def test_notice_fetcher_by_identifier(notice_repository, ted_document_search):
-    document_id = "067623-2022"
+    document_id = "67623-2022"
     NoticeFetcher(notice_repository=notice_repository, ted_api_adapter=ted_document_search).fetch_notice_by_id(
         document_id=document_id)
     notice = notice_repository.get(reference=document_id)
+    assert notice is not None
     assert isinstance(notice, Notice)
     assert notice
     assert notice.original_metadata
@@ -18,7 +19,7 @@ def test_notice_fetcher_by_identifier(notice_repository, ted_document_search):
 
 
 def test_notice_fetcher_by_search_query(notice_repository, ted_document_search):
-    query = {"q": "ND=[67623-2022]"}
+    query = {"query": "ND=67623-2022"}
 
     NoticeFetcher(notice_repository=notice_repository, ted_api_adapter=ted_document_search).fetch_notices_by_query(
         query=query)

diff --git a/tests/e2e/notice_fetcher/test_ted_request_api.py b/tests/e2e/notice_fetcher/test_ted_request_api.py
@@ -6,7 +6,7 @@
 
 def test_ted_request_api():
     ted_api_request = TedRequestAPI()
-    notice_by_query = ted_api_request(api_url=config.TED_API_URL, api_query={"q": "ND=[67623-2022]"})
+    notice_by_query = ted_api_request(api_url=config.TED_API_URL, api_query={"query": "ND=[67623-2022]"})
     assert notice_by_query
     assert isinstance(notice_by_query, dict)
     with pytest.raises(Exception) as e:

diff --git a/tests/fakes/fake_ted_api.py b/tests/fakes/fake_ted_api.py
@@ -8,7 +8,7 @@
 
 
 def get_fake_api_response() -> dict:
+    """
+    Load the stored TED-API response that the fake adapter serves
+    :return: dict parsed from the JSON test data file
+    """
-    path = TEST_DATA_PATH / "notices" / "2021-OJS237-623049.json"
+    path = TEST_DATA_PATH / "notice_fetcher" / "ted_api_response" / "ted_api_response.json"
-    return json.loads(path.read_text())
+    # Read as UTF-8 explicitly so the result does not depend on the platform default encoding.
+    return json.loads(path.read_text(encoding="utf-8"))
 
 
@@ -38,15 +38,15 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
         :param wildcard_date:
         :return:
         """
-        return [notice_data for notice_data in get_fake_api_response()["results"]]
+        return [notice_data for notice_data in get_fake_api_response()["notices"]]
 
     def get_by_id(self, document_id: str) -> dict:
         """
 
+        Return the first notice of the stored API response, whatever ID is requested
         :param document_id:
         :return:
         """
-        return get_fake_api_response()["results"][0]
+        fake_notices = get_fake_api_response()["notices"]
+        return fake_notices[0]
 
     def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
         """
@@ -55,7 +55,7 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
         :param end_date:
         :return:
         """
-        return [notice_data for notice_data in get_fake_api_response()["results"]]
+        return [notice_data for notice_data in get_fake_api_response()["notices"]]
 
     def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
         """
@@ -64,4 +64,4 @@ def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
         :param result_fields:
         :return:
         """
-        return [notice_data for notice_data in get_fake_api_response()["results"]]
+        return [notice_data for notice_data in get_fake_api_response()["notices"]]