Skip to content

Commit

Permalink
Merge pull request #518 from OP-TED/feature/TED4-101
Browse files Browse the repository at this point in the history
integrate TED-API v3 and add eForms sample data
  • Loading branch information
CaptainOfHacks authored Feb 14, 2024
2 parents 44b8a0b + 7d8774d commit 4ced645
Show file tree
Hide file tree
Showing 94 changed files with 24,249 additions and 124 deletions.
43 changes: 9 additions & 34 deletions ted_sws/core/model/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

""" """
from enum import Enum
from typing import List, Optional
from typing import List, Optional, Union

from pydantic import Field
from pydantic import Field, validator
from pydantic.annotated_types import NamedTuple

from ted_sws.core.model import PropertyBaseModel
Expand Down Expand Up @@ -124,39 +124,14 @@ class NormalisedMetadataView(Metadata):
eform_sdk_version: Optional[str]



class TEDMetadata(Metadata):
"""
Stores notice original metadata
"""
AA: List[str] = None
AC: str = None
CY: List[str] = None
DD: str = None
DI: str = None
DS: str = None
DT: List[str] = None
MA: List[str] = None
NC: List[str] = None
ND: str = None
NL: str = None
OC: List[str] = None
OJ: str = None
OL: str = None
OY: List[str] = None
PC: List[str] = None
PD: str = None
PR: str = None
RC: List[str] = None
RN: List[str] = None
RP: str = None
TD: str = None
TVH: str = None
TVL: str = None
TY: str = None
award_criterion_type: str = Field(default=None, alias='award-criterion-type')
corporate_body: List[str] = Field(default=None, alias='corporate-body')
funding: List[str] = None
notice_identifier: str = Field(default=None, alias='notice-identifier')
notice_type: str = Field(default=None, alias='notice-type')
notice_version: str = Field(default=None, alias='notice-version')
ND: Optional[str] = None
PD: Optional[str] = None
# ------------------------------------------------------------------
# Note: In TED-API v3 this field is a str; in earlier API versions it was a list.
# ------------------------------------------------------------------
RN: Optional[Union[List[str], str]] = None
# ------------------------------------------------------------------
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
notice_families = defaultdict(list)
for notice in notices:
if notice.original_metadata and notice.original_metadata.RN:
parent_notice_id = notice.original_metadata.RN[0]
parent_notice_id_field = notice.original_metadata.RN
# ------------------------------------------------------------------
# Note: This logic is kept for backward compatibility with the old TED-API data format.
# ------------------------------------------------------------------
if isinstance(parent_notice_id_field, list):
parent_notice_id_field = parent_notice_id_field[0]
# ------------------------------------------------------------------
parent_notice_id = parent_notice_id_field
parent_notice_id = f"{parent_notice_id[4:]}-{parent_notice_id[:4]}"
notice_families[parent_notice_id].append(notice)

Expand Down
133 changes: 93 additions & 40 deletions ted_sws/notice_fetcher/adapters/ted_api.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,34 @@
import base64
import json
import time
from datetime import date
from typing import List
from http import HTTPStatus
from typing import List, Generator

import requests

from ted_sws import config
from ted_sws.event_manager.services.log import log_warning
from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI

DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize": 100,
"pageNum": 1,
"scope": 3
DOCUMENTS_PER_PAGE = 100

DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit": DOCUMENTS_PER_PAGE,
"page": 1,
"scope": "ALL",
}

DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
"DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
"PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
"CONTENT",
# INFO: This query result fields is not supported correctly by TED-API.
#"notice-type", "award-criterion-type", "corporate-body",
#"funding", "notice-identifier", "notice-version"
]}

TOTAL_DOCUMENTS_NUMBER = "total"
RESPONSE_RESULTS = "results"
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["ND", "PD", "RN"]}

TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
RESPONSE_RESULTS = "notices"
DOCUMENT_CONTENT = "content"
RESULT_PAGE_NUMBER = "pageNum"
RESULT_PAGE_NUMBER = "page"
TED_API_FIELDS = "fields"
DOCUMENT_CONTENT_FIELD = "CONTENT"
LINKS_TO_CONTENT_KEY = "links"
XML_CONTENT_KEY = "xml"
MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
DOCUMENT_NOTICE_ID_KEY = "ND"


class TedRequestAPI(RequestAPI):
Expand All @@ -40,15 +41,21 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
:return: dict
"""

response = requests.get(api_url, params=api_query)
response = requests.post(api_url, json=api_query)
try_again_request_count = 0
while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
try_again_request_count += 1
time.sleep(try_again_request_count * 0.1)
response = requests.post(api_url, json=api_query)
if try_again_request_count > 5:
break
if response.ok:
response_content = json.loads(response.text)
return response_content
else:
raise Exception(f"The TED-API call failed with: {response}")



class TedAPIAdapter(TedAPIAdapterABC):
"""
This class will fetch documents content
Expand All @@ -71,7 +78,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
:return: List[str]
"""

query = {"q": f"PD=[{wildcard_date}]"}
query = {"query": f"PD={wildcard_date}"}

return self.get_by_query(query=query)

Expand All @@ -83,48 +90,94 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
:return:List[str]
"""

date_filter = f">={start_date.strftime('%Y%m%d')} AND <={end_date.strftime('%Y%m%d')}"
date_filter = f"PD>={start_date.strftime('%Y%m%d')} AND PD<={end_date.strftime('%Y%m%d')}"

query = {"q": f"PD=[{date_filter}]"}
query = {"query": date_filter}

return self.get_by_query(query=query)

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
def _retrieve_document_content(self, document_content: dict) -> str:
    """
    Download the XML content of a single notice returned by the TED-API.

    The language variant is picked in this order: the multilingual ("MUL") link,
    then the English ("ENG") link, then the first link that is available.
    A warning is logged whenever the multilingual link is missing.

    :param document_content: one notice entry of the API response; the XML links are
        read from ``document_content["links"]["xml"]`` and the notice id from "ND"
    :return: the body of the downloaded XML document as text
    :raises Exception: if the notice carries no XML link at all, or if the download
        does not end with a successful HTTP status
    """
    xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
    if not xml_links:
        # Without this guard the fallback below would have nothing to pick from.
        raise Exception(
            f"The notice content can't be loaded!: no XML links found for "
            f"{document_content.get(DOCUMENT_NOTICE_ID_KEY)}")

    language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
    if language_key not in xml_links:
        if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
            language_key = ENGLISH_LANGUAGE_CONTENT_KEY
        else:
            # dict views are not subscriptable in Python 3, so ``keys()[0]`` would raise
            # TypeError; take the first key through an iterator instead.
            language_key = next(iter(xml_links))

        log_warning(
            f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {document_content[DOCUMENT_NOTICE_ID_KEY]},"
            f" and will be used language key {language_key}!")

    xml_document_content_link = xml_links[language_key]
    response = requests.get(xml_document_content_link)
    try_again_request_count = 0
    # Retry while the server answers "429 Too Many Requests", waiting a little longer
    # before each new attempt; give up once the retry counter exceeds 5.
    while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        try_again_request_count += 1
        time.sleep(try_again_request_count * 0.1)
        response = requests.get(xml_document_content_link)
        if try_again_request_count > 5:
            break
    if response.ok:
        return response.text
    else:
        raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")

def get_generator_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> Generator[
dict, None, None]:
"""
Method to get a documents content by passing a query to the API (json)
:param query:
:param result_fields:
:return:List[str]
:param load_content:
:return:Generator[dict]
"""
query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)

documents_number = response_body[TOTAL_DOCUMENTS_NUMBER]
result_pages = 1 + int(documents_number) // 100
result_pages = 1 + int(documents_number) // DOCUMENTS_PER_PAGE
documents_content = response_body[RESPONSE_RESULTS]
if result_pages > 1:
for page_number in range(2, result_pages + 1):
query[RESULT_PAGE_NUMBER] = page_number
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
documents_content += response_body[RESPONSE_RESULTS]

for page_number in range(2, result_pages + 1):
query[RESULT_PAGE_NUMBER] = page_number
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
documents_content += response_body[RESPONSE_RESULTS]
if DOCUMENT_CONTENT_FIELD in query[TED_API_FIELDS]:
decoded_documents_content = []
for document_content in documents_content:
document_content[DOCUMENT_CONTENT] = base64.b64decode(document_content[DOCUMENT_CONTENT]).decode(
encoding="utf-8")
decoded_documents_content.append(document_content)
return decoded_documents_content
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content
else:
return documents_content
for document_content in documents_content:
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content

def get_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> List[dict]:
    """
    Run a query against the API and return all resulting documents as a list.

    This is the eager counterpart of get_generator_by_query: the arguments are passed
    through unchanged and the generator is consumed completely before returning.

    :param query: the query sent to the API (json)
    :param result_fields: forwarded to get_generator_by_query
    :param load_content: forwarded to get_generator_by_query
    :return: List[dict] with every document the generator yields
    """
    documents_generator = self.get_generator_by_query(query=query,
                                                      result_fields=result_fields,
                                                      load_content=load_content)
    documents = []
    documents.extend(documents_generator)
    return documents

def get_by_id(self, document_id: str) -> dict:
"""
Method to get a document content by passing an ID
:param document_id:
:return: str
:return: dict
"""

query = {"q": f"ND=[{document_id}]"}
query = {"query": f"ND={document_id}"}

return self.get_by_query(query=query)[0]
5 changes: 4 additions & 1 deletion ted_sws/notice_fetcher/services/notice_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@ def _create_notice(self, notice_data: dict) -> Notice:
:param notice_data:
:return:
"""
xml_manifestation = XMLManifestation(object_data=notice_data["content"])
try:
xml_manifestation = XMLManifestation(object_data=notice_data["content"])
except Exception as e:
raise Exception(str(e), notice_data)
del notice_data["content"]
ted_id = notice_data["ND"]
original_metadata = TEDMetadata(**notice_data)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def validate_and_update_daily_supra_notice(ted_publication_date: day_type, mongo
fetched_notice_ids = set(fetched_notice_ids_list)

ted_api_adapter: TedAPIAdapter = TedAPIAdapter(request_api=request_api)
query = {"q": f"PD=[{ted_publication_date.strftime('%Y%m%d*')}]"}
documents = ted_api_adapter.get_by_query(query=query, result_fields={"fields": ["ND"]})
query = {"query": f"PD={ted_publication_date.strftime('%Y%m%d*')}"}
documents = ted_api_adapter.get_by_query(query=query, result_fields={"fields": ["ND"]}, load_content=False)
api_notice_ids_list = [document["ND"] for document in documents] if documents and len(documents) else []
api_notice_ids = set(api_notice_ids_list)

Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

@pytest.fixture
def notice_id():
return "067623-2022"
return "67623-2022"


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/data_manager/test_mongodb_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_mongo_db_query_2():

def test_create_matview_for_notices(fake_mongodb_client):
notice_id = "696661-2022"
ted_api_query = {"q": f"ND=[{notice_id}]"}
ted_api_query = {"query": f"ND={notice_id}"}
mongodb_client = fake_mongodb_client
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
NoticeFetcher(notice_repository=notice_repository,
Expand Down
8 changes: 4 additions & 4 deletions tests/e2e/data_manager/test_notice_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ def test_notice_repository_create(mongodb_client):
notice_repository = NoticeRepository(mongodb_client=mongodb_client, database_name=TEST_DATABASE_NAME)
notice = Notice(ted_id=NOTICE_TED_ID)
notice.set_xml_manifestation(XMLManifestation(object_data="HELLO"))
notice.set_original_metadata(TEDMetadata(**{"AA": ["Metadata"]}))
notice.set_original_metadata(TEDMetadata(**{"RN": ["Metadata"]}))
notice_repository.add(notice)
result_notice = notice_repository.get(reference=NOTICE_TED_ID)
assert result_notice
assert result_notice.ted_id == NOTICE_TED_ID
assert result_notice.original_metadata.AA == ["Metadata"]
assert result_notice.original_metadata.RN == ["Metadata"]
result_notices = list(notice_repository.list())
assert result_notices
assert len(result_notices) == 1
notice_repository.add(notice)
notice.set_original_metadata(ted_metadata=TEDMetadata(**{"AA": ["Updated metadata"]}))
notice.set_original_metadata(ted_metadata=TEDMetadata(**{"RN": ["Updated metadata"]}))
notice_repository.update(notice)
result_notice = notice_repository.get(reference=NOTICE_TED_ID)
assert result_notice
assert result_notice.ted_id == NOTICE_TED_ID
assert result_notice.original_metadata.AA == ["Updated metadata"]
assert result_notice.original_metadata.RN == ["Updated metadata"]
mongodb_client.drop_database(TEST_DATABASE_NAME)


Expand Down
40 changes: 40 additions & 0 deletions tests/e2e/notice_fetcher/_test_generate_eforms_sample_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pathlib

TED_API_EFORMS_QUERY = """
TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND
notice-subtype IN ({eforms_subtype}) AND
FT~"eforms-sdk-{eforms_sdk_version}"
"""

EFORMS_SUBTYPES = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
EFORMS_SDK_VERSIONS = [f"1.{version}" for version in range(3, 11)]


def _test_generate_eforms_sample_dataset(ted_document_search):
    """
    Download sample eForms notices and store them as XML files next to this module.

    For every combination of EFORMS_SDK_VERSIONS and EFORMS_SUBTYPES a query is built from
    TED_API_EFORMS_QUERY; the first notice returned for it (if any) is written to
    ``eforms_samples/eforms_sdk_v<version>/eform_subtype_<subtype>/<ND>.xml``.
    Combinations without results create no directory.

    NOTE(review): the leading underscore presumably keeps pytest from collecting this
    helper automatically - confirm.

    :param ted_document_search: object exposing ``get_generator_by_query(query=...)``
    :return: None
    """
    samples_per_combination = 1
    samples_root = pathlib.Path(__file__).parent / "eforms_samples"

    for sdk_version in EFORMS_SDK_VERSIONS:
        for subtype in EFORMS_SUBTYPES:
            target_dir = samples_root / f"eforms_sdk_v{sdk_version}" / f"eform_subtype_{subtype}"
            print(f"Load for {target_dir}")

            api_query = {"query": TED_API_EFORMS_QUERY.format(eforms_sdk_version=sdk_version,
                                                              eforms_subtype=subtype)}
            print(api_query)

            notice_stream = ted_document_search.get_generator_by_query(query=api_query)
            for _ in range(samples_per_combination):
                notice = next(notice_stream, None)
                if notice is None:
                    # The query produced no (further) notices for this combination.
                    break
                target_dir.mkdir(parents=True, exist_ok=True)
                xml_path = target_dir / f"{notice['ND']}.xml"
                xml_path.write_text(notice["content"], encoding="utf-8")


def test_fetch_notice_by_id(ted_document_search):
    """
    Fetch one notice by its id and dump the result as JSON text next to this module.

    The output file is named ``epo_notice.xml`` although its content is the JSON
    serialisation of the dict returned by ``get_by_id``.

    :param ted_document_search: object exposing ``get_by_id(document_id=...)``
    :return: None
    """
    import json

    # NOTE(review): the ``notice_id`` fixture in tests/conftest.py uses "67623-2022"
    # (no leading zero) - confirm which form the API expects here.
    fetched_notice = ted_document_search.get_by_id(document_id="067623-2022")
    output_path = pathlib.Path(__file__).parent / "epo_notice.xml"
    output_path.write_text(json.dumps(fetched_notice), encoding="utf-8")

Loading

0 comments on commit 4ced645

Please sign in to comment.