Skip to content

Commit

Permalink
integrate TED-API v3 and add eForms sample data
Browse files Browse the repository at this point in the history
  • Loading branch information
CaptainOfHacks committed Feb 9, 2024
1 parent 44b8a0b commit 5636c54
Show file tree
Hide file tree
Showing 79 changed files with 24,148 additions and 90 deletions.
43 changes: 9 additions & 34 deletions ted_sws/core/model/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

""" """
from enum import Enum
from typing import List, Optional
from typing import List, Optional, Union

from pydantic import Field
from pydantic import Field, validator
from pydantic.annotated_types import NamedTuple

from ted_sws.core.model import PropertyBaseModel
Expand Down Expand Up @@ -124,39 +124,14 @@ class NormalisedMetadataView(Metadata):
eform_sdk_version: Optional[str]



class TEDMetadata(Metadata):
"""
Stores notice original metadata
"""
AA: List[str] = None
AC: str = None
CY: List[str] = None
DD: str = None
DI: str = None
DS: str = None
DT: List[str] = None
MA: List[str] = None
NC: List[str] = None
ND: str = None
NL: str = None
OC: List[str] = None
OJ: str = None
OL: str = None
OY: List[str] = None
PC: List[str] = None
PD: str = None
PR: str = None
RC: List[str] = None
RN: List[str] = None
RP: str = None
TD: str = None
TVH: str = None
TVL: str = None
TY: str = None
award_criterion_type: str = Field(default=None, alias='award-criterion-type')
corporate_body: List[str] = Field(default=None, alias='corporate-body')
funding: List[str] = None
notice_identifier: str = Field(default=None, alias='notice-identifier')
notice_type: str = Field(default=None, alias='notice-type')
notice_version: str = Field(default=None, alias='notice-version')
ND: Optional[str] = None
PD: Optional[str] = None
# ------------------------------------------------------------------
# Note: In TED-API v3 this field is a str; in the past it was a list.
# ------------------------------------------------------------------
RN: Optional[Union[List[str], str]] = None
# ------------------------------------------------------------------
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
notice_families = defaultdict(list)
for notice in notices:
if notice.original_metadata and notice.original_metadata.RN:
parent_notice_id = notice.original_metadata.RN[0]
parent_notice_id_field = notice.original_metadata.RN
# ------------------------------------------------------------------
# Note: This logic was added to stay backward compatible with the old TED-API data format.
# ------------------------------------------------------------------
if isinstance(parent_notice_id_field, list):
parent_notice_id_field = parent_notice_id_field[0]
# ------------------------------------------------------------------
parent_notice_id = parent_notice_id_field
parent_notice_id = f"{parent_notice_id[4:]}-{parent_notice_id[:4]}"
notice_families[parent_notice_id].append(notice)

Expand Down
115 changes: 73 additions & 42 deletions ted_sws/notice_fetcher/adapters/ted_api.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,31 @@
import base64
import json
import pathlib
from datetime import date
from typing import List
from typing import List, Generator

import requests

from ted_sws import config
from ted_sws.event_manager.services.log import log_warning
from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI

DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize": 100,
"pageNum": 1,
"scope": 3
DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit": 100,
"page": 1,
"scope": "ALL",
}

DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
"DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
"PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
"CONTENT",
# INFO: This query result fields is not supported correctly by TED-API.
#"notice-type", "award-criterion-type", "corporate-body",
#"funding", "notice-identifier", "notice-version"
]}

TOTAL_DOCUMENTS_NUMBER = "total"
RESPONSE_RESULTS = "results"
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["ND", "PD", "RN"]}

TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
RESPONSE_RESULTS = "notices"
DOCUMENT_CONTENT = "content"
RESULT_PAGE_NUMBER = "pageNum"
RESULT_PAGE_NUMBER = "page"
TED_API_FIELDS = "fields"
DOCUMENT_CONTENT_FIELD = "CONTENT"
LINKS_TO_CONTENT_KEY = "links"
XML_CONTENT_KEY = "xml"
MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
DOCUMENT_NOTICE_ID_KEY = "ND"


class TedRequestAPI(RequestAPI):
Expand All @@ -40,13 +38,12 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
:return: dict
"""

response = requests.get(api_url, params=api_query)
response = requests.post(api_url, json=api_query)
if response.ok:
response_content = json.loads(response.text)
return response_content
else:
raise Exception(f"The TED-API call failed with: {response}")

raise Exception(f"The TED-API call failed with: {response}, {response.content}, {api_url}")


class TedAPIAdapter(TedAPIAdapterABC):
Expand All @@ -71,7 +68,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
:return: List[str]
"""

query = {"q": f"PD=[{wildcard_date}]"}
query = {"query": f"PD={wildcard_date}"}

return self.get_by_query(query=query)

Expand All @@ -83,48 +80,82 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
:return:List[str]
"""

date_filter = f">={start_date.strftime('%Y%m%d')} AND <={end_date.strftime('%Y%m%d')}"
date_filter = f"PD>={start_date.strftime('%Y%m%d')} AND PD<={end_date.strftime('%Y%m%d')}"

query = {"q": f"PD=[{date_filter}]"}
query = {"query": date_filter}

return self.get_by_query(query=query)

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
def _retrieve_document_content(self, document_content: dict) -> str:
    """
    Method to retrieve a document content from the TED-API.

    The XML link is selected by language key: the multi-language entry is preferred,
    then the English entry, then the first entry that is available.
    :param document_content: document record holding the XML links under
        LINKS_TO_CONTENT_KEY / XML_CONTENT_KEY
    :return: the XML content of the document as text
    :raises Exception: when the record has no XML link or the download fails
    """
    xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
    notice_id = document_content.get(DOCUMENT_NOTICE_ID_KEY)
    language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
    if language_key not in xml_links:
        if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
            language_key = ENGLISH_LANGUAGE_CONTENT_KEY
        elif xml_links:
            # Dict views are not subscriptable in Python 3, so the first key is
            # taken through the iterator instead of `keys()[0]`.
            language_key = next(iter(xml_links))
        else:
            raise Exception(f"The notice content can't be loaded!: no XML links found for {notice_id}")

        log_warning(
            f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {notice_id},"
            f" and will be used language key {language_key}!")

    xml_document_content_link = xml_links[language_key]
    response = requests.get(xml_document_content_link)

    if response.ok:
        return response.text
    raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")

def get_generator_by_query(self, query: dict, result_fields: dict = None) -> Generator[dict, None, None]:
    """
    Method to lazily get documents content by passing a query to the API (json)

    Result pages are requested one at a time, and the content of a document is
    downloaded only when the consumer asks for that document.
    :param query: API query; it is copied, so the caller's dict is not modified
    :param result_fields: fields to request; DEFAULT_TED_API_QUERY_RESULT_FIELDS when omitted
    :return:Generator[dict]
    """
    # Work on a copy so that paging state never leaks into the caller's dict.
    api_query = dict(query)
    api_query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
    api_query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)
    page_size = int(api_query["limit"])

    response_body = self.request_api(api_url=self.ted_api_url, api_query=api_query)
    documents_number = int(response_body[TOTAL_DOCUMENTS_NUMBER])
    # Ceiling division: a total that is an exact multiple of the page size
    # must not trigger a request for an extra, empty page.
    result_pages = -(-documents_number // page_size)

    page_number = 1
    while True:
        for document_content in response_body[RESPONSE_RESULTS]:
            document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
            del document_content[LINKS_TO_CONTENT_KEY]
            yield document_content
        page_number += 1
        if page_number > result_pages:
            break
        api_query[RESULT_PAGE_NUMBER] = page_number
        response_body = self.request_api(api_url=self.ted_api_url, api_query=api_query)

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
    """
    Collect every document produced by get_generator_by_query into a list.
    :param query: query passed through to get_generator_by_query
    :param result_fields: result fields passed through to get_generator_by_query
    :return:List[dict]
    """
    documents = self.get_generator_by_query(query=query, result_fields=result_fields)
    return list(documents)

def get_by_id(self, document_id: str) -> dict:
    """
    Method to get a document content by passing an ID
    :param document_id: identifier placed in the `ND` term of the query
    :return: dict
    :raises IndexError: when the query returns no document for the given ID
    """

    query = {"query": f"ND={document_id}"}

    documents = self.get_by_query(query=query)
    if not documents:
        # Keep the exception type of a failed `[0]` lookup, but say which ID was missing.
        raise IndexError(f"No document found for ND={document_id}")
    return documents[0]
31 changes: 31 additions & 0 deletions tests/e2e/notice_fetcher/_test_generate_eforms_sample_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pathlib

# Query template: `{eforms_subtype}` and `{eforms_sdk_version}` are filled in with str.format.
TED_API_EFORMS_QUERY = """
TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND
notice-subtype IN ({eforms_subtype}) AND
FT~"eforms-sdk-{eforms_sdk_version}"
"""

# Subtype numbers 10 through 24, one query per value.
EFORMS_SUBTYPES = list(range(10, 25))
# Version strings "1.3" through "1.10".
EFORMS_SDK_VERSIONS = ["1." + str(minor) for minor in range(3, 11)]


def _test_generate_eforms_sample_dataset(ted_document_search):
    """
    Save one sample notice XML per (SDK version, subtype) combination.

    For every pair from EFORMS_SDK_VERSIONS and EFORMS_SUBTYPES the query template is
    formatted, the first notice yielded by `get_generator_by_query` is taken and its
    content is written to `eforms_samples/eforms_sdk_v<version>/eform_subtype_<subtype>/<ND>.xml`
    next to this file. Combinations that yield no notice create no directory.
    :param ted_document_search: object exposing `get_generator_by_query(query=...)`
    """
    samples_root = pathlib.Path(__file__).parent / "eforms_samples"

    for sdk_version in EFORMS_SDK_VERSIONS:
        for subtype in EFORMS_SUBTYPES:
            target_dir = samples_root / f"eforms_sdk_v{sdk_version}" / f"eform_subtype_{subtype}"

            print(f"Load for {target_dir}")
            search_expression = TED_API_EFORMS_QUERY.format(eforms_sdk_version=sdk_version,
                                                            eforms_subtype=subtype)
            query = {"query": search_expression}
            print(query)

            found_notices = ted_document_search.get_generator_by_query(query=query)
            # Only the first notice of each combination is kept as a sample.
            sample_notice = next(found_notices, None)
            if sample_notice is None:
                continue
            target_dir.mkdir(parents=True, exist_ok=True)
            sample_path = target_dir / f"{sample_notice['ND']}.xml"
            sample_path.write_text(sample_notice["content"], encoding="utf-8")
5 changes: 3 additions & 2 deletions tests/e2e/notice_fetcher/test_notice_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@


def test_notice_fetcher_by_identifier(notice_repository, ted_document_search):
document_id = "067623-2022"
document_id = "67623-2022"
NoticeFetcher(notice_repository=notice_repository, ted_api_adapter=ted_document_search).fetch_notice_by_id(
document_id=document_id)
notice = notice_repository.get(reference=document_id)
assert notice is not None
assert isinstance(notice, Notice)
assert notice
assert notice.original_metadata
Expand All @@ -18,7 +19,7 @@ def test_notice_fetcher_by_identifier(notice_repository, ted_document_search):


def test_notice_fetcher_by_search_query(notice_repository, ted_document_search):
query = {"q": "ND=[67623-2022]"}
query = {"query": "ND=67623-2022"}

NoticeFetcher(notice_repository=notice_repository, ted_api_adapter=ted_document_search).fetch_notices_by_query(
query=query)
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/notice_fetcher/test_ted_request_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_ted_request_api():
ted_api_request = TedRequestAPI()
notice_by_query = ted_api_request(api_url=config.TED_API_URL, api_query={"q": "ND=[67623-2022]"})
notice_by_query = ted_api_request(api_url=config.TED_API_URL, api_query={"query": "ND=[67623-2022]"})
assert notice_by_query
assert isinstance(notice_by_query, dict)
with pytest.raises(Exception) as e:
Expand Down
10 changes: 5 additions & 5 deletions tests/fakes/fake_ted_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def get_fake_api_response() -> dict:
path = TEST_DATA_PATH / "notices" / "2021-OJS237-623049.json"
path = TEST_DATA_PATH / "notice_fetcher" / "ted_api_response" / "ted_api_response.json"
return json.loads(path.read_text())


Expand Down Expand Up @@ -38,15 +38,15 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
:param wildcard_date:
:return:
"""
return [notice_data for notice_data in get_fake_api_response()["results"]]
return [notice_data for notice_data in get_fake_api_response()["notices"]]

def get_by_id(self, document_id: str) -> dict:
"""
:param document_id:
:return:
"""
return get_fake_api_response()["results"][0]
return get_fake_api_response()["notices"][0]

def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
"""
Expand All @@ -55,7 +55,7 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
:param end_date:
:return:
"""
return [notice_data for notice_data in get_fake_api_response()["results"]]
return [notice_data for notice_data in get_fake_api_response()["notices"]]

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
"""
Expand All @@ -64,4 +64,4 @@ def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
:param result_fields:
:return:
"""
return [notice_data for notice_data in get_fake_api_response()["results"]]
return [notice_data for notice_data in get_fake_api_response()["notices"]]
Loading

0 comments on commit 5636c54

Please sign in to comment.