Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

integrate TED-API v3 and add eForms sample data #518

Merged
merged 7 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 9 additions & 34 deletions ted_sws/core/model/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

""" """
from enum import Enum
from typing import List, Optional
from typing import List, Optional, Union

from pydantic import Field
from pydantic import Field, validator
from pydantic.annotated_types import NamedTuple

from ted_sws.core.model import PropertyBaseModel
Expand Down Expand Up @@ -124,39 +124,14 @@ class NormalisedMetadataView(Metadata):
eform_sdk_version: Optional[str]



class TEDMetadata(Metadata):
"""
Stores notice original metadata
"""
AA: List[str] = None
AC: str = None
CY: List[str] = None
DD: str = None
DI: str = None
DS: str = None
DT: List[str] = None
MA: List[str] = None
NC: List[str] = None
ND: str = None
NL: str = None
OC: List[str] = None
OJ: str = None
OL: str = None
OY: List[str] = None
PC: List[str] = None
PD: str = None
PR: str = None
RC: List[str] = None
RN: List[str] = None
RP: str = None
TD: str = None
TVH: str = None
TVL: str = None
TY: str = None
award_criterion_type: str = Field(default=None, alias='award-criterion-type')
corporate_body: List[str] = Field(default=None, alias='corporate-body')
funding: List[str] = None
notice_identifier: str = Field(default=None, alias='notice-identifier')
notice_type: str = Field(default=None, alias='notice-type')
notice_version: str = Field(default=None, alias='notice-version')
ND: Optional[str] = None
PD: Optional[str] = None
# ------------------------------------------------------------------
# Note: in TED-API v3 this field is a str; in earlier API versions it was a list
# ------------------------------------------------------------------
RN: Optional[Union[List[str], str]] = None
# ------------------------------------------------------------------
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
notice_families = defaultdict(list)
for notice in notices:
if notice.original_metadata and notice.original_metadata.RN:
parent_notice_id = notice.original_metadata.RN[0]
parent_notice_id_field = notice.original_metadata.RN
# ------------------------------------------------------------------
# Note: This logic is kept for backward compatibility with the old TED-API data format.
# ------------------------------------------------------------------
if isinstance(parent_notice_id_field, list):
parent_notice_id_field = parent_notice_id_field[0]
# ------------------------------------------------------------------
parent_notice_id = parent_notice_id_field
parent_notice_id = f"{parent_notice_id[4:]}-{parent_notice_id[:4]}"
notice_families[parent_notice_id].append(notice)

Expand Down
133 changes: 93 additions & 40 deletions ted_sws/notice_fetcher/adapters/ted_api.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,34 @@
import base64
import json
import time
from datetime import date
from typing import List
from http import HTTPStatus
from typing import List, Generator

import requests

from ted_sws import config
from ted_sws.event_manager.services.log import log_warning
from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI

DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize": 100,
"pageNum": 1,
"scope": 3
DOCUMENTS_PER_PAGE = 100

DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit": DOCUMENTS_PER_PAGE,
"page": 1,
"scope": "ALL",
}

DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
"DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
"PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
"CONTENT",
# INFO: This query result fields is not supported correctly by TED-API.
#"notice-type", "award-criterion-type", "corporate-body",
#"funding", "notice-identifier", "notice-version"
]}

TOTAL_DOCUMENTS_NUMBER = "total"
RESPONSE_RESULTS = "results"
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["ND", "PD", "RN"]}

TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
RESPONSE_RESULTS = "notices"
DOCUMENT_CONTENT = "content"
RESULT_PAGE_NUMBER = "pageNum"
RESULT_PAGE_NUMBER = "page"
TED_API_FIELDS = "fields"
DOCUMENT_CONTENT_FIELD = "CONTENT"
LINKS_TO_CONTENT_KEY = "links"
XML_CONTENT_KEY = "xml"
MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
DOCUMENT_NOTICE_ID_KEY = "ND"


class TedRequestAPI(RequestAPI):
Expand All @@ -40,15 +41,21 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
:return: dict
"""

response = requests.get(api_url, params=api_query)
response = requests.post(api_url, json=api_query)
try_again_request_count = 0
while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
try_again_request_count += 1
time.sleep(try_again_request_count * 0.1)
response = requests.post(api_url, json=api_query)
if try_again_request_count > 5:
break
if response.ok:
response_content = json.loads(response.text)
return response_content
else:
raise Exception(f"The TED-API call failed with: {response}")



class TedAPIAdapter(TedAPIAdapterABC):
"""
This class will fetch documents content
Expand All @@ -71,7 +78,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
:return: List[str]
"""

query = {"q": f"PD=[{wildcard_date}]"}
query = {"query": f"PD={wildcard_date}"}

return self.get_by_query(query=query)

Expand All @@ -83,48 +90,94 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
:return:List[str]
"""

date_filter = f">={start_date.strftime('%Y%m%d')} AND <={end_date.strftime('%Y%m%d')}"
date_filter = f"PD>={start_date.strftime('%Y%m%d')} AND PD<={end_date.strftime('%Y%m%d')}"

query = {"q": f"PD=[{date_filter}]"}
query = {"query": date_filter}

return self.get_by_query(query=query)

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
def _retrieve_document_content(self, document_content: dict) -> str:
    """
    Download the XML content of a single document returned by the TED API.

    The document carries a mapping of language keys to XML links. The
    multi-language link is preferred, then the English one, and otherwise
    the first language key that is available.

    :param document_content: one search result; must contain the links entry
        and the notice identifier used in the warning message
    :return: the response body of the XML link as text
    :raises Exception: when no XML link is available or the download fails
    """
    # NOTE(review): a collaborator asked during review whether there is a
    # test for this function -- confirm that coverage exists.
    xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
    language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
    if language_key not in xml_links:
        if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
            language_key = ENGLISH_LANGUAGE_CONTENT_KEY
        else:
            # dict views are not subscriptable in Python 3, so the first key
            # has to be taken through an iterator instead of ``keys()[0]``.
            language_key = next(iter(xml_links), None)
            if language_key is None:
                raise Exception(
                    f"The notice content can't be loaded!: no XML links found in "
                    f"{document_content[DOCUMENT_NOTICE_ID_KEY]}")

        log_warning(
            f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {document_content[DOCUMENT_NOTICE_ID_KEY]},"
            f" and will be used language key {language_key}!")

    xml_document_content_link = xml_links[language_key]
    response = requests.get(xml_document_content_link)
    try_again_request_count = 0
    # Retry while the server answers "429 Too Many Requests", waiting a little
    # longer before each attempt and giving up after a bounded number of tries.
    while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        try_again_request_count += 1
        time.sleep(try_again_request_count * 0.1)
        response = requests.get(xml_document_content_link)
        if try_again_request_count > 5:
            break
    if response.ok:
        return response.text
    raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")

def get_generator_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> Generator[
dict, None, None]:
"""
Method to get a documents content by passing a query to the API (json)
:param query:
:param result_fields:
:return:List[str]
:param load_content:
:return:Generator[dict]
"""
query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)

documents_number = response_body[TOTAL_DOCUMENTS_NUMBER]
result_pages = 1 + int(documents_number) // 100
result_pages = 1 + int(documents_number) // DOCUMENTS_PER_PAGE
documents_content = response_body[RESPONSE_RESULTS]
if result_pages > 1:
for page_number in range(2, result_pages + 1):
query[RESULT_PAGE_NUMBER] = page_number
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
documents_content += response_body[RESPONSE_RESULTS]

for page_number in range(2, result_pages + 1):
query[RESULT_PAGE_NUMBER] = page_number
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
documents_content += response_body[RESPONSE_RESULTS]
if DOCUMENT_CONTENT_FIELD in query[TED_API_FIELDS]:
decoded_documents_content = []
for document_content in documents_content:
document_content[DOCUMENT_CONTENT] = base64.b64decode(document_content[DOCUMENT_CONTENT]).decode(
encoding="utf-8")
decoded_documents_content.append(document_content)
return decoded_documents_content
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content
else:
return documents_content
for document_content in documents_content:
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content

def get_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> List[dict]:
    """
    Run a query against the API and return all resulting documents at once.

    The arguments are forwarded unchanged to ``get_generator_by_query`` and
    everything it yields is collected into a list.

    :param query: query passed to the API (json)
    :param result_fields: forwarded to ``get_generator_by_query``
    :param load_content: forwarded to ``get_generator_by_query``
    :return:List[dict]
    """
    documents_generator = self.get_generator_by_query(
        query=query,
        result_fields=result_fields,
        load_content=load_content,
    )
    return [*documents_generator]

def get_by_id(self, document_id: str) -> dict:
"""
Method to get a document content by passing an ID
:param document_id:
:return: str
:return: dict
"""

query = {"q": f"ND=[{document_id}]"}
query = {"query": f"ND={document_id}"}

return self.get_by_query(query=query)[0]
5 changes: 4 additions & 1 deletion ted_sws/notice_fetcher/services/notice_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,10 @@ def _create_notice(self, notice_data: dict) -> Notice:
:param notice_data:
:return:
"""
xml_manifestation = XMLManifestation(object_data=notice_data["content"])
try:
xml_manifestation = XMLManifestation(object_data=notice_data["content"])
except Exception as e:
raise Exception(str(e), notice_data)
del notice_data["content"]
ted_id = notice_data["ND"]
original_metadata = TEDMetadata(**notice_data)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def validate_and_update_daily_supra_notice(ted_publication_date: day_type, mongo
fetched_notice_ids = set(fetched_notice_ids_list)

ted_api_adapter: TedAPIAdapter = TedAPIAdapter(request_api=request_api)
query = {"q": f"PD=[{ted_publication_date.strftime('%Y%m%d*')}]"}
documents = ted_api_adapter.get_by_query(query=query, result_fields={"fields": ["ND"]})
query = {"query": f"PD={ted_publication_date.strftime('%Y%m%d*')}"}
documents = ted_api_adapter.get_by_query(query=query, result_fields={"fields": ["ND"]}, load_content=False)
api_notice_ids_list = [document["ND"] for document in documents] if documents and len(documents) else []
api_notice_ids = set(api_notice_ids_list)

Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

@pytest.fixture
def notice_id():
return "067623-2022"
return "67623-2022"


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/data_manager/test_mongodb_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_mongo_db_query_2():

def test_create_matview_for_notices(fake_mongodb_client):
notice_id = "696661-2022"
ted_api_query = {"q": f"ND=[{notice_id}]"}
ted_api_query = {"query": f"ND={notice_id}"}
mongodb_client = fake_mongodb_client
notice_repository = NoticeRepository(mongodb_client=mongodb_client)
NoticeFetcher(notice_repository=notice_repository,
Expand Down
8 changes: 4 additions & 4 deletions tests/e2e/data_manager/test_notice_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ def test_notice_repository_create(mongodb_client):
notice_repository = NoticeRepository(mongodb_client=mongodb_client, database_name=TEST_DATABASE_NAME)
notice = Notice(ted_id=NOTICE_TED_ID)
notice.set_xml_manifestation(XMLManifestation(object_data="HELLO"))
notice.set_original_metadata(TEDMetadata(**{"AA": ["Metadata"]}))
notice.set_original_metadata(TEDMetadata(**{"RN": ["Metadata"]}))
notice_repository.add(notice)
result_notice = notice_repository.get(reference=NOTICE_TED_ID)
assert result_notice
assert result_notice.ted_id == NOTICE_TED_ID
assert result_notice.original_metadata.AA == ["Metadata"]
assert result_notice.original_metadata.RN == ["Metadata"]
result_notices = list(notice_repository.list())
assert result_notices
assert len(result_notices) == 1
notice_repository.add(notice)
notice.set_original_metadata(ted_metadata=TEDMetadata(**{"AA": ["Updated metadata"]}))
notice.set_original_metadata(ted_metadata=TEDMetadata(**{"RN": ["Updated metadata"]}))
notice_repository.update(notice)
result_notice = notice_repository.get(reference=NOTICE_TED_ID)
assert result_notice
assert result_notice.ted_id == NOTICE_TED_ID
assert result_notice.original_metadata.AA == ["Updated metadata"]
assert result_notice.original_metadata.RN == ["Updated metadata"]
mongodb_client.drop_database(TEST_DATABASE_NAME)


Expand Down
40 changes: 40 additions & 0 deletions tests/e2e/notice_fetcher/_test_generate_eforms_sample_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pathlib

TED_API_EFORMS_QUERY = """
TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND
notice-subtype IN ({eforms_subtype}) AND
FT~"eforms-sdk-{eforms_sdk_version}"
"""

EFORMS_SUBTYPES = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
EFORMS_SDK_VERSIONS = [f"1.{version}" for version in range(3, 11)]


def _test_generate_eforms_sample_dataset(ted_document_search):
    """
    Store sample eForms notices per SDK version and notice subtype.

    For every combination of SDK version and subtype the TED API is queried
    and the first returned notice, if any, is written as an XML file under
    ``eforms_samples/eforms_sdk_v<version>/eform_subtype_<subtype>/``.

    :param ted_document_search: object exposing ``get_generator_by_query``
    :return:
    """
    samples_root = pathlib.Path(__file__).parent / "eforms_samples"
    # Number of notices stored for each (SDK version, subtype) combination.
    samples_per_combination = 1

    for eforms_sdk_version in EFORMS_SDK_VERSIONS:
        version_dir_name = f"eforms_sdk_v{eforms_sdk_version}"
        for eforms_subtype in EFORMS_SUBTYPES:
            target_dir = samples_root / version_dir_name / f"eform_subtype_{eforms_subtype}"

            print(f"Load for {target_dir}")
            query_text = TED_API_EFORMS_QUERY.format(eforms_sdk_version=eforms_sdk_version,
                                                     eforms_subtype=eforms_subtype)
            query = {"query": query_text}
            print(query)
            notices = ted_document_search.get_generator_by_query(query=query)
            for _ in range(samples_per_combination):
                notice = next(notices, None)
                if notice is None:
                    # No (more) notices for this combination.
                    break
                target_dir.mkdir(parents=True, exist_ok=True)
                notice_xml_path = target_dir / f"{notice['ND']}.xml"
                notice_xml_path.write_text(notice["content"], encoding="utf-8")


def test_fetch_notice_by_id(ted_document_search):
    """
    Fetch one notice by its identifier and dump the result as JSON.

    The serialised result is written to ``epo_notice.xml`` next to this file.

    :param ted_document_search: object exposing ``get_by_id``
    :return:
    """
    import json

    output_path = pathlib.Path(__file__).parent / "epo_notice.xml"
    fetched_notice = ted_document_search.get_by_id(document_id="067623-2022")
    serialised_notice = json.dumps(fetched_notice)
    output_path.write_text(serialised_notice, encoding="utf-8")

Loading
Loading