From 56cbb7f74864b5f8670cc9cdfb3aa3c9e2bb43dc Mon Sep 17 00:00:00 2001
From: Dumitru
Date: Tue, 3 Oct 2023 12:27:12 +0300
Subject: [PATCH] WIP

---
 dags/daily_notices_metadata_update.py         | 57 +++++++++++++++++++
 .../daily_notices_metadata_repository.py      | 14 ++++-
 .../daily_notices_metadata_services.py        | 29 ++++++++--
 .../test_notices_metadata_services.py         |  6 +-
 4 files changed, 99 insertions(+), 7 deletions(-)
 create mode 100644 dags/daily_notices_metadata_update.py

diff --git a/dags/daily_notices_metadata_update.py b/dags/daily_notices_metadata_update.py
new file mode 100644
index 00000000..eefc94cf
--- /dev/null
+++ b/dags/daily_notices_metadata_update.py
@@ -0,0 +1,57 @@
+"""
+DAG to update daily notices metadata from TED.
+"""
+
+from datetime import date
+
+from airflow.models import Param
+from airflow.decorators import dag, task
+
+from dags import DEFAULT_DAG_ARGUMENTS
+from dags.dags_utils import get_dag_param
+from ted_sws.data_manager.services import daily_notices_metadata_services
+
+START_DATE_PARAM_KEY = "start_date"
+END_DATE_PARAM_KEY = "end_date"
+
+
+@dag(default_args=DEFAULT_DAG_ARGUMENTS,
+     schedule_interval=None,
+     tags=['daily', "dashboards", "metadata", "ted", "notices"],
+     description=__doc__[0: __doc__.find(".")],
+     doc_md=__doc__,
+     params={
+         START_DATE_PARAM_KEY: Param(
+             default=f"{date.today()}",
+             type="string",
+             format="date",
+             title="Start Date",
+             description="""This field is required.
+                Start date of the date range to fetch notices from TED."""
+         ),
+         END_DATE_PARAM_KEY: Param(
+             default=f"{date.today()}",
+             type="string",
+             format="date",
+             title="End Date",
+             description="""This field is required.
+                End date of the date range to fetch notices from TED."""
+         )
+     }
+     )
+def daily_notices_metadata_update():
+    @task
+    def update_daily_notices_metadata_from_ted():
+        # DAG params arrive as ISO "YYYY-MM-DD" strings, while the service expects date objects.
+        start_date = date.fromisoformat(get_dag_param(key=START_DATE_PARAM_KEY, raise_error=True))
+        end_date = date.fromisoformat(get_dag_param(key=END_DATE_PARAM_KEY, raise_error=True))
+
+        # Call the service through its module: the bare name would resolve to this task itself.
+        daily_notices_metadata_services.update_daily_notices_metadata_from_ted(
+            start_date=start_date, end_date=end_date)
+
+    update_daily_notices_metadata_from_ted()
+
+
+# The decorated function must be called at module level so that Airflow registers the DAG.
+dag = daily_notices_metadata_update()
diff --git a/ted_sws/data_manager/adapters/daily_notices_metadata_repository.py b/ted_sws/data_manager/adapters/daily_notices_metadata_repository.py
index 6a41d480..ff6d8562 100644
--- a/ted_sws/data_manager/adapters/daily_notices_metadata_repository.py
+++ b/ted_sws/data_manager/adapters/daily_notices_metadata_repository.py
@@ -1,5 +1,5 @@
-from datetime import datetime, time
-from typing import Iterator, Optional
+from datetime import datetime, time, date
+from typing import Iterator, Optional, List
 
 from pymongo import MongoClient, ASCENDING
 
@@ -90,3 +90,13 @@ def list(self) -> Iterator[DailyNoticesMetadata]:
         """
         for result_dict in self.collection.find():
             yield self._create_daily_notices_metadata_from_dict(daily_notices_metadata_dict=result_dict)
+
+    def list_daily_notices_metadata_aggregation_date(self) -> List[date]:
+        """
+        Gets the aggregation dates of all DailyNoticesMetadata stored in the repository.
+        :return: list of aggregation dates, empty when nothing is stored
+        """
+        # Project only the aggregation date so that full documents are not transferred.
+        projection = {DAILY_NOTICES_METADATA_AGGREGATION_DATE: 1, DAILY_NOTICES_METADATA_ID: 0}
+        return [datetime.fromisoformat(result_dict[DAILY_NOTICES_METADATA_AGGREGATION_DATE]).date()
+                for result_dict in self.collection.find({}, projection)]
diff --git a/ted_sws/data_manager/services/daily_notices_metadata_services.py b/ted_sws/data_manager/services/daily_notices_metadata_services.py
index a73f04a6..426e57d0 100644
--- a/ted_sws/data_manager/services/daily_notices_metadata_services.py
+++ b/ted_sws/data_manager/services/daily_notices_metadata_services.py
@@ -5,11 +5,19 @@
 from pymongo import MongoClient
 
 from ted_sws import config
+from ted_sws.core.model.supra_notice import DailyNoticesMetadata
 from ted_sws.data_manager.adapters.daily_notices_metadata_repository import DailyNoticesMetadataRepository
 from ted_sws.notice_fetcher.adapters.ted_api import TedAPIAdapter, TedRequestAPI
 
-DEFAULT_TED_API_START_DATE = "2023-09-01"  # TODO: Change to 2014-01-01
+DEFAULT_TED_API_START_DATE = "2023-09-29"  # TODO: Change to 2014-01-01
 DEFAULT_TED_API_START_DATE_FORMAT = "%Y-%m-%d"
+TED_API_NOTICE_ID_FIELD = "ND"
+TED_API_WILDCARD_DATE_FORMAT = "%Y%m%d*"
+DAILY_NOTICES_METADATA_TED_API_QUERY_RESULT_FIELDS = {"fields": [TED_API_NOTICE_ID_FIELD]}
+TED_API_QUERY_FIELD = "q"
+DAILY_NOTICES_METADATA_TED_API_QUERY = {
+    TED_API_QUERY_FIELD: "PD=[{aggregation_date}]"
+}
 
 
 def generate_list_of_dates_from_date_range(start_date: date, end_date: date) -> Optional[list]:
@@ -48,8 +56,21 @@ def update_daily_notices_metadata_from_ted(start_date: date = None,
     date_range = generate_list_of_dates_from_date_range(start_date, end_date)
 
     # Getting from metadata repository dates that are not in the repository from date range
-    dates_not_in_repository = [day for day in date_range if not daily_notices_metadata_repo.get(day)]  # TODO: Lazy evaluation
+    # The stored aggregation dates are fetched once and kept in a set: one query, O(1) lookup per day.
+    dates_in_repository = set(daily_notices_metadata_repo.list_daily_notices_metadata_aggregation_date())
+    dates_not_in_repository = [day for day in (date_range or []) if day not in dates_in_repository]
 
     # Getting from TED API dates that are not in the repository from date range
-    #ted_api.get_by_query(query={"q": ""})
-
+    # TODO: If in ted are 0 notices, coverage is 1 to all
+    for day in dates_not_in_repository:
+        # Build a fresh query per day: formatting the shared module-level template in place would
+        # consume its placeholder and make every following day reuse the first day's query.
+        ted_api_query = {
+            TED_API_QUERY_FIELD: DAILY_NOTICES_METADATA_TED_API_QUERY[TED_API_QUERY_FIELD].format(
+                aggregation_date=day.strftime(TED_API_WILDCARD_DATE_FORMAT))
+        }
+        notice_ids = ted_api.get_by_query(ted_api_query,
+                                          result_fields=DAILY_NOTICES_METADATA_TED_API_QUERY_RESULT_FIELDS)
+        daily_notices_metadata = DailyNoticesMetadata(aggregation_date=day)
+        daily_notices_metadata.ted_api_notice_ids = [notice[TED_API_NOTICE_ID_FIELD] for notice in notice_ids]
+        daily_notices_metadata_repo.add(daily_notices_metadata)
diff --git a/tests/unit/data_manager/services/test_notices_metadata_services.py b/tests/unit/data_manager/services/test_notices_metadata_services.py
index 22c16535..1ece797f 100644
--- a/tests/unit/data_manager/services/test_notices_metadata_services.py
+++ b/tests/unit/data_manager/services/test_notices_metadata_services.py
@@ -1,3 +1,5 @@
+from datetime import date
+
 from ted_sws import config
 from ted_sws.data_manager.adapters.daily_notices_metadata_repository import DailyNoticesMetadataRepository
 from ted_sws.data_manager.services.daily_notices_metadata_services import update_daily_notices_metadata_from_ted
@@ -12,6 +14,8 @@ def test_update_daily_notices_metadata_from_ted(mongodb_client):
     ted_api = TedAPIAdapter(TedRequestAPI(), config.TED_API_URL)
     daily_notices_metadata_repo = DailyNoticesMetadataRepository(mongodb_client)
 
-    update_daily_notices_metadata_from_ted(ted_api=ted_api,
+    update_daily_notices_metadata_from_ted(start_date=date(2021, 1, 7),
+                                           end_date=date(2021, 1, 7),
+                                           ted_api=ted_api,
                                            mongo_client=mongodb_client,
                                            daily_notices_metadata_repo=daily_notices_metadata_repo)