From e5524acdc44ac4de70928c3b70be38421669001d Mon Sep 17 00:00:00 2001 From: Hugo Lecuyer Date: Mon, 1 Jul 2024 18:15:44 +0200 Subject: [PATCH] fix(api): fix error if no error and fix schema validation --- .../api/inclusion_data/commands.py | 23 +++++++++++-------- .../dags/dag_utils/sources/reseau_alpha.py | 6 +++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/api/src/data_inclusion/api/inclusion_data/commands.py b/api/src/data_inclusion/api/inclusion_data/commands.py index f34ca8c0..2efc6b26 100644 --- a/api/src/data_inclusion/api/inclusion_data/commands.py +++ b/api/src/data_inclusion/api/inclusion_data/commands.py @@ -95,6 +95,9 @@ def validate_df(df: pd.DataFrame, model_schema) -> pd.DataFrame: def log_errors(errors_df: pd.DataFrame): + if errors_df.empty: + logger.info("no error") + return info_str = str( errors_df.groupby(["source", "errors.loc"])["_di_surrogate_id"] .count() @@ -138,15 +141,17 @@ def load_inclusion_data(): log_errors(service_errors_df) # exclude invalid data - structures_df = structures_df[ - ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id) - ] - services_df = services_df[ - ~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id) - & ~services_df._di_structure_surrogate_id.isin( - structure_errors_df._di_surrogate_id - ) - ] + if not structure_errors_df.empty: + structures_df = structures_df[ + ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id) + ] + if not service_errors_df.empty: + services_df = services_df[ + ~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id) + & ~services_df._di_structure_surrogate_id.isin( + structure_errors_df._di_surrogate_id + ) + ] structure_data_list = structures_df.to_dict(orient="records") service_data_list = services_df.to_dict(orient="records") diff --git a/pipeline/dags/dag_utils/sources/reseau_alpha.py b/pipeline/dags/dag_utils/sources/reseau_alpha.py index 11d4e77b..e5900eeb 100644 --- a/pipeline/dags/dag_utils/sources/reseau_alpha.py +++ b/pipeline/dags/dag_utils/sources/reseau_alpha.py @@ -87,13 +87,15 @@ def scrap_structure_html(html_path: Path) -> dict: ), "telephone": soup.select_one(".telephone > a"), "site_web": soup.select_one(".contact-content").find( - string=lambda t: t.startswith("http://") + string=lambda t: t.startswith("http") ), "courriel": soup.select_one(".email > a:nth-child(1)"), } for content_name, node in NODE_BY_CONTENT_NAME.items(): - data[f"content__{content_name}"] = utils.html_to_markdown(str(node)) + data[f"content__{content_name}"] = ( + utils.html_to_markdown(str(node)) if node is not None else None + ) return data