diff --git a/api/src/data_inclusion/api/inclusion_data/commands.py b/api/src/data_inclusion/api/inclusion_data/commands.py
index f34ca8c08..2efc6b268 100644
--- a/api/src/data_inclusion/api/inclusion_data/commands.py
+++ b/api/src/data_inclusion/api/inclusion_data/commands.py
@@ -95,6 +95,9 @@ def validate_df(df: pd.DataFrame, model_schema) -> pd.DataFrame:

 def log_errors(errors_df: pd.DataFrame):
+    if errors_df.empty:
+        logger.info("no error")
+        return
     info_str = str(
         errors_df.groupby(["source", "errors.loc"])["_di_surrogate_id"]
         .count()
         .to_frame()
@@ -138,15 +141,17 @@ def load_inclusion_data():
     log_errors(service_errors_df)

     # exclude invalid data
-    structures_df = structures_df[
-        ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
-    ]
-    services_df = services_df[
-        ~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id)
-        & ~services_df._di_structure_surrogate_id.isin(
-            structure_errors_df._di_surrogate_id
-        )
-    ]
+    if not structure_errors_df.empty:
+        structures_df = structures_df[
+            ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
+        ]
+    if not service_errors_df.empty:
+        services_df = services_df[
+            ~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id)
+            & ~services_df._di_structure_surrogate_id.isin(
+                structure_errors_df._di_surrogate_id
+            )
+        ]

     structure_data_list = structures_df.to_dict(orient="records")
     service_data_list = services_df.to_dict(orient="records")
diff --git a/pipeline/dags/dag_utils/sources/monenfant.py b/pipeline/dags/dag_utils/sources/monenfant.py
index a56c1cd52..46819c3fd 100644
--- a/pipeline/dags/dag_utils/sources/monenfant.py
+++ b/pipeline/dags/dag_utils/sources/monenfant.py
@@ -247,7 +247,15 @@ def extract(
     logger.info("Extracting structures details...")

     for search_result in search_results:
-        data.append(extract_structure(search_result["organizationId"]))
+        structure_data = extract_structure(search_result["organizationId"])
+
+        # in rare cases, the structure data is not available and the response data
+        # is nearly empty. Discard such results.
+        if "resultId" not in structure_data:
+            logger.warning("Structure unavailable")
+            continue
+
+        data.append(structure_data)

     return json.dumps(data).encode()

diff --git a/pipeline/dags/dag_utils/sources/reseau_alpha.py b/pipeline/dags/dag_utils/sources/reseau_alpha.py
index 11d4e77b5..e5900eeb1 100644
--- a/pipeline/dags/dag_utils/sources/reseau_alpha.py
+++ b/pipeline/dags/dag_utils/sources/reseau_alpha.py
@@ -87,13 +87,15 @@ def scrap_structure_html(html_path: Path) -> dict:
         ),
         "telephone": soup.select_one(".telephone > a"),
         "site_web": soup.select_one(".contact-content").find(
-            string=lambda t: t.startswith("http://")
+            string=lambda t: t.startswith("http")
         ),
         "courriel": soup.select_one(".email > a:nth-child(1)"),
     }

     for content_name, node in NODE_BY_CONTENT_NAME.items():
-        data[f"content__{content_name}"] = utils.html_to_markdown(str(node))
+        data[f"content__{content_name}"] = (
+            utils.html_to_markdown(str(node)) if node is not None else None
+        )

     return data
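Note (not part of the patch): a minimal standalone sketch of the failure mode the first hunk appears to guard against. If validation produces no error records, the errors DataFrame can end up with no columns at all, so attribute-style access such as structure_errors_df._di_surrogate_id raises AttributeError before .isin() is ever reached. The frame construction below is an assumption for illustration only, not code taken from the repository.

    import pandas as pd

    # hypothetical data, for illustration only
    structures_df = pd.DataFrame({"_di_surrogate_id": ["a", "b", "c"]})
    structure_errors_df = pd.DataFrame([])  # no error records -> empty, column-less frame

    # Without the guard, the line below would raise AttributeError, because the
    # empty frame has no "_di_surrogate_id" column to resolve the attribute against:
    #     structure_errors_df._di_surrogate_id
    #
    # With the guard introduced by the patch, the filtering step is skipped entirely:
    if not structure_errors_df.empty:
        structures_df = structures_df[
            ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
        ]

    print(structures_df)  # unchanged: there was nothing to exclude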