diff --git a/api/log.txt b/api/log.txt
new file mode 100644
index 000000000..4da6b64c1
--- /dev/null
+++ b/api/log.txt
@@ -0,0 +1,14 @@
+                                    _di_surrogate_id  ...  thematiques
+0      agefiph-c7f4be8b-309e-4a6a-b562-c4f7f3bb3c5c  ...   [handicap]
+1                            france-travail-ARA0014  ...         None
+2                            france-travail-ARA0018  ...         None
+3                            france-travail-ARA0021  ...         None
+4                            france-travail-ARA0024  ...         None
+...                                              ...  ...          ...
+23301                                 soliguide-9981  ...         None
+23302                                 soliguide-9983  ...         None
+23303                                 soliguide-9984  ...         None
+23304                                 soliguide-9995  ...         None
+23305                                 soliguide-9997  ...         None
+
+[23306 rows x 29 columns]
diff --git a/api/src/data_inclusion/api/inclusion_data/commands.py b/api/src/data_inclusion/api/inclusion_data/commands.py
index f34ca8c08..9dafca365 100644
--- a/api/src/data_inclusion/api/inclusion_data/commands.py
+++ b/api/src/data_inclusion/api/inclusion_data/commands.py
@@ -10,11 +10,10 @@
 from furl import furl
 from tqdm import tqdm
 
-from data_inclusion import schema
 from data_inclusion.api.code_officiel_geo import constants
 from data_inclusion.api.config import settings
 from data_inclusion.api.core import db
-from data_inclusion.api.inclusion_data import models
+from data_inclusion.api.inclusion_data import models, schemas
 
 logger = logging.getLogger(__name__)
 
@@ -95,6 +94,9 @@ def validate_df(df: pd.DataFrame, model_schema) -> pd.DataFrame:
 
 
 def log_errors(errors_df: pd.DataFrame):
+    if errors_df.empty:
+        logger.info("no error")
+        return
     info_str = str(
         errors_df.groupby(["source", "errors.loc"])["_di_surrogate_id"]
         .count()
@@ -129,8 +131,8 @@ def load_inclusion_data():
         code_insee=services_df.code_insee.apply(clean_up_code_insee)
     )
 
-    structure_errors_df = validate_df(structures_df, model_schema=schema.Structure)
-    service_errors_df = validate_df(services_df, model_schema=schema.Service)
+    structure_errors_df = validate_df(structures_df, model_schema=schemas.Structure)
+    service_errors_df = validate_df(services_df, model_schema=schemas.Service)
 
     logger.info("Structure validation errors:")
     log_errors(structure_errors_df)
@@ -138,15 +140,17 @@ def load_inclusion_data():
     log_errors(service_errors_df)
 
     # exclude invalid data
-    structures_df = structures_df[
-        ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
-    ]
-    services_df = services_df[
-        ~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id)
-        & ~services_df._di_structure_surrogate_id.isin(
-            structure_errors_df._di_surrogate_id
-        )
-    ]
+    if not structure_errors_df.empty:
+        structures_df = structures_df[
+            ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
+        ]
+    if not service_errors_df.empty:
+        services_df = services_df[
+            ~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id)
+            & ~services_df._di_structure_surrogate_id.isin(
+                structure_errors_df._di_surrogate_id
+            )
+        ]
 
     structure_data_list = structures_df.to_dict(orient="records")
     service_data_list = services_df.to_dict(orient="records")
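
A minimal standalone sketch of the guard introduced in the last hunk, using hypothetical toy frames rather than the real structures/services tables. It assumes that validate_df can return a bare, column-less pd.DataFrame() when nothing fails validation, in which case the previously unguarded structure_errors_df._di_surrogate_id access would raise AttributeError; the emptiness check avoids that path entirely.

import pandas as pd

# Hypothetical stand-ins for structures_df / structure_errors_df (not real data).
structures_df = pd.DataFrame({"_di_surrogate_id": ["a", "b", "c"]})
structure_errors_df = pd.DataFrame()  # assumed shape of an error-free validation result

# Unguarded filtering would fail here: a bare pd.DataFrame() has no
# `_di_surrogate_id` column, so attribute access raises AttributeError.

# Guarded version, mirroring the diff: only filter when there are errors to exclude.
if not structure_errors_df.empty:
    structures_df = structures_df[
        ~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
    ]

print(len(structures_df))  # all 3 rows kept, nothing excluded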