Skip to content

Commit

Permalink
Merge branch 'main' into feat/pipeline/fix-char-mon-enfant
Browse files (browse the repository at this point in the history)
  • Loading branch information
hlecuyer authored Jul 9, 2024
2 parents edb85db + c59e78f commit 47283a2
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 12 deletions.
23 changes: 14 additions & 9 deletions api/src/data_inclusion/api/inclusion_data/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ def validate_df(df: pd.DataFrame, model_schema) -> pd.DataFrame:


def log_errors(errors_df: pd.DataFrame):
if errors_df.empty:
logger.info("no error")
return
info_str = str(
errors_df.groupby(["source", "errors.loc"])["_di_surrogate_id"]
.count()
Expand Down Expand Up @@ -138,15 +141,17 @@ def load_inclusion_data():
log_errors(service_errors_df)

# exclude invalid data
structures_df = structures_df[
~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
]
services_df = services_df[
~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id)
& ~services_df._di_structure_surrogate_id.isin(
structure_errors_df._di_surrogate_id
)
]
if not structure_errors_df.empty:
structures_df = structures_df[
~structures_df._di_surrogate_id.isin(structure_errors_df._di_surrogate_id)
]
if not service_errors_df.empty:
services_df = services_df[
~services_df._di_surrogate_id.isin(service_errors_df._di_surrogate_id)
& ~services_df._di_structure_surrogate_id.isin(
structure_errors_df._di_surrogate_id
)
]

structure_data_list = structures_df.to_dict(orient="records")
service_data_list = services_df.to_dict(orient="records")
Expand Down
10 changes: 9 additions & 1 deletion pipeline/dags/dag_utils/sources/monenfant.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,15 @@ def extract(

logger.info("Extracting structures details...")
for search_result in search_results:
data.append(extract_structure(search_result["organizationId"]))
structure_data = extract_structure(search_result["organizationId"])

# in rare cases, the structure data is not available and the response data
# is nearly empty. Discard such results.
if "resultId" not in structure_data:
logger.warning("Structure unavailable")
continue

data.append(structure_data)

return json.dumps(data).encode()

Expand Down
6 changes: 4 additions & 2 deletions pipeline/dags/dag_utils/sources/reseau_alpha.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,15 @@ def scrap_structure_html(html_path: Path) -> dict:
),
"telephone": soup.select_one(".telephone > a"),
"site_web": soup.select_one(".contact-content").find(
string=lambda t: t.startswith("http://")
string=lambda t: t.startswith("http")
),
"courriel": soup.select_one(".email > a:nth-child(1)"),
}

for content_name, node in NODE_BY_CONTENT_NAME.items():
data[f"content__{content_name}"] = utils.html_to_markdown(str(node))
data[f"content__{content_name}"] = (
utils.html_to_markdown(str(node)) if node is not None else None
)

return data

Expand Down

0 comments on commit 47283a2

Please sign in to comment.