diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql index 694c1f26..4bae773a 100644 --- a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql @@ -22,15 +22,15 @@ structure_adresses AS ( formation_adresses AS ( SELECT - _di_source_id AS "source", - adresses__longitude AS "longitude", - adresses__latitude AS "latitude", - NULL AS "complement_adresse", - adresses__ville AS "commune", - content__adresse AS "adresse", - adresses__code_postal AS "code_postal", - NULL AS "code_insee", - 'service--' || id AS "id" + _di_source_id AS "source", + adresses__longitude AS "longitude", + adresses__latitude AS "latitude", + NULL AS "complement_adresse", + adresses__ville AS "commune", + content__lieux_et_horaires_formation__adresse AS "adresse", + adresses__code_postal AS "code_postal", + NULL AS "code_insee", + 'service--' || id AS "id" FROM formations ), diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql index 1e508b88..e94a2b76 100644 --- a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql @@ -8,41 +8,59 @@ structures AS ( final AS ( SELECT - TRUE AS "contact_public", - formations.content__contact_prenom_nom AS "contact_nom_prenom", - formations.content__courriel AS "courriel", - NULL AS "formulaire_en_ligne", - NULL AS "frais_autres", - formations.nom AS "nom", - NULL AS "presentation_resume", - NULL AS "prise_rdv", - formations.content__horaires AS "recurrence", - formations._di_source_id AS "source", - formations.structure_id AS "structure_id", - formations.content__telephone AS "telephone", - NULL AS "zone_diffusion_code", -- FIXME - NULL AS "zone_diffusion_nom", - 'departement' AS "zone_diffusion_type", - TRUE AS "cumulable", - formations.url AS "lien_source", - formations.id AS "id", - formations.content__date_maj AS "date_maj", - NULL AS "modes_orientation_accompagnateur_autres", - NULL AS "modes_orientation_beneficiaire_autres", + TRUE AS "contact_public", + formations.content__contact_inscription__contact AS "contact_nom_prenom", + formations.content__contact_inscription__courriel AS "courriel", + formations.content__inscription__informations_en_ligne AS "formulaire_en_ligne", + NULL AS "frais_autres", + formations.nom AS "nom", + NULL AS "prise_rdv", + formations.content__lieux_et_horaires_formation__horaires AS "recurrence", + formations._di_source_id AS "source", + formations.structure_id AS "structure_id", + formations.content__contact_inscription__telephone AS "telephone", + NULL AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", -- FIXME + 'departement' AS "zone_diffusion_type", + TRUE AS "cumulable", + formations.url AS "lien_source", + formations.id AS "id", + formations.content__date_maj AS "date_maj", + NULL AS "modes_orientation_accompagnateur_autres", + NULL AS "modes_orientation_beneficiaire_autres", + CASE + WHEN LENGTH(formations.content__contenu_et_objectifs__titre) <= 280 + THEN formations.content__contenu_et_objectifs__titre + ELSE LEFT(formations.content__contenu_et_objectifs__titre, 279) || '…' + END AS "presentation_resume", ARRAY_TO_STRING( ARRAY[ - formations.content__contenu_et_objectifs, - formations.content__public_attendu, - formations.content__inscription, - formations.content__informations_pratiques + '# Contenu et objectifs de la formation', + formations.content__contenu_et_objectifs__titre, + formations.content__contenu_et_objectifs__objectifs, + formations.content__contenu_et_objectifs__niveau, + '# Public attendu', + formations.content__public_attendu__niveau, + formations.content__public_attendu__competences, + formations.content__public_attendu__type_de_public, + '# Inscription', + formations.content__inscription__places, + formations.content__inscription__entree_sortie, + '# Informations pratiques', + formations.content__informations_pratiques__etendue, + formations.content__informations_pratiques__volume, + formations.content__informations_pratiques__cout, + formations.content__informations_pratiques__prise_en_charge, + formations.content__informations_pratiques__remuneration, + formations.content__informations_pratiques__garde ], E'\n\n' - ) AS "presentation_detail", - 'service--' || formations.id AS "adresse_id", - CAST(NULL AS TEXT []) AS "justificatifs", - CAST(NULL AS TEXT []) AS "pre_requis", - CAST(NULL AS DATE) AS "date_suspension", - CAST(NULL AS DATE) AS "date_creation", + ) AS "presentation_detail", + 'service--' || formations.id AS "adresse_id", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", + CAST(NULL AS DATE) AS "date_suspension", + CAST(NULL AS DATE) AS "date_creation", ARRAY_REMOVE( ARRAY[ 'apprendre-francais--suivre-formation', @@ -50,13 +68,25 @@ final AS ( CASE WHEN formations.activite = 'Français à visée sociale et communicative' THEN 'apprendre-francais--communiquer-vie-tous-les-jours' END ], NULL - ) AS "thematiques", - ARRAY['en-presentiel'] AS "modes_accueil", - CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", - CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", - CAST(NULL AS TEXT []) AS "profils", - ARRAY['formation'] AS "types", - CAST(NULL AS TEXT []) AS "frais" + ) AS "thematiques", + ARRAY['en-presentiel'] AS "modes_accueil", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN formations.content__contact_inscription__courriel IS NOT NULL THEN 'envoyer-un-mail' END, + CASE WHEN formations.content__contact_inscription__telephone IS NOT NULL THEN 'telephoner' END + ], + NULL + ) AS "modes_orientation_accompagnateur", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN formations.content__contact_inscription__courriel IS NOT NULL THEN 'envoyer-un-mail' END, + CASE WHEN formations.content__contact_inscription__telephone IS NOT NULL THEN 'telephoner' END + ], + NULL + ) AS "modes_orientation_beneficiaire", + ARRAY['public-langues-etrangeres'] AS "profils", + ARRAY['formation'] AS "types", + CAST(NULL AS TEXT []) AS "frais" FROM formations LEFT JOIN structures ON formations.structure_id = structures.id ) diff --git a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql index 760473e3..de753cc8 100644 --- a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql +++ b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql @@ -5,13 +5,13 @@ WITH source AS ( adresses AS ( SELECT -- extracted from cartographie.json - source.data ->> 'id' AS "formation_id", - adresses.data ->> 'ville' AS "adresses__ville", - CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", - CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", - adresses.data ->> 'codePostal' AS "adresses__code_postal", - TRIM(SUBSTRING(source.data ->> 'content__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__adresse", - TRIM(source.data ->> 'content__horaires') AS "content__horaires" + source.data ->> 'id' AS "formation_id", + adresses.data ->> 'ville' AS "adresses__ville", + CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", + CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", + adresses.data ->> 'codePostal' AS "adresses__code_postal", + TRIM(SUBSTRING(source.data ->> 'content__lieux_et_horaires_formation__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__lieux_et_horaires_formation__adresse", + TRIM(source.data ->> 'content__lieux_et_horaires_formation__horaires') AS "content__lieux_et_horaires_formation__horaires" FROM source, LATERAL(SELECT * FROM JSONB_PATH_QUERY(source.data, '$.adresses[*]')) AS adresses (data) @@ -23,18 +23,18 @@ adresses AS ( final AS ( SELECT - source._di_source_id AS "_di_source_id", - adresses.adresses__ville AS "adresses__ville", - adresses.adresses__latitude AS "adresses__latitude", - adresses.adresses__longitude AS "adresses__longitude", - adresses.adresses__code_postal AS "adresses__code_postal", - adresses.content__adresse AS "content__adresse", - adresses.content__horaires AS "content__horaires", - source.data ->> 'id' AS "id", - source.data ->> 'structure_id' AS "structure_id", - source.data ->> 'nom' AS "nom", - source.data ->> 'url' AS "url", - source.data ->> 'activite' AS "activite", + source._di_source_id AS "_di_source_id", + adresses.adresses__ville AS "adresses__ville", + adresses.adresses__latitude AS "adresses__latitude", + adresses.adresses__longitude AS "adresses__longitude", + adresses.adresses__code_postal AS "adresses__code_postal", + adresses.content__lieux_et_horaires_formation__adresse AS "content__lieux_et_horaires_formation__adresse", + adresses.content__lieux_et_horaires_formation__horaires AS "content__lieux_et_horaires_formation__horaires", + source.data ->> 'id' AS "id", + source.data ->> 'structure_id' AS "structure_id", + source.data ->> 'nom' AS "nom", + source.data ->> 'url' AS "url", + source.data ->> 'activite' AS "activite", TO_DATE( SUBSTRING( ( @@ -56,14 +56,26 @@ final AS ( ) FROM 'Date de la dernière modification : (.*)' ), 'DD MM YYYY' - ) AS "content__date_maj", - TRIM(source.data ->> 'content__contenu_et_objectifs') AS "content__contenu_et_objectifs", - TRIM(source.data ->> 'content__public_attendu') AS "content__public_attendu", - TRIM(source.data ->> 'content__inscription') AS "content__inscription", - TRIM(source.data ->> 'content__contact_prenom_nom') AS "content__contact_prenom_nom", - TRIM(source.data ->> 'content__telephone') AS "content__telephone", - TRIM(source.data ->> 'content__courriel') AS "content__courriel", - TRIM(source.data ->> 'content__informations_pratiques') AS "content__informations_pratiques" + ) AS "content__date_maj", + TRIM(source.data ->> 'content__contenu_et_objectifs__titre') AS "content__contenu_et_objectifs__titre", + TRIM(source.data ->> 'content__contenu_et_objectifs__objectifs') AS "content__contenu_et_objectifs__objectifs", + TRIM(source.data ->> 'content__contenu_et_objectifs__niveau') AS "content__contenu_et_objectifs__niveau", + TRIM(source.data ->> 'content__public_attendu__niveau') AS "content__public_attendu__niveau", + TRIM(source.data ->> 'content__public_attendu__competences') AS "content__public_attendu__competences", + TRIM(source.data ->> 'content__public_attendu__type_de_public') AS "content__public_attendu__type_de_public", + TRIM(source.data ->> 'content__inscription__informations_en_ligne') AS "content__inscription__informations_en_ligne", + TRIM(source.data ->> 'content__inscription__places') AS "content__inscription__places", + TRIM(source.data ->> 'content__inscription__entree_sortie') AS "content__inscription__entree_sortie", + TRIM(source.data ->> 'content__contact_inscription__adresse') AS "content__contact_inscription__adresse", + TRIM(source.data ->> 'content__contact_inscription__contact') AS "content__contact_inscription__contact", + TRIM(source.data ->> 'content__contact_inscription__telephone') AS "content__contact_inscription__telephone", + TRIM(source.data ->> 'content__contact_inscription__courriel') AS "content__contact_inscription__courriel", + TRIM(source.data ->> 'content__informations_pratiques__etendue') AS "content__informations_pratiques__etendue", + TRIM(source.data ->> 'content__informations_pratiques__volume') AS "content__informations_pratiques__volume", + TRIM(source.data ->> 'content__informations_pratiques__cout') AS "content__informations_pratiques__cout", + TRIM(source.data ->> 'content__informations_pratiques__prise_en_charge') AS "content__informations_pratiques__prise_en_charge", + TRIM(source.data ->> 'content__informations_pratiques__remuneration') AS "content__informations_pratiques__remuneration", + TRIM(source.data ->> 'content__informations_pratiques__garde') AS "content__informations_pratiques__garde" FROM source LEFT JOIN adresses ON source.data ->> 'id' = adresses.formation_id ) diff --git a/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py index effbdeb8..672c702b 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py +++ b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py @@ -114,36 +114,131 @@ def scrap_structure_html(html_path: Path) -> dict: def scrap_formation_html(html_path: Path) -> dict: + def get_parent(node): + return node.parent if node is not None else None + with html_path.open() as f: soup = bs4.BeautifulSoup(f, features="lxml") data = {} + contenu_et_objectifs_selector = ( + ".container > .row:nth-child(2) > div:nth-child(1) > .row:nth-child(1)" + ) + public_attendu_selector = ( + ".container > .row:nth-child(2) > div:nth-child(1) > .row:nth-child(2)" + ) + inscription_selector = ( + ".container > .row:nth-child(2) > div:nth-child(2) > .row:nth-child(1)" + ) + informations_pratiques_selector = ( + ".container > .row:nth-child(2) > div:nth-child(2) > .row:nth-child(3)" + ) + NODE_BY_CONTENT_NAME = { - "contenu_et_objectifs": soup.select_one( - "div.container:nth-child(2) > div:nth-child(2)" - " > div:nth-child(1) > div:nth-child(1)" - ), "date_maj": soup.select_one(".entete").find( string=lambda text: "Date de la dernière modification :" in text ), - "public_attendu": soup.select_one( - "div.container:nth-child(2) > div:nth-child(2)" - " > div:nth-child(1) > div:nth-child(2)" + "contenu_et_objectifs__titre": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(2)" ), - "inscription": soup.select_one( - "div.col-lg-6:nth-child(2) > div:nth-child(1)" + "contenu_et_objectifs__objectifs": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(3)" ), - "contact_prenom_nom": soup.select_one( - "#formation-inscription > div:nth-child(2) > div:nth-child(2)" + "contenu_et_objectifs__niveau": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(4)" ), - "telephone": soup.select_one("#formation-inscription > div:nth-child(3)"), - "courriel": soup.select_one(".email > a:nth-child(1)"), - "informations_pratiques": soup.select_one( - "div.col-lg-6:nth-child(2) > div:nth-child(3)" + "public_attendu__niveau": soup.select_one( + f"{public_attendu_selector} > div:nth-child(2)" + ), + "public_attendu__competences": soup.select_one( + f"{public_attendu_selector} > div:nth-child(3)" + ), + "public_attendu__type_de_public": soup.select_one( + f"{public_attendu_selector} > div:nth-child(4)" + ), + "inscription__informations_en_ligne": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Informations en ligne" in text + ) + ) + ), + "inscription__places": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Places disponibles" in text + ) + ) + ), + "inscription__entree_sortie": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Entrée / sortie permanente" in text + ) + ) + ), + "contact_inscription__adresse": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-home")) + ), + "contact_inscription__contact": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-user")) + ), + "contact_inscription__telephone": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-phone")) + ), + "contact_inscription__courriel": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-inbox")) + ), + "informations_pratiques__etendue": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Étendue de la formation" in text + ) + ) + ), + "informations_pratiques__volume": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Volume horaire" in text + ) + ) + ), + "informations_pratiques__cout": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: ( + "Adhésion annuelle à la structure obligatoire" + ) + in text + ) + ) + ), + "informations_pratiques__prise_en_charge": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Coût d'inscription à la formation" in text + ) + ) + ), + "informations_pratiques__remuneration": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Rémunération" in text + ) + ) + ), + "informations_pratiques__garde": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Garde d'enfant" in text + ) + ) + ), + "lieux_et_horaires_formation__adresse": soup.select_one( + "#lieux-formation .lieu-formation .adresse" ), - "adresse": soup.select_one(".col-sm-9 > div:nth-child(2)"), - "horaires": "".join( - soup.select_one(".col-sm-9").find_all( + "lieux_et_horaires_formation__horaires": "\n".join( + soup.select_one("#lieux-formation").find_all( string=lambda text: "de" in text and "à" in text ) ),