Skip to content

Commit

Permalink
feat(pipeline) : Ensure the zone_diffusion_codes for DROM/COM
Browse files Browse the repository at this point in the history
There were cases where the zone_diffusion_code was set to "97",
which can't be right, as it would mean that a given service could be
available across the oceans.

Let's fix it and set the correct, 3-digit department number.

This will also make these services searchable, as we now (since the "new" communes)
search for a match against commune.departement, which can be 3 digits.

There is also now a complete data validation that leaves the errors as a
specific table in the public_dbt_test__audit schema.
  • Loading branch information
vperron authored and vmttn committed Sep 19, 2024
1 parent 34bcac0 commit a4bcc01
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 57 deletions.
35 changes: 35 additions & 0 deletions pipeline/dbt/models/intermediate/_models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,41 @@ models:
A service belonging to a structure data failing validation is considered invalid.
columns:
# Data tests for the zone_diffusion_code column: basic sanity checks, then one
# relationships test per zone_diffusion_type against the matching reference table.
- name: zone_diffusion_code
data_tests:
# A missing code is reported as a warning only.
- not_null:
config:
severity: warn
- dbt_utils.not_empty_string
- dbt_utils.not_constant
# Rows typed 'region' must carry a code found in the regions reference model.
- relationships:
to: ref('stg_decoupage_administratif__regions')
field: code
where: "zone_diffusion_type = 'region'"
# TODO(vmttn): prevent false codes from being propagated downstream
# and set back severity to error
config:
severity: warn
# Rows typed 'departement' must carry a code found in the departements reference model.
- relationships:
to: ref('stg_decoupage_administratif__departements')
field: code
where: "zone_diffusion_type = 'departement'"
config:
severity: warn
# Rows typed 'epci' must carry a code found in the epcis reference model.
- relationships:
to: ref('stg_decoupage_administratif__epcis')
field: code
where: "zone_diffusion_type = 'epci'"
config:
severity: warn
# Rows typed 'commune' must carry a code found in the communes reference model.
- relationships:
to: ref('stg_decoupage_administratif__communes')
field: code
where: "zone_diffusion_type = 'commune'"
config:
severity: warn

- name: int__union_structures__enhanced
description: |
All valid structures, with extra data:
Expand Down
155 changes: 98 additions & 57 deletions pipeline/dbt/models/intermediate/int__union_services__enhanced.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,86 +6,126 @@ structures AS (
SELECT * FROM {{ ref('int__union_structures__enhanced') }}
),

-- Departements reference model; joined on its code further down to resolve
-- the departement name used as zone_diffusion_nom.
departements AS (
SELECT * FROM {{ ref('stg_decoupage_administratif__departements') }}
),

-- Addresses from int__union_adresses__enhanced; services point to them through
-- _di_adresse_surrogate_id.
adresses AS (
SELECT * FROM {{ ref('int__union_adresses__enhanced') }}
),

departements AS (
SELECT * FROM {{ ref('stg_decoupage_administratif__departements') }}
-- Adds a "code_departement" column derived from the INSEE commune code.
-- Most codes carry the departement in their first 2 characters, but overseas
-- codes need 3: prefixes '97' (971, 972, ...) and '98' (986, 987, 988, ...)
-- would otherwise collapse distinct territories into a single 2-character code.
-- A NULL code_insee yields a NULL code_departement.
adresses_with_code_departement AS (
    SELECT
        adresses.*,
        CASE
            WHEN LEFT(adresses.code_insee, 2) IN ('97', '98') THEN LEFT(adresses.code_insee, 3)
            ELSE LEFT(adresses.code_insee, 2)
        END AS "code_departement"
    FROM adresses
),

-- TODO: Refactoring needed to be able to do geocoding per source and then use the result in the mapping
services_with_zone_diffusion AS (
-- Keeps only the services whose structure is present in the structures model
-- (inner join on the structure surrogate id).
services_with_valid_structure AS (
    SELECT svc.*
    FROM services AS svc
    INNER JOIN structures AS known_structures
        ON svc._di_structure_surrogate_id = known_structures._di_surrogate_id
),

-- For some providers, zone_diffusion_code cannot be set at the source mapping level for lack of proper codification.
-- Now that the data has been geocoded, it can be set, according to the mapped zone_diffusion_type.
-- Only the sources listed in the CASE expressions are overridden; any other source keeps
-- the zone_diffusion_code / zone_diffusion_nom it was mapped with.
-- FIXME(vperron) : ODSPEP services have such a catastrophic address columns quality
-- that trying to reuse them for the zone diffusion makes the situation worse.
zones_diffusion AS (
    SELECT
        services._di_surrogate_id    AS "_di_surrogate_id",
        services.zone_diffusion_type AS "zone_diffusion_type",
        CASE
            WHEN NOT (services.source = ANY(ARRAY['monenfant', 'action-logement', 'soliguide', 'reseau-alpha', 'mediation-numerique']))
                THEN services.zone_diffusion_code
            -- The type value is the singular 'commune' (same literal as in the
            -- relationships tests on this column); 'communes' would never match.
            WHEN services.zone_diffusion_type = 'commune' AND adresses.code_insee IS NOT NULL
                THEN adresses.code_insee
            WHEN services.zone_diffusion_type = 'departement' AND adresses.code_departement IS NOT NULL
                THEN adresses.code_departement
            -- No usable address data: fall back to the mapped value.
            ELSE services.zone_diffusion_code
        END                          AS "zone_diffusion_code",
        CASE
            WHEN NOT (services.source = ANY(ARRAY['monenfant', 'action-logement', 'soliguide', 'reseau-alpha', 'mediation-numerique']))
                THEN services.zone_diffusion_nom
            WHEN services.zone_diffusion_type = 'commune' AND adresses.commune IS NOT NULL
                THEN adresses.commune
            WHEN services.zone_diffusion_type = 'departement' AND departements.nom IS NOT NULL
                THEN departements.nom
            ELSE services.zone_diffusion_nom
        END                          AS "zone_diffusion_nom"
    FROM services_with_valid_structure AS services
    LEFT JOIN adresses_with_code_departement AS adresses
        ON services._di_adresse_surrogate_id = adresses._di_surrogate_id
    -- Left join: an address whose departement code is unknown keeps the mapped name.
    LEFT JOIN departements
        ON adresses.code_departement = departements.code
),

services_with_valid_structure AS (
SELECT services_with_zone_diffusion.*
FROM services_with_zone_diffusion
INNER JOIN structures ON services_with_zone_diffusion._di_structure_surrogate_id = structures._di_surrogate_id
-- Replaces the mapped zone_diffusion_code / zone_diffusion_nom columns with the
-- ones computed in zones_diffusion, keeping every other service column as is.
-- Rows are read from services_with_valid_structure (not from the raw services)
-- so that a service attached to an invalid structure stays excluded downstream
-- instead of passing through with NULL zone values.
services_with_zone_diffusion AS (
    SELECT
        {{
            dbt_utils.star(
                from=ref('int__union_services'),
                relation_alias='services',
                except=["zone_diffusion_code", "zone_diffusion_nom"]
            )
        }},
        zones_diffusion.zone_diffusion_code AS "zone_diffusion_code",
        zones_diffusion.zone_diffusion_nom  AS "zone_diffusion_nom"
    FROM services_with_valid_structure AS services
    LEFT JOIN zones_diffusion
        ON services._di_surrogate_id = zones_diffusion._di_surrogate_id
),

-- Runs LIST_SERVICE_ERRORS against each service (lateral call, arguments are
-- positional) and keeps the rows for which no error field comes back.
valid_services AS (
    SELECT candidates.*
    FROM services_with_zone_diffusion AS candidates
    LEFT JOIN LATERAL LIST_SERVICE_ERRORS(
        candidates.contact_public,
        candidates.contact_nom_prenom,
        candidates.courriel,
        candidates.cumulable,
        candidates.date_creation,
        candidates.date_maj,
        candidates.date_suspension,
        candidates.frais,
        candidates.frais_autres,
        candidates.id,
        candidates.justificatifs,
        candidates.lien_source,
        candidates.modes_accueil,
        candidates.modes_orientation_accompagnateur,
        candidates.modes_orientation_accompagnateur_autres,
        candidates.modes_orientation_beneficiaire,
        candidates.modes_orientation_beneficiaire_autres,
        candidates.nom,
        candidates.page_web,
        candidates.presentation_detail,
        candidates.presentation_resume,
        candidates.prise_rdv,
        candidates.profils,
        candidates.recurrence,
        candidates.source,
        candidates.structure_id,
        candidates.telephone,
        candidates.thematiques,
        candidates.types,
        candidates.zone_diffusion_code,
        candidates.zone_diffusion_nom,
        candidates.zone_diffusion_type,
        candidates.pre_requis
    ) AS errors
        ON TRUE
    -- With the left join, a service without any reported error has a NULL field.
    WHERE errors.field IS NULL
),

final AS (
SELECT
valid_services.*,
services.*,
adresses.longitude AS "longitude",
adresses.latitude AS "latitude",
adresses.complement_adresse AS "complement_adresse",
Expand All @@ -94,8 +134,9 @@ final AS (
adresses.code_postal AS "code_postal",
adresses.code_insee AS "code_insee"
FROM
valid_services
LEFT JOIN adresses ON valid_services._di_adresse_surrogate_id = adresses._di_surrogate_id
valid_services AS services
LEFT JOIN adresses_with_code_departement AS adresses
ON services._di_adresse_surrogate_id = adresses._di_surrogate_id
)

SELECT * FROM final
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@ models:
- dbt_utils.not_constant
- dbt_utils.not_empty_string

# Staging model for the EPCI reference data; its code column is the target of
# the relationships test applied to services with zone_diffusion_type = 'epci'.
- name: stg_decoupage_administratif__epcis
columns:
# EPCI code: must be present, non-empty and not the same value on every row.
- name: code
data_tests:
- not_null
- dbt_utils.not_constant
- dbt_utils.not_empty_string
# EPCI name: same sanity checks as the code.
- name: nom
data_tests:
- not_null
- dbt_utils.not_constant
- dbt_utils.not_empty_string

- name: stg_decoupage_administratif__communes
columns:
- name: code
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Staging model exposing the code and name of each EPCI from the
-- decoupage_administratif source.
WITH source AS (
    {{ stg_source_header('decoupage_administratif', 'epcis') }}
),

-- Keep only the two columns used downstream, sorted by code.
final AS (
    SELECT
        source.code AS "code",
        source.nom  AS "nom"
    FROM source
    ORDER BY source.code
)

SELECT
    final.code,
    final.nom
FROM final

0 comments on commit a4bcc01

Please sign in to comment.