Skip to content

Commit

Permalink
feat(pipeline) : Persist geocoded results when score is high
Browse files Browse the repository at this point in the history
Also, get rid of the `_di_geocodage_code_insee` column, which was misleading.
  • Loading branch information
vperron committed Aug 7, 2024
1 parent 3163e7d commit 635f016
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 30 deletions.
82 changes: 70 additions & 12 deletions pipeline/dbt/models/intermediate/int__union_adresses__enhanced.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ WITH adresses AS (
valid_adresses AS (
SELECT adresses.*
FROM adresses
LEFT JOIN LATERAL
LEFT JOIN
LATERAL
-- noqa: disable=references.qualification
LIST_ADRESSE_ERRORS(
adresse,
code_insee,
Expand All @@ -17,27 +19,83 @@ valid_adresses AS (
longitude,
source
) AS errors ON TRUE
-- noqa: enable=references.qualification
WHERE errors.field IS NULL
),

geocoded_results AS (
    -- Import CTE: exposes every column of the upstream geocoding results
    -- model under a short local name for the CTEs below.
    SELECT *
    FROM {{ ref('int_extra__geocoded_results') }}
),

final AS (
geocoded_addresses AS (
    -- Valid addresses with a trusted geocoding result (score >= 0.8) that is
    -- NOT a municipality-level match. For these rows the geocoded street
    -- name, coordinates, city, postcode and INSEE code replace the supplied
    -- values; only the identifiers and complement_adresse are kept from the
    -- input.
    SELECT
        valid_adresses._di_surrogate_id,
        valid_adresses.id,
        valid_adresses.source,
        valid_adresses.complement_adresse,
        geocoded_results.result_name AS adresse,
        geocoded_results.longitude,
        geocoded_results.latitude,
        geocoded_results.result_city AS commune,
        geocoded_results.result_postcode AS code_postal,
        geocoded_results.result_citycode AS code_insee,
        geocoded_results.result_score
    FROM valid_adresses
    -- INNER JOIN on purpose: the score filter below discards every row
    -- without a geocoding result anyway.
    INNER JOIN geocoded_results
        ON valid_adresses._di_surrogate_id = geocoded_results._di_surrogate_id
    WHERE
        geocoded_results.result_score >= 0.8
        -- The discriminator is the geocoder's result type, not the postcode
        -- (a postcode never equals 'municipality').
        -- NOTE(review): assumes int_extra__geocoded_results exposes a
        -- result_type column -- confirm against that model.
        -- IS DISTINCT FROM keeps rows whose type is NULL in this branch
        -- instead of silently dropping them from every branch.
        AND geocoded_results.result_type IS DISTINCT FROM 'municipality'
),

geocoded_cities AS (
    -- Valid addresses with a trusted geocoding result (score >= 0.8) that
    -- matched at municipality level only. The supplied street address is
    -- kept as-is; coordinates, city, postcode and INSEE code come from the
    -- geocoder.
    SELECT
        valid_adresses._di_surrogate_id,
        valid_adresses.id,
        valid_adresses.source,
        valid_adresses.complement_adresse,
        valid_adresses.adresse,
        geocoded_results.longitude,
        geocoded_results.latitude,
        geocoded_results.result_city AS commune,
        geocoded_results.result_postcode AS code_postal,
        geocoded_results.result_citycode AS code_insee,
        geocoded_results.result_score
    FROM valid_adresses
    -- INNER JOIN on purpose: see geocoded_addresses.
    INNER JOIN geocoded_results
        ON valid_adresses._di_surrogate_id = geocoded_results._di_surrogate_id
    WHERE
        geocoded_results.result_score >= 0.8
        -- Exact complement of the geocoded_addresses filter.
        -- NOTE(review): assumes a result_type column -- confirm.
        AND geocoded_results.result_type = 'municipality'
),

non_geocoded_addresses AS (
    -- Fallback branch: addresses with no geocoding result at all, or with a
    -- score below 0.8. The supplied address fields are kept untouched.
    SELECT
        src._di_surrogate_id,
        src.id,
        src.source,
        src.complement_adresse,
        src.adresse,
        CAST(src.longitude AS FLOAT) AS longitude,
        CAST(src.latitude AS FLOAT) AS latitude,
        src.commune,
        src.code_postal,
        -- A supplied INSEE code always wins. Otherwise fall back to the
        -- geocoded one, even though it may be of poor quality here: the
        -- services model needs it to establish the diffusion zones.
        COALESCE(src.code_insee, geo.result_citycode) AS code_insee,
        geo.result_score
    FROM valid_adresses AS src
    -- Genuinely outer join: rows without any geocoding result must survive.
    LEFT JOIN geocoded_results AS geo
        ON geo._di_surrogate_id = src._di_surrogate_id
    WHERE
        geo.result_score < 0.8
        OR geo.result_score IS NULL
),

final AS (
    -- Concatenates the three address branches. UNION ALL keeps every row
    -- without de-duplication and matches columns by position, so each branch
    -- spells out the same column list in the same order.
    SELECT
        _di_surrogate_id,
        id,
        source,
        complement_adresse,
        adresse,
        longitude,
        latitude,
        commune,
        code_postal,
        code_insee,
        result_score
    FROM geocoded_addresses

    UNION ALL

    SELECT
        _di_surrogate_id,
        id,
        source,
        complement_adresse,
        adresse,
        longitude,
        latitude,
        commune,
        code_postal,
        code_insee,
        result_score
    FROM geocoded_cities

    UNION ALL

    SELECT
        _di_surrogate_id,
        id,
        source,
        complement_adresse,
        adresse,
        longitude,
        latitude,
        commune,
        code_postal,
        code_insee,
        result_score
    FROM non_geocoded_addresses
)

SELECT final.*
FROM final
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ services_with_zone_diffusion AS (
SELECT
{{ dbt_utils.star(from=ref('int__union_services'), relation_alias='services', except=["zone_diffusion_code", "zone_diffusion_nom"]) }},
CASE
WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) THEN adresses.result_citycode
WHEN services.source = ANY(ARRAY['reseau-alpha', 'action-logement']) THEN LEFT(adresses.result_citycode, 2)
WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) THEN adresses.code_insee
WHEN services.source = ANY(ARRAY['reseau-alpha', 'action-logement']) THEN LEFT(adresses.code_insee, 2)
ELSE services.zone_diffusion_code
END AS "zone_diffusion_code",
CASE
WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) THEN adresses.commune
WHEN services.source = ANY(ARRAY['reseau-alpha', 'action-logement']) THEN (SELECT departements."LIBELLE" FROM departements WHERE departements."DEP" = LEFT(adresses.result_citycode, 2))
WHEN services.source = ANY(ARRAY['reseau-alpha', 'action-logement']) THEN (SELECT departements."LIBELLE" FROM departements WHERE departements."DEP" = LEFT(adresses.code_insee, 2))
WHEN services.source = 'mediation-numerique' THEN (SELECT departements."LIBELLE" FROM departements WHERE departements."DEP" = services.zone_diffusion_code)
ELSE services.zone_diffusion_nom
END AS "zone_diffusion_nom"
Expand All @@ -43,7 +43,9 @@ services_with_valid_structure AS (
valid_services AS (
SELECT services_with_valid_structure.*
FROM services_with_valid_structure
LEFT JOIN LATERAL
LEFT JOIN
LATERAL
-- noqa: disable=references.qualification
LIST_SERVICE_ERRORS(
contact_public,
contact_nom_prenom,
Expand Down Expand Up @@ -79,6 +81,7 @@ valid_services AS (
zone_diffusion_type,
pre_requis
) AS errors ON TRUE
-- noqa: enable=references.qualification
WHERE errors.field IS NULL
),

Expand All @@ -92,8 +95,7 @@ final AS (
adresses.adresse AS "adresse",
adresses.code_postal AS "code_postal",
adresses.code_insee AS "code_insee",
adresses.result_score AS "_di_geocodage_score",
adresses.result_citycode AS "_di_geocodage_code_insee"
adresses.result_score AS "_di_geocodage_score"
FROM
valid_services
LEFT JOIN adresses ON valid_services._di_adresse_surrogate_id = adresses._di_surrogate_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ adresses AS (
valid_structures AS (
SELECT structures.*
FROM structures
LEFT JOIN LATERAL
LEFT JOIN
LATERAL
-- noqa: disable=references.qualification
LIST_STRUCTURE_ERRORS(
accessibilite,
antenne,
Expand All @@ -35,6 +37,7 @@ valid_structures AS (
thematiques,
typologie
) AS errors ON TRUE
-- noqa: enable=references.qualification
WHERE errors.field IS NULL
),

Expand All @@ -49,7 +52,6 @@ final AS (
adresses.code_postal AS "code_postal",
adresses.code_insee AS "code_insee",
adresses.result_score AS "_di_geocodage_score",
adresses.result_citycode AS "_di_geocodage_code_insee",
COALESCE(plausible_personal_emails._di_surrogate_id IS NOT NULL, FALSE) AS "_di_email_is_pii"
FROM
valid_structures
Expand Down
10 changes: 0 additions & 10 deletions pipeline/dbt/models/marts/inclusion/_inclusion_models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ models:
data_type: text
constraints:
- type: primary_key
- name: _di_geocodage_code_insee
data_type: text
constraints:
- type: check
expression: '(CHECK_CODE_INSEE(_di_geocodage_code_insee))'
- name: _di_geocodage_score
data_type: float
- name: id
Expand Down Expand Up @@ -140,11 +135,6 @@ models:
- type: not_null
- type: foreign_key
expression: "public_marts.marts_inclusion__structures (_di_surrogate_id)"
- name: _di_geocodage_code_insee
data_type: text
constraints:
- type: check
expression: '(CHECK_CODE_INSEE(_di_geocodage_code_insee))'
- name: _di_geocodage_score
data_type: float
- name: id
Expand Down

0 comments on commit 635f016

Please sign in to comment.