From b9cac0cc81ea01293b49c8b83d8550a9dd40c291 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Fri, 27 Sep 2024 09:53:20 +0200 Subject: [PATCH] fix(dora): remove empty strings from all fields Also: * rewrite casts to use standard sql `CAST`. * remove redundant text array casts. * remove useless thematiques column. * enforce relationships between tables. * move source config next to staging models. --- pipeline/dbt/models/_sources.yml | 17 ---- .../staging/sources/dora/_dora__models.yml | 9 -- .../staging/sources/dora/_dora__sources.yml | 14 ++++ .../sources/dora/stg_dora__services.sql | 82 +++++++++---------- .../sources/dora/stg_dora__structures.sql | 53 ++++++------ 5 files changed, 81 insertions(+), 94 deletions(-) create mode 100644 pipeline/dbt/models/staging/sources/dora/_dora__sources.yml diff --git a/pipeline/dbt/models/_sources.yml b/pipeline/dbt/models/_sources.yml index ceb38122d..bba32d27e 100644 --- a/pipeline/dbt/models/_sources.yml +++ b/pipeline/dbt/models/_sources.yml @@ -1,11 +1,6 @@ version: 2 sources: - - name: internal - schema: public - tables: - - name: extra__geocoded_results - - name: data_inclusion schema: data_inclusion meta: @@ -27,18 +22,6 @@ sources: - name: sirene_etablissement_historique - name: sirene_etablissement_succession - - name: dora - schema: dora - meta: - is_provider: true - tables: - - name: structures - meta: - kind: structure - - name: services - meta: - kind: service - - name: france_travail schema: france_travail meta: diff --git a/pipeline/dbt/models/staging/sources/dora/_dora__models.yml b/pipeline/dbt/models/staging/sources/dora/_dora__models.yml index 2aff34436..894cf4ca0 100644 --- a/pipeline/dbt/models/staging/sources/dora/_dora__models.yml +++ b/pipeline/dbt/models/staging/sources/dora/_dora__models.yml @@ -135,10 +135,6 @@ models: - relationships: to: ref('stg_dora__structures') field: id - # Some services are associated to draft structures which are not published - # by dora on its api. Therefore some services are missing their structure. - config: - severity: warn - name: telephone data_tests: - dbt_utils.not_constant @@ -281,11 +277,6 @@ models: data_tests: - dbt_utils.not_constant - dbt_utils.not_empty_string - - name: thematiques - data_tests: - # dora's thematiques are defined at the service level - - dbt_utils.expression_is_true: - expression: "IS NULL" - name: typologie data_tests: - dbt_utils.not_constant diff --git a/pipeline/dbt/models/staging/sources/dora/_dora__sources.yml b/pipeline/dbt/models/staging/sources/dora/_dora__sources.yml new file mode 100644 index 000000000..7735d8cb5 --- /dev/null +++ b/pipeline/dbt/models/staging/sources/dora/_dora__sources.yml @@ -0,0 +1,14 @@ +version: 2 + +sources: + - name: dora + schema: dora + meta: + is_provider: true + tables: + - name: structures + meta: + kind: structure + - name: services + meta: + kind: service \ No newline at end of file diff --git a/pipeline/dbt/models/staging/sources/dora/stg_dora__services.sql b/pipeline/dbt/models/staging/sources/dora/stg_dora__services.sql index 2254a3809..c15573ac2 100644 --- a/pipeline/dbt/models/staging/sources/dora/stg_dora__services.sql +++ b/pipeline/dbt/models/staging/sources/dora/stg_dora__services.sql @@ -8,47 +8,47 @@ structures AS ( services AS ( SELECT - _di_source_id AS "_di_source_id", - (data ->> 'contact_public')::BOOLEAN AS "contact_public", - (data ->> 'cumulable')::BOOLEAN AS "cumulable", - (data ->> 'date_creation')::TIMESTAMP WITH TIME ZONE AS "date_creation", - (data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj", - (data ->> 'date_suspension')::TIMESTAMP WITH TIME ZONE AS "date_suspension", - (data ->> 'latitude')::FLOAT AS "latitude", - (data ->> 'longitude')::FLOAT AS "longitude", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil'))::TEXT [] AS "modes_accueil", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs'))::TEXT [] AS "justificatifs", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis'))::TEXT [] AS "pre_requis", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_accompagnateur'))::TEXT [] AS "modes_orientation_accompagnateur", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_beneficiaire'))::TEXT [] AS "modes_orientation_beneficiaire", - data ->> 'modes_orientation_accompagnateur_autres' AS "modes_orientation_accompagnateur_autres", - data ->> 'modes_orientation_beneficiaire_autres' AS "modes_orientation_beneficiaire_autres", - data ->> 'adresse' AS "adresse", - data ->> 'code_insee' AS "code_insee", - data ->> 'code_postal' AS "code_postal", - data ->> 'commune' AS "commune", - data ->> 'complement_adresse' AS "complement_adresse", - NULLIF(TRIM(data ->> 'contact_nom_prenom'), '') AS "contact_nom_prenom", - NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", - data ->> 'formulaire_en_ligne' AS "formulaire_en_ligne", - data ->> 'frais_autres' AS "frais_autres", - data ->> 'frais' AS "frais", - data ->> 'id' AS "id", - data ->> 'lien_source' AS "lien_source", - data ->> 'nom' AS "nom", - data ->> 'presentation_resume' AS "presentation_resume", - data ->> 'presentation_detail' AS "presentation_detail", - NULLIF(TRIM(data ->> 'prise_rdv'), '') AS "prise_rdv", - data ->> 'recurrence' AS "recurrence", - data ->> 'source' AS "source", - data ->> 'structure_id' AS "structure_id", - NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", - NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code", - NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom", - data ->> 'zone_diffusion_type' AS "zone_diffusion_type" + _di_source_id AS "_di_source_id", + CAST((data ->> 'contact_public') AS BOOLEAN) AS "contact_public", + CAST((data ->> 'cumulable') AS BOOLEAN) AS "cumulable", + CAST((data ->> 'date_creation') AS TIMESTAMP WITH TIME ZONE) AS "date_creation", + CAST((data ->> 'date_maj') AS TIMESTAMP WITH TIME ZONE) AS "date_maj", + CAST((data ->> 'date_suspension') AS TIMESTAMP WITH TIME ZONE) AS "date_suspension", + CAST((data ->> 'latitude') AS FLOAT) AS "latitude", + CAST((data ->> 'longitude') AS FLOAT) AS "longitude", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil')) AS "modes_accueil", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils')) AS "profils", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques')) AS "thematiques", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types')) AS "types", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs')) AS "justificatifs", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis')) AS "pre_requis", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_accompagnateur')) AS "modes_orientation_accompagnateur", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_beneficiaire')) AS "modes_orientation_beneficiaire", + NULLIF(TRIM(data ->> 'modes_orientation_accompagnateur_autres'), '') AS "modes_orientation_accompagnateur_autres", + NULLIF(TRIM(data ->> 'modes_orientation_beneficiaire_autres'), '') AS "modes_orientation_beneficiaire_autres", + NULLIF(TRIM(data ->> 'adresse'), '') AS "adresse", + NULLIF(TRIM(data ->> 'code_insee'), '') AS "code_insee", + NULLIF(TRIM(data ->> 'code_postal'), '') AS "code_postal", + NULLIF(TRIM(data ->> 'commune'), '') AS "commune", + NULLIF(TRIM(data ->> 'complement_adresse'), '') AS "complement_adresse", + NULLIF(TRIM(data ->> 'contact_nom_prenom'), '') AS "contact_nom_prenom", + NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", + NULLIF(TRIM(data ->> 'formulaire_en_ligne'), '') AS "formulaire_en_ligne", + NULLIF(TRIM(data ->> 'frais_autres'), '') AS "frais_autres", + NULLIF(TRIM(data ->> 'frais'), '') AS "frais", + NULLIF(TRIM(data ->> 'id'), '') AS "id", + NULLIF(TRIM(data ->> 'lien_source'), '') AS "lien_source", + NULLIF(TRIM(data ->> 'nom'), '') AS "nom", + NULLIF(TRIM(data ->> 'presentation_resume'), '') AS "presentation_resume", + NULLIF(TRIM(data ->> 'presentation_detail'), '') AS "presentation_detail", + NULLIF(TRIM(data ->> 'prise_rdv'), '') AS "prise_rdv", + NULLIF(TRIM(data ->> 'recurrence'), '') AS "recurrence", + NULLIF(TRIM(data ->> 'source'), '') AS "source", + NULLIF(TRIM(data ->> 'structure_id'), '') AS "structure_id", + NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", + NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code", + NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom", + NULLIF(TRIM(data ->> 'zone_diffusion_type'), '') AS "zone_diffusion_type" FROM source ), diff --git a/pipeline/dbt/models/staging/sources/dora/stg_dora__structures.sql b/pipeline/dbt/models/staging/sources/dora/stg_dora__structures.sql index 5c510e587..7e0e6b0f9 100644 --- a/pipeline/dbt/models/staging/sources/dora/stg_dora__structures.sql +++ b/pipeline/dbt/models/staging/sources/dora/stg_dora__structures.sql @@ -4,33 +4,32 @@ WITH source AS ( final AS ( SELECT - _di_source_id AS "_di_source_id", - (data ->> 'antenne')::BOOLEAN AS "antenne", - (data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_autres'))::TEXT [] AS "labels_autres", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_nationaux'))::TEXT [] AS "labels_nationaux", - (data ->> 'latitude')::FLOAT AS "latitude", - (data ->> 'longitude')::FLOAT AS "longitude", - (data ->> 'thematiques')::TEXT [] AS "thematiques", - data ->> 'accessibilite' AS "accessibilite", - data ->> 'adresse' AS "adresse", - data ->> 'code_insee' AS "code_insee", - data ->> 'code_postal' AS "code_postal", - data ->> 'commune' AS "commune", - NULLIF(TRIM(data ->> 'complement_adresse'), '') AS "complement_adresse", - NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", - data ->> 'horaires_ouverture' AS "horaires_ouverture", - data ->> 'id' AS "id", - data ->> 'lien_source' AS "lien_source", - NULLIF(TRIM(data ->> 'nom'), '') AS "nom", - data ->> 'presentation_detail' AS "presentation_detail", - data ->> 'presentation_resume' AS "presentation_resume", - data ->> 'rna' AS "rna", - data ->> 'siret' AS "siret", - data ->> 'site_web' AS "site_web", - data ->> 'source' AS "source", - data ->> 'telephone' AS "telephone", - NULLIF(TRIM(data ->> 'typologie'), '') AS "typologie" + _di_source_id AS "_di_source_id", + CAST((data ->> 'antenne') AS BOOLEAN) AS "antenne", + CAST((data ->> 'date_maj') AS TIMESTAMP WITH TIME ZONE) AS "date_maj", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_autres')) AS "labels_autres", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_nationaux')) AS "labels_nationaux", + CAST((data ->> 'latitude') AS FLOAT) AS "latitude", + CAST((data ->> 'longitude') AS FLOAT) AS "longitude", + NULLIF(TRIM(data ->> 'accessibilite'), '') AS "accessibilite", + NULLIF(TRIM(data ->> 'adresse'), '') AS "adresse", + NULLIF(TRIM(data ->> 'code_insee'), '') AS "code_insee", + NULLIF(TRIM(data ->> 'code_postal'), '') AS "code_postal", + NULLIF(TRIM(data ->> 'commune'), '') AS "commune", + NULLIF(TRIM(data ->> 'complement_adresse'), '') AS "complement_adresse", + NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", + NULLIF(TRIM(data ->> 'horaires_ouverture'), '') AS "horaires_ouverture", + NULLIF(TRIM(data ->> 'id'), '') AS "id", + NULLIF(TRIM(data ->> 'lien_source'), '') AS "lien_source", + NULLIF(TRIM(data ->> 'nom'), '') AS "nom", + NULLIF(TRIM(data ->> 'presentation_detail'), '') AS "presentation_detail", + NULLIF(TRIM(data ->> 'presentation_resume'), '') AS "presentation_resume", + NULLIF(TRIM(data ->> 'rna'), '') AS "rna", + NULLIF(TRIM(data ->> 'siret'), '') AS "siret", + NULLIF(TRIM(data ->> 'site_web'), '') AS "site_web", + NULLIF(TRIM(data ->> 'source'), '') AS "source", + NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", + NULLIF(TRIM(data ->> 'typologie'), '') AS "typologie" FROM source )