Skip to content

Commit

Permalink
feat(pipeline): First version of a DBT model compiling the stats
Browse files Browse the repository at this point in the history
First working version before we attempt to go further (snapshotting,
improvements on ODSPEP or other sources, per-column value counts,
etc.)

data-inclusion=# SELECT * FROM public_intermediate.int_quality__stats ORDER BY source, stream;
  date_day  |        source         |         stream         | count_raw | count_stg | count_int | count_marts | count_api | count_contacts | count_addresses
------------+-----------------------+------------------------+-----------+-----------+-----------+-------------+-----------+----------------+-----------------
 2024-08-02 | action_logement       | services               |        26 |        23 |      2760 |        2760 |      2760 |              0 |            2760
 2024-08-02 | action_logement       | structures             |       123 |       120 |       120 |         120 |       120 |              0 |             120
 2024-08-02 | agefiph               | services               |        31 |        31 |        27 |          27 |        27 |              0 |              27
 2024-08-02 | cd35                  | organisations          |      3545 |      3545 |      3545 |        3544 |      3540 |           2594 |            3422
 2024-08-02 | cd72                  | services               |       474 |       463 |       463 |         260 |         0 |              0 |               0
 2024-08-02 | cd72                  | structures             |       217 |       217 |       217 |         213 |       457 |             41 |             394
 2024-08-02 | data_inclusion        | services               |        47 |        44 |        44 |          44 |        44 |             17 |              21
 2024-08-02 | data_inclusion        | structures             |        22 |        19 |        19 |          19 |        19 |              0 |              19
 2024-08-02 | dora                  | services               |     17717 |     11707 |     11707 |       11036 |     11034 |           8430 |           10160
 2024-08-02 | dora                  | structures             |      8554 |      8554 |      8554 |        8545 |      8538 |           3260 |            8342
 2024-08-02 | emplois_de_linclusion | organisations          |      8589 |      8589 |     15824 |       15821 |     15821 |           7041 |           15712
 2024-08-02 | emplois_de_linclusion | siaes                  |      7235 |      7235 |     15824 |       15821 |     15821 |           7041 |           15712
 2024-08-02 | france_travail        | agences                |       888 |       888 |       888 |         888 |       886 |              0 |             886
 2024-08-02 | france_travail        | services               |        28 |        25 |     22200 |       22200 |     22150 |              0 |           22150
 2024-08-02 | mediation_numerique   | services               |     20798 |     19445 |     19445 |       19424 |     19417 |           8158 |           19417
 2024-08-02 | mediation_numerique   | structures             |     20798 |     19445 |     19445 |       19424 |     19417 |           8158 |           19417
 2024-08-02 | mes_aides             | aides                  |       690 |       690 |       960 |         947 |       916 |            217 |             914
 2024-08-02 | mes_aides             | garages                |       908 |       908 |       870 |         863 |       847 |            163 |             845
 2024-08-02 | monenfant             | creches                |     84597 |     13481 |     13481 |       13479 |     13466 |          11098 |           13466
 2024-08-02 | odspep                | DD009_RES_PARTENARIALE |     28262 |      6438 |      9001 |        8717 |      8618 |              0 |            6591
 2024-08-02 | reseau_alpha          | formations             |       388 |       388 |       388 |         388 |       297 |            239 |             269
 2024-08-02 | reseau_alpha          | structures             |       757 |       757 |       757 |         752 |       752 |            538 |             667
 2024-08-02 | soliguide             | lieux                  |     22028 |     22028 |     22028 |       22028 |     21980 |              0 |           21980
  • Loading branch information
vperron committed Aug 5, 2024
1 parent 67baddb commit 3600b92
Show file tree
Hide file tree
Showing 3 changed files with 285 additions and 0 deletions.
75 changes: 75 additions & 0 deletions pipeline/dbt/models/_sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@ sources:

- name: data_inclusion
schema: data_inclusion
meta:
is_provider: true
tables:
- name: structures
description: Entered by the data.inclusion team.
meta:
kind: structure
- name: services
description: Entered by the data.inclusion team.
meta:
kind: service

- name: insee
schema: insee
Expand All @@ -26,15 +32,27 @@ sources:

- name: dora
schema: dora
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: services
meta:
kind: service

- name: france_travail
schema: france_travail
meta:
is_provider: true
tables:
- name: agences
meta:
kind: structure
- name: services
meta:
kind: service

- name: finess
schema: finess
Expand All @@ -54,9 +72,15 @@ sources:

- name: mes_aides
schema: mes_aides
meta:
is_provider: true
tables:
- name: garages
meta:
kind: structure
- name: aides
meta:
kind: service

- name: annuaire_du_service_public
schema: annuaire_du_service_public
Expand All @@ -65,29 +89,53 @@ sources:

- name: cd35
schema: cd35
meta:
is_provider: true
tables:
- name: organisations
meta:
kind: structure

- name: cd72
schema: cd72
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: services
meta:
kind: service

- name: emplois_de_linclusion
schema: emplois_de_linclusion
meta:
is_provider: true
tables:
- name: siaes
meta:
kind: structure
- name: organisations
meta:
kind: structure

- name: mediation_numerique
schema: mediation_numerique
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: services
meta:
kind: service

- name: odspep
schema: odspep
meta:
is_provider: true
tables:
- name: DD009_ACTIONs_DEMARCHES
- name: DD009_ADRESSE
Expand All @@ -110,27 +158,48 @@ sources:
- name: DD009_REGION_RESSOURCE_2
- name: DD009_REGION_SUGGESTION
- name: DD009_RES_PARTENARIALE
meta:
kind: service
staging_name: res_partenariales

- name: soliguide
schema: soliguide
meta:
is_provider: true
tables:
- name: lieux
meta:
kind: structure

- name: monenfant
schema: monenfant
meta:
is_provider: true
tables:
- name: creches
meta:
kind: structure

- name: agefiph
schema: agefiph
meta:
is_provider: true
tables:
- name: services
meta:
kind: service

- name: reseau_alpha
schema: reseau_alpha
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: formations
meta:
kind: service

- name: brevo
schema: brevo
Expand All @@ -144,6 +213,12 @@ sources:

- name: action_logement
schema: action_logement
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: services
meta:
kind: service
91 changes: 91 additions & 0 deletions pipeline/dbt/models/intermediate/quality/_quality_models.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Schema tests for the int_quality__stats model, which reports row counts
# per provider source and stream at each stage of the pipeline.
version: 2

models:
- name: int_quality__stats
data_tests:
# Each (source, stream) pair must appear at most once in the model.
- dbt_utils.unique_combination_of_columns:
combination_of_columns:
- source
- stream

columns:
# Provider name; must be one of the sources listed below.
- name: source
data_tests:
- not_null
- dbt_utils.not_constant
- accepted_values:
values:
- action_logement
- agefiph
- cd35
- cd72
- data_inclusion
- dora
- emplois_de_linclusion
- france_travail
- mediation_numerique
- mes_aides
- monenfant
- odspep
- reseau_alpha
- soliguide

# Name of the raw table the counts are computed for; must be one of the
# streams listed below.
- name: stream
data_tests:
- not_null
- dbt_utils.not_constant
- accepted_values:
values:
- agences
- aides
- creches
- DD009_RES_PARTENARIALE
- formations
- garages
- lieux
- organisations
- services
- siaes
- structures

# The five per-stage counts below use an exclusive lower bound of 0
# (inclusive: false), i.e. a count of zero fails the test.
- name: count_raw
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: false

- name: count_stg
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: false

- name: count_int
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: false

- name: count_marts
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: false

- name: count_api
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: false

# The two counts below use an inclusive lower bound of 0 (inclusive: true),
# i.e. zero is accepted.
- name: count_contacts
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: true

- name: count_addresses
data_tests:
- dbt_utils.accepted_range:
min_value: 0
inclusive: true
119 changes: 119 additions & 0 deletions pipeline/dbt/models/intermediate/quality/int_quality__stats.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/* Included : all the providers that do actually end up in the marts */

-- depends_on: {{ ref('stg_action_logement__services') }}
-- depends_on: {{ ref('stg_action_logement__structures') }}
-- depends_on: {{ ref('int_action_logement__services') }}
-- depends_on: {{ ref('int_action_logement__structures') }}
-- depends_on: {{ ref('stg_agefiph__services') }}
-- depends_on: {{ ref('int_agefiph__services') }}
-- depends_on: {{ ref('stg_cd35__organisations') }}
-- depends_on: {{ ref('int_cd35__structures') }}
-- depends_on: {{ ref('stg_cd72__services') }}
-- depends_on: {{ ref('stg_cd72__structures') }}
-- depends_on: {{ ref('int_cd72__services') }}
-- depends_on: {{ ref('int_cd72__structures') }}
-- depends_on: {{ ref('stg_data_inclusion__services') }}
-- depends_on: {{ ref('stg_data_inclusion__structures') }}
-- depends_on: {{ ref('int_data_inclusion__services') }}
-- depends_on: {{ ref('int_data_inclusion__structures') }}
-- depends_on: {{ ref('stg_dora__services') }}
-- depends_on: {{ ref('stg_dora__structures') }}
-- depends_on: {{ ref('int_dora__services') }}
-- depends_on: {{ ref('int_dora__structures') }}
-- depends_on: {{ ref('stg_emplois_de_linclusion__organisations') }}
-- depends_on: {{ ref('stg_emplois_de_linclusion__siaes') }}
-- depends_on: {{ ref('int_emplois_de_linclusion__structures') }}
-- depends_on: {{ ref('stg_france_travail__agences') }}
-- depends_on: {{ ref('stg_france_travail__services') }}
-- depends_on: {{ ref('int_france_travail__services') }}
-- depends_on: {{ ref('int_france_travail__structures') }}
-- depends_on: {{ ref('stg_mediation_numerique__services') }}
-- depends_on: {{ ref('stg_mediation_numerique__structures') }}
-- depends_on: {{ ref('int_mediation_numerique__services') }}
-- depends_on: {{ ref('int_mediation_numerique__structures') }}
-- depends_on: {{ ref('stg_mes_aides__aides') }}
-- depends_on: {{ ref('stg_mes_aides__garages') }}
-- depends_on: {{ ref('int_mes_aides__services') }}
-- depends_on: {{ ref('int_mes_aides__structures') }}
-- depends_on: {{ ref('stg_monenfant__creches') }}
-- depends_on: {{ ref('int_monenfant__structures') }}
-- depends_on: {{ ref('stg_odspep__res_partenariales') }}
-- depends_on: {{ ref('int_odspep__services') }}
-- depends_on: {{ ref('stg_reseau_alpha__formations') }}
-- depends_on: {{ ref('stg_reseau_alpha__structures') }}
-- depends_on: {{ ref('int_reseau_alpha__services') }}
-- depends_on: {{ ref('int_reseau_alpha__structures') }}
-- depends_on: {{ ref('stg_soliguide__lieux') }}
-- depends_on: {{ ref('int_soliguide__structures') }}
-- depends_on: {{ ref('marts_inclusion__services') }}
-- depends_on: {{ ref('marts_inclusion__structures') }}

{#
    Builds one statistics row per provider stream.

    A stream is a source table that belongs to a source flagged
    `meta.is_provider` and that carries a `meta.kind` ("structure" or
    "service"). For each stream, five CTEs are generated:
      * <source>__<stream>__tmp_marts         : marts rows of that kind for the source
      * <source>__<stream>__tmp_api           : API rows of that kind for the source
      * <source>__<stream>__tmp_api_contacts  : API rows with contact details
      * <source>__<stream>__tmp_api_adresse   : API rows with an address
      * <source>__<stream>__stats             : the single row of counts

    The `kind` test lives in the `for` filter rather than in a nested `if`:
    Jinja computes `loop.last` over the filtered sequence, so the
    `UNION ALL` separator in `final` stays correct even when the last
    provider table iterated has no `kind`.
#}
WITH

{% for source_node in graph.sources.values() if source_node.source_meta.is_provider and source_node.meta.kind %}

    {% set source_name = source_node.source_name %}
    {# marts and API rows identify the source with hyphens instead of underscores #}
    {% set source_slug = source_name | replace("_", "-") %}
    {% set stream_name = source_node.name %}
    {# `meta.staging_name` overrides the stream name when the staging model is named differently #}
    {% set staging_name = source_node.meta.staging_name or stream_name %}
    {# "structure" -> "structures", "service" -> "services" #}
    {% set stream_kind = source_node.meta.kind ~ "s" %}

    {{ source_name }}__{{ stream_name }}__tmp_marts AS (
        SELECT *
        FROM {{ ref('marts_inclusion__' ~ stream_kind) }}
        WHERE source = '{{ source_slug }}'
    ),

    -- NOTE(review): the API table is addressed by its hard-coded name rather
    -- than through a dbt reference; presumably it is built outside of this
    -- project. Confirm.
    {{ source_name }}__{{ stream_name }}__tmp_api AS (
        SELECT *
        FROM public.api__{{ stream_kind }}
        WHERE source = '{{ source_slug }}'
    ),

    -- API rows providing both an email address and a phone number.
    -- NOTE(review): confirm that both are required (AND) rather than either (OR).
    {{ source_name }}__{{ stream_name }}__tmp_api_contacts AS (
        SELECT *
        FROM {{ source_name }}__{{ stream_name }}__tmp_api
        WHERE
            courriel IS NOT NULL
            AND telephone IS NOT NULL
    ),

    -- API rows providing both a street address and an INSEE code.
    -- The predicate must not be negated: this CTE feeds count_addresses,
    -- which counts rows that HAVE an address.
    {{ source_name }}__{{ stream_name }}__tmp_api_adresse AS (
        SELECT *
        FROM {{ source_name }}__{{ stream_name }}__tmp_api
        WHERE
            adresse IS NOT NULL
            AND code_insee IS NOT NULL
    ),

    -- One row holding the counts of this stream at every pipeline stage.
    {{ source_name }}__{{ stream_name }}__stats AS (
        SELECT
            '{{ run_started_at.strftime("%Y-%m-%d") }}' AS date_day,
            '{{ source_name }}' AS source,
            '{{ stream_name }}' AS stream, -- noqa: references.keywords
            (SELECT COUNT(*) FROM {{ source(source_name, stream_name) }}) AS count_raw,
            (SELECT COUNT(*) FROM {{ ref('stg_' ~ source_name ~ '__' ~ staging_name) }}) AS count_stg,
            (SELECT COUNT(*) FROM {{ ref('int_' ~ source_name ~ '__' ~ stream_kind) }}) AS count_int,
            (SELECT COUNT(*) FROM {{ source_name }}__{{ stream_name }}__tmp_marts) AS count_marts,
            (SELECT COUNT(*) FROM {{ source_name }}__{{ stream_name }}__tmp_api) AS count_api,
            (SELECT COUNT(*) FROM {{ source_name }}__{{ stream_name }}__tmp_api_contacts) AS count_contacts,
            (SELECT COUNT(*) FROM {{ source_name }}__{{ stream_name }}__tmp_api_adresse) AS count_addresses
    ),

{% endfor %}

-- Stacks the per-stream rows into a single result set.
final AS (

    {% for source_node in graph.sources.values() if source_node.source_meta.is_provider and source_node.meta.kind %}

        SELECT * FROM {{ source_node.source_name }}__{{ source_node.name }}__stats
        {% if not loop.last %}
            UNION ALL
        {% endif %}

    {% endfor %}

)

SELECT * FROM final

0 comments on commit 3600b92

Please sign in to comment.