From 305681145c94957d2e737af3d7b6708ce3f2ec02 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 29 Aug 2023 10:04:00 +0200 Subject: [PATCH 01/34] fix(dbt) --- pipeline/dbt/models/marts/api/api_structure.sql | 2 ++ pipeline/dbt/models/marts/opendata/opendata_services.sql | 4 ++-- pipeline/dbt/models/marts/opendata/opendata_structures.sql | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pipeline/dbt/models/marts/api/api_structure.sql b/pipeline/dbt/models/marts/api/api_structure.sql index ff8bf8da6..d5acb82ae 100644 --- a/pipeline/dbt/models/marts/api/api_structure.sql +++ b/pipeline/dbt/models/marts/api/api_structure.sql @@ -9,6 +9,8 @@ final AS ( relation_alias='structures', from=ref('int__union_structures__enhanced'), except=[ + '_di_sirene_date_fermeture', + '_di_sirene_etab_successeur', '_di_adresse_surrogate_id', '_di_annotated_antenne', '_di_annotated_siret', diff --git a/pipeline/dbt/models/marts/opendata/opendata_services.sql b/pipeline/dbt/models/marts/opendata/opendata_services.sql index 425f4db1a..f6b7b577b 100644 --- a/pipeline/dbt/models/marts/opendata/opendata_services.sql +++ b/pipeline/dbt/models/marts/opendata/opendata_services.sql @@ -1,5 +1,5 @@ WITH services AS ( - SELECT * FROM {{ ref('int__union_services') }} + SELECT * FROM {{ ref('int__union_services__enhanced') }} ), final AS ( @@ -7,7 +7,7 @@ final AS ( {{ dbt_utils.star( relation_alias='services', - from=ref('int__union_services'), + from=ref('int__union_services__enhanced'), except=['courriel', 'telephone']) }}, {{ obfuscate('courriel') }} AS "courriel", diff --git a/pipeline/dbt/models/marts/opendata/opendata_structures.sql b/pipeline/dbt/models/marts/opendata/opendata_structures.sql index 296c810a1..194b62662 100644 --- a/pipeline/dbt/models/marts/opendata/opendata_structures.sql +++ b/pipeline/dbt/models/marts/opendata/opendata_structures.sql @@ -1,5 +1,5 @@ WITH structures AS ( - SELECT * FROM {{ ref('int__union_structures') }} + SELECT * FROM {{ ref('int__union_structures__enhanced') }} ), final AS ( @@ -8,7 +8,7 @@ final AS ( {{ dbt_utils.star( relation_alias='structures', - from=ref('int__union_structures'), + from=ref('int__union_structures__enhanced'), except=['courriel', 'telephone']) }}, CASE From d582faee01e33996c57defa6e95cbbd6c6e1f9d3 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 29 Aug 2023 11:44:08 +0200 Subject: [PATCH 02/34] fix(dbt): tests --- .../models/intermediate/agefiph/int_agefiph__services.sql | 4 ++-- .../int_immersion_facilitee__adresses.sql | 6 +++--- .../int_mediation_numerique__services.sql | 8 ++++---- .../int_mediation_numerique__structures.sql | 2 +- .../models/intermediate/monenfant/_monenfant__models.yml | 4 ++-- .../models/intermediate/odspep/int_odspep__services.sql | 6 +++--- .../models/intermediate/odspep/int_odspep__structures.sql | 2 +- .../intermediate/soliguide/int_soliguide__services.sql | 6 +++--- .../intermediate/soliguide/int_soliguide__structures.sql | 2 +- .../dbt/models/staging/siao/stg_siao__etablissements.sql | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql b/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql index daadc3aa2..308340b26 100644 --- a/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql +++ b/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql @@ -54,8 +54,6 @@ final AS ( TRUE AS "contact_public", NULL AS "contact_nom_prenom", structures.courriel AS "courriel", - NULL AS 
"cumulable", - NULL AS "date_suspension", NULL AS "formulaire_en_ligne", NULL AS "frais_autres", NULL AS "justificatifs", @@ -70,6 +68,8 @@ final AS ( regions."LIBELLE" AS "zone_diffusion_nom", 'region' AS "zone_diffusion_type", NULL AS "pre_requis", + CAST(NULL AS BOOLEAN) AS "cumulable", + CAST(NULL AS DATE) AS "date_suspension", 'https://www.agefiph.fr' || services.attributes__path__alias AS "lien_source", CAST(services.attributes__created AS DATE) AS "date_creation", CAST(services.attributes__changed AS DATE) AS "date_maj", diff --git a/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql b/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql index 978ed7a99..61f96f5e4 100644 --- a/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql +++ b/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql @@ -5,14 +5,14 @@ WITH structures AS ( final AS ( SELECT id AS "id", - NULL AS "longitude", - NULL AS "latitude", 'immersion-facilitee' AS "source", NULL AS "complement_adresse", city AS "commune", street_number_and_address AS "adresse", post_code AS "code_postal", - NULL AS "code_insee" + NULL AS "code_insee", + CAST(NULL AS FLOAT) AS "longitude", + CAST(NULL AS FLOAT) AS "latitude" FROM structures ) diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql index dd8b7c615..225f4026a 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql +++ b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql @@ -112,18 +112,18 @@ final AS ( services.thematiques AS "thematiques", services._di_source_id AS "source", NULL AS "pre_requis", - NULL AS "cumulable", + CAST(NULL AS BOOLEAN) AS "cumulable", NULL AS "justificatifs", NULL AS "formulaire_en_ligne", NULL AS "recurrence", - NULL AS "date_creation", - NULL AS "date_suspension", + CAST(NULL AS DATE) AS "date_creation", + CAST(NULL AS DATE) AS "date_suspension", NULL AS "lien_source", structures.telephone AS "telephone", structures.courriel AS "courriel", TRUE AS "contact_public", NULL AS "contact_nom_prenom", - structures.date_maj AS "date_maj", + CAST(structures.date_maj AS DATE) AS "date_maj", NULL AS "zone_diffusion_type", NULL AS "zone_diffusion_code", NULL AS "zone_diffusion_nom", diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql index e21b877a7..1534e6bc3 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql +++ b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql @@ -58,7 +58,7 @@ final AS ( typologie AS "typologie", presentation_resume AS "presentation_resume", {{ truncate_text("presentation_detail") }} AS "presentation_detail", - date_maj AS "date_maj", + CAST(date_maj AS DATE) AS "date_maj", _di_source_id AS "source", labels_autres AS "labels_autres", CAST(NULL AS BOOLEAN) AS "antenne" diff --git a/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml b/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml index b16800599..c1dc583d7 100644 --- a/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml +++ 
b/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml @@ -9,12 +9,12 @@ models: - name: int_monenfant__structures tests: - - check_service: + - check_structure: config: severity: warn - name: int_monenfant__services tests: - - check_structure: + - check_service: config: severity: warn \ No newline at end of file diff --git a/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql b/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql index fd07fbc83..2c6885131 100644 --- a/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql +++ b/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql @@ -25,16 +25,16 @@ final AS ( NULL::TEXT [] AS "frais", NULL AS "frais_autres", NULL AS "pre_requis", - NULL AS "cumulable", + NULL::BOOLEAN AS "cumulable", NULL AS "justificatifs", NULL AS "formulaire_en_ligne", NULL AS "recurrence", - NULL AS "date_creation", + NULL::DATE AS "date_creation", date_fin_valid AS "date_suspension", NULL AS "lien_source", NULL AS "telephone", NULL AS "courriel", - NULL AS "contact_public", + NULL::BOOLEAN AS "contact_public", NULL AS "contact_nom_prenom", date_derniere_modif AS "date_maj", NULL::TEXT [] AS "modes_accueil", diff --git a/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql b/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql index 5c2f0dd8c..21410e3b5 100644 --- a/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql +++ b/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql @@ -8,7 +8,7 @@ final AS ( SELECT DISTINCT ON (1) id_res AS "id", id_res AS "adresse_id", - NULL AS "antenne", + NULL::BOOLEAN AS "antenne", NULL AS "rna", 'odspep' AS "source", NULL AS "horaires_ouverture", diff --git a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql index 8c70324c4..174845b09 100644 --- a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql +++ b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql @@ -118,11 +118,11 @@ final AS ( NULL AS "pre_requis", TRUE AS "cumulable", NULL AS "justificatifs", - NULL AS "date_creation", - NULL AS "date_suspension", + NULL::DATE AS "date_creation", + NULL::DATE AS "date_suspension", filtered_phones.phone_number AS "telephone", lieux.entity_mail AS "courriel", - NULL AS "contact_public", + NULL::BOOLEAN AS "contact_public", NULL AS "contact_nom_prenom", open_services.updated_at AS "date_maj", 'commune' AS "zone_diffusion_type", diff --git a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql index 5249ff9db..f48e6123c 100644 --- a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql +++ b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql @@ -12,7 +12,7 @@ final AS ( SELECT lieux.lieu_id AS "id", lieux.lieu_id AS "adresse_id", - NULL AS "antenne", + NULL::BOOLEAN AS "antenne", NULL AS "rna", 'soliguide' AS "source", NULL AS "accessibilite", diff --git a/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql b/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql index 86e6e5c18..366343d86 100644 --- a/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql +++ b/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql @@ -7,7 +7,7 @@ final AS ( _di_source_id AS "_di_source_id", -- there is no proper index in the data, this is very 
problematic. -- for analytical use, annotate with the row number if the default ordering. - ROW_NUMBER() OVER () AS "id", + CAST(ROW_NUMBER() OVER () AS TEXT) AS "id", NULLIF(NULLIF(REGEXP_REPLACE(data ->> 'Code SIRET', '\D', '', 'g'), REPEAT('0', 14)), '') AS "code_siret", data ->> 'Nom de la structure' AS "nom_de_la_structure", data ->> 'Ville' AS "ville", From 84ea480f5a713f93923401c18948b94b429ec335 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 09:41:40 +0200 Subject: [PATCH 03/34] docs --- pipeline/CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/CONTRIBUTING.md b/pipeline/CONTRIBUTING.md index 5d75d51dc..0a790e9cf 100644 --- a/pipeline/CONTRIBUTING.md +++ b/pipeline/CONTRIBUTING.md @@ -80,8 +80,8 @@ To update the constraints and upgrade the requirements: ```bash # optionally bump the airflow version -export AIRFLOW_VERSION= -export PYTHON_VERSION=3.10 +AIRFLOW_VERSION= +PYTHON_VERSION=3.10 curl https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt > requirements/airflow/constraints.txt pip-compile --resolver=backtracking --upgrade requirements/airflow/requirements.in --output-file requirements/airflow/requirements.txt ``` From acb5b51abb39e8566dd16137e8354b7ee68d8543 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 09:55:49 +0200 Subject: [PATCH 04/34] chore(pipeline): bump airflow --- pipeline/Dockerfile | 2 +- pipeline/requirements/airflow/base.in | 4 +- pipeline/requirements/airflow/constraints.txt | 759 +++++++++--------- .../requirements/airflow/requirements.txt | 226 ++++-- 4 files changed, 555 insertions(+), 436 deletions(-) diff --git a/pipeline/Dockerfile b/pipeline/Dockerfile index cc9c2649a..4206ba85e 100644 --- a/pipeline/Dockerfile +++ b/pipeline/Dockerfile @@ -41,7 +41,7 @@ RUN "${VIRTUAL_ENV}/bin/python" -m pip install --no-cache-dir -r requirements/ta ######## # This image is the runtime ######## -FROM apache/airflow:2.6.1-python3.10 +FROM apache/airflow:2.7.0-python3.10 ENV PYTHONUNBUFFERED 1 ENV PYTHONDONTWRITEBYTECODE 1 diff --git a/pipeline/requirements/airflow/base.in b/pipeline/requirements/airflow/base.in index 365d31148..302eb5ca6 100644 --- a/pipeline/requirements/airflow/base.in +++ b/pipeline/requirements/airflow/base.in @@ -1,2 +1,2 @@ -apache-airflow[amazon,postgres]==2.6.1 -psycopg2 \ No newline at end of file +apache-airflow[amazon,postgres]~=2.7.0 +psycopg2~=2.9.7 \ No newline at end of file diff --git a/pipeline/requirements/airflow/constraints.txt b/pipeline/requirements/airflow/constraints.txt index c5e06c2fe..a2dab6615 100644 --- a/pipeline/requirements/airflow/constraints.txt +++ b/pipeline/requirements/airflow/constraints.txt @@ -1,6 +1,6 @@ # -# This constraints file was automatically generated on 2023-05-15T11:02:06Z -# via "eager-upgrade" mechanism of PIP. For the "v2-6-test" branch of Airflow. +# This constraints file was automatically generated on 2023-08-18T14:48:29Z +# via "eager-upgrade" mechanism of PIP. For the "v2-7-test" branch of Airflow. # This variant of constraints install uses the HEAD of the branch version for 'apache-airflow' but installs # the providers from PIP-released packages at the moment of the constraint generation. # @@ -8,159 +8,192 @@ # We also use those constraints after "apache-airflow" is released and the constraints are tagged with # "constraints-X.Y.Z" tag to build the production image for that version. 
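# As a concrete illustration for this repository (a minimal sketch only — the
# extras and version range are the ones pinned in requirements/airflow/base.in,
# and the canonical update workflow is the one shown in pipeline/CONTRIBUTING.md):
#
#   pip install "apache-airflow[amazon,postgres]~=2.7.0" \
#       --constraint requirements/airflow/constraints.txt
#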
# -Authlib==1.2.0 +# This constraints file is meant to be used only in the "apache-airflow" installation command and not +# in all subsequent pip commands. By using a constraints.txt file, we ensure that solely the Airflow +# installation step is reproducible. Subsequent pip commands may install packages that would have +# been incompatible with the constraints used in Airflow reproducible installation step. Finally, pip +# commands that might change the installed version of apache-airflow should include "apache-airflow==X.Y.Z" +# in the list of install targets to prevent Airflow accidental upgrade or downgrade. +# +# Typical installation process of airflow for Python 3.8 is (with random selection of extras and custom +# dependencies added), usually consists of two steps: +# +# 1. Reproducible installation of airflow with selected providers (note constraints are used): +# +# pip install "apache-airflow[celery,cncf.kubernetes,google,amazon,snowflake]==X.Y.Z" \ +# --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-X.Y.Z/constraints-3.8.txt" +# +# 2. Installing own dependencies that are potentially not matching the constraints (note constraints are not +# used, and apache-airflow==X.Y.Z is used to make sure there is no accidental airflow upgrade/downgrade. +# +# pip install "apache-airflow==X.Y.Z" "snowflake-connector-python[pandas]==2.9.0" +# +Authlib==1.2.1 Babel==2.12.1 ConfigUpdater==3.1.1 -Deprecated==1.2.13 -Flask-AppBuilder==4.3.0 +Deprecated==1.2.14 +Flask-AppBuilder==4.3.3 Flask-Babel==2.0.0 Flask-Bcrypt==1.0.1 Flask-Caching==2.0.2 -Flask-JWT-Extended==4.4.4 +Flask-JWT-Extended==4.5.2 Flask-Limiter==3.3.1 Flask-Login==0.6.2 Flask-SQLAlchemy==2.5.1 Flask-Session==0.5.0 Flask-WTF==1.1.1 Flask==2.2.5 -GitPython==3.1.31 +GitPython==3.1.32 JPype1==1.4.1 JayDeBeApi==1.2.3 Jinja2==3.1.2 Mako==1.2.4 -Markdown==3.4.3 -MarkupSafe==2.1.2 -PyGithub==1.58.2 -PyHive==0.6.5 -PyJWT==2.7.0 +Markdown==3.4.4 +MarkupSafe==2.1.3 +PyGithub==1.59.1 +PyHive==0.7.0 +PyJWT==2.8.0 PyNaCl==1.5.0 -PyYAML==6.0 -Pygments==2.15.1 +PyYAML==6.0.1 +Pygments==2.16.1 SQLAlchemy-JSONField==1.0.1.post0 SQLAlchemy-Utils==0.41.1 -SQLAlchemy==1.4.48 +SQLAlchemy==1.4.49 SecretStorage==3.3.3 +Shapely==1.8.5.post1 Sphinx==5.3.0 -Unidecode==1.3.6 WTForms==3.0.1 Werkzeug==2.2.3 adal==1.2.7 -aiobotocore==2.5.0 -aiofiles==23.1.0 -aiohttp==3.8.4 +aiobotocore==2.5.4 +aiofiles==23.2.1 +aiohttp==3.8.5 aioitertools==0.11.0 aioresponses==0.7.4 aiosignal==1.3.1 alabaster==0.7.13 -alembic==1.10.4 +alembic==1.11.3 +alibabacloud-adb20211201==1.0.0 +alibabacloud-credentials==0.3.2 +alibabacloud-endpoint-util==0.0.3 +alibabacloud-gateway-spi==0.0.1 +alibabacloud-openapi-util==0.2.1 +alibabacloud-tea-openapi==0.3.7 +alibabacloud-tea-util==0.3.11 +alibabacloud-tea-xml==0.0.2 +alibabacloud-tea==0.3.3 aliyun-python-sdk-core==2.13.36 -aliyun-python-sdk-kms==2.16.0 +aliyun-python-sdk-kms==2.16.1 amqp==5.1.1 analytics-python==1.4.post1 ansiwrap==0.8.4 -anyio==3.6.2 -apache-airflow-providers-airbyte==3.2.1 -apache-airflow-providers-alibaba==2.3.0 -apache-airflow-providers-amazon==8.0.0 -apache-airflow-providers-apache-beam==5.0.0 -apache-airflow-providers-apache-cassandra==3.1.1 -apache-airflow-providers-apache-drill==2.3.2 -apache-airflow-providers-apache-druid==3.3.1 -apache-airflow-providers-apache-flink==1.0.1 -apache-airflow-providers-apache-hdfs==3.2.1 -apache-airflow-providers-apache-hive==6.0.0 -apache-airflow-providers-apache-impala==1.0.0 -apache-airflow-providers-apache-kylin==3.1.0 
-apache-airflow-providers-apache-livy==3.4.0 -apache-airflow-providers-apache-pig==4.0.0 -apache-airflow-providers-apache-pinot==4.0.1 -apache-airflow-providers-apache-spark==4.0.1 -apache-airflow-providers-apache-sqoop==3.1.1 -apache-airflow-providers-arangodb==2.1.1 -apache-airflow-providers-asana==2.1.0 -apache-airflow-providers-atlassian-jira==2.0.1 -apache-airflow-providers-celery==3.1.0 -apache-airflow-providers-cloudant==3.1.0 -apache-airflow-providers-cncf-kubernetes==6.1.0 -apache-airflow-providers-common-sql==1.4.0 -apache-airflow-providers-databricks==4.1.0 -apache-airflow-providers-datadog==3.2.0 -apache-airflow-providers-dbt-cloud==3.1.1 -apache-airflow-providers-dingding==3.1.0 -apache-airflow-providers-discord==3.1.0 -apache-airflow-providers-docker==3.6.0 -apache-airflow-providers-elasticsearch==4.4.0 -apache-airflow-providers-exasol==4.1.3 -apache-airflow-providers-facebook==3.1.0 -apache-airflow-providers-ftp==3.3.1 -apache-airflow-providers-github==2.2.1 -apache-airflow-providers-google==10.0.0 -apache-airflow-providers-grpc==3.1.0 -apache-airflow-providers-hashicorp==3.3.1 -apache-airflow-providers-http==4.3.0 -apache-airflow-providers-imap==3.1.1 -apache-airflow-providers-influxdb==2.1.0 -apache-airflow-providers-jdbc==3.3.0 -apache-airflow-providers-jenkins==3.2.1 -apache-airflow-providers-microsoft-azure==6.0.0 -apache-airflow-providers-microsoft-mssql==3.3.2 -apache-airflow-providers-microsoft-psrp==2.2.0 -apache-airflow-providers-microsoft-winrm==3.1.1 -apache-airflow-providers-mongo==3.1.1 -apache-airflow-providers-mysql==5.0.0 -apache-airflow-providers-neo4j==3.2.1 -apache-airflow-providers-odbc==3.2.1 -apache-airflow-providers-openfaas==3.1.0 -apache-airflow-providers-opsgenie==5.0.0 -apache-airflow-providers-oracle==3.6.0 -apache-airflow-providers-pagerduty==3.1.0 -apache-airflow-providers-papermill==3.1.1 -apache-airflow-providers-plexus==3.1.0 -apache-airflow-providers-postgres==5.4.0 -apache-airflow-providers-presto==5.0.0 -apache-airflow-providers-qubole==3.3.1 -apache-airflow-providers-redis==3.1.0 -apache-airflow-providers-salesforce==5.3.0 -apache-airflow-providers-samba==4.1.0 -apache-airflow-providers-segment==3.1.0 -apache-airflow-providers-sendgrid==3.1.0 -apache-airflow-providers-sftp==4.2.4 -apache-airflow-providers-singularity==3.1.0 -apache-airflow-providers-slack==7.2.0 -apache-airflow-providers-smtp==1.0.1 -apache-airflow-providers-snowflake==4.0.5 -apache-airflow-providers-sqlite==3.3.2 -apache-airflow-providers-ssh==3.6.0 -apache-airflow-providers-tableau==4.1.0 -apache-airflow-providers-tabular==1.1.0 -apache-airflow-providers-telegram==4.0.0 -apache-airflow-providers-trino==5.0.0 -apache-airflow-providers-vertica==3.3.1 -apache-airflow-providers-yandex==3.3.0 -apache-airflow-providers-zendesk==4.2.0 -apache-beam==2.46.0 -apispec==5.2.2 +anyascii==0.3.2 +anyio==3.7.1 +apache-airflow-providers-airbyte==3.3.1 +apache-airflow-providers-alibaba==2.5.1 +apache-airflow-providers-amazon==8.5.1 +apache-airflow-providers-apache-beam==5.2.1 +apache-airflow-providers-apache-cassandra==3.2.1 +apache-airflow-providers-apache-drill==2.4.3 +apache-airflow-providers-apache-druid==3.5.0 +apache-airflow-providers-apache-flink==1.1.1 +apache-airflow-providers-apache-hdfs==4.1.0 +apache-airflow-providers-apache-hive==6.1.4 +apache-airflow-providers-apache-impala==1.1.2 +apache-airflow-providers-apache-kafka==1.1.2 +apache-airflow-providers-apache-kylin==3.2.1 +apache-airflow-providers-apache-livy==3.5.2 +apache-airflow-providers-apache-pig==4.1.1 
+apache-airflow-providers-apache-pinot==4.1.2 +apache-airflow-providers-apache-spark==4.1.3 +apache-airflow-providers-apache-sqoop==4.0.0 +apache-airflow-providers-apprise==1.0.1 +apache-airflow-providers-arangodb==2.2.1 +apache-airflow-providers-asana==2.2.2 +apache-airflow-providers-atlassian-jira==2.1.1 +apache-airflow-providers-celery==3.3.2 +apache-airflow-providers-cloudant==3.2.1 +apache-airflow-providers-cncf-kubernetes==7.4.2 +apache-airflow-providers-common-sql==1.7.0 +apache-airflow-providers-daskexecutor==1.0.0 +apache-airflow-providers-databricks==4.3.3 +apache-airflow-providers-datadog==3.3.1 +apache-airflow-providers-dbt-cloud==3.2.2 +apache-airflow-providers-dingding==3.2.1 +apache-airflow-providers-discord==3.3.0 +apache-airflow-providers-docker==3.7.3 +apache-airflow-providers-elasticsearch==5.0.0 +apache-airflow-providers-exasol==4.2.3 +apache-airflow-providers-facebook==3.2.1 +apache-airflow-providers-ftp==3.5.0 +apache-airflow-providers-github==2.3.1 +apache-airflow-providers-google==10.6.0 +apache-airflow-providers-grpc==3.2.1 +apache-airflow-providers-hashicorp==3.4.2 +apache-airflow-providers-http==4.5.0 +apache-airflow-providers-imap==3.3.0 +apache-airflow-providers-influxdb==2.2.1 +apache-airflow-providers-jdbc==4.0.1 +apache-airflow-providers-jenkins==3.3.1 +apache-airflow-providers-microsoft-azure==6.2.4 +apache-airflow-providers-microsoft-mssql==3.4.2 +apache-airflow-providers-microsoft-psrp==2.3.1 +apache-airflow-providers-microsoft-winrm==3.2.1 +apache-airflow-providers-mongo==3.2.1 +apache-airflow-providers-mysql==5.2.1 +apache-airflow-providers-neo4j==3.3.2 +apache-airflow-providers-odbc==4.0.0 +apache-airflow-providers-openfaas==3.2.1 +apache-airflow-providers-openlineage==1.0.1 +apache-airflow-providers-opsgenie==5.1.1 +apache-airflow-providers-oracle==3.7.2 +apache-airflow-providers-pagerduty==3.3.0 +apache-airflow-providers-papermill==3.2.1 +apache-airflow-providers-plexus==3.2.1 +apache-airflow-providers-postgres==5.6.0 +apache-airflow-providers-presto==5.1.2 +apache-airflow-providers-qubole==3.4.2 +apache-airflow-providers-redis==3.3.1 +apache-airflow-providers-salesforce==5.4.1 +apache-airflow-providers-samba==4.2.1 +apache-airflow-providers-segment==3.2.1 +apache-airflow-providers-sendgrid==3.2.1 +apache-airflow-providers-sftp==4.5.0 +apache-airflow-providers-singularity==3.2.1 +apache-airflow-providers-slack==7.3.2 +apache-airflow-providers-smtp==1.3.0 +apache-airflow-providers-snowflake==4.4.2 +apache-airflow-providers-sqlite==3.4.3 +apache-airflow-providers-ssh==3.7.1 +apache-airflow-providers-tableau==4.2.1 +apache-airflow-providers-tabular==1.2.1 +apache-airflow-providers-telegram==4.1.1 +apache-airflow-providers-trino==5.2.1 +apache-airflow-providers-vertica==3.5.1 +apache-airflow-providers-zendesk==4.3.1 +apache-beam==2.49.0 +apispec==6.3.0 appdirs==1.4.4 -argcomplete==3.0.8 +apprise==1.4.5 +argcomplete==3.1.1 arrow==1.2.3 asana==3.2.1 -asgiref==3.6.0 +asgiref==3.7.2 asn1crypto==1.5.1 -astroid==2.15.5 +astroid==2.15.6 asttokens==2.2.1 -async-timeout==4.0.2 -asynctest==0.13.0 +async-timeout==4.0.3 atlasclient==1.0.0 -atlassian-python-api==3.36.0 +atlassian-python-api==3.41.0 attrs==23.1.0 -aws-sam-translator==1.66.0 +aws-sam-translator==1.73.0 aws-xray-sdk==2.12.0 -azure-batch==13.0.0 +azure-batch==14.0.0 azure-common==1.1.28 -azure-core==1.26.4 -azure-cosmos==4.3.1 +azure-core==1.29.2 +azure-cosmos==4.5.0 azure-datalake-store==0.0.53 -azure-identity==1.13.0 +azure-identity==1.14.0 azure-keyvault-secrets==4.7.0 azure-kusto-data==0.0.45 
azure-mgmt-containerinstance==1.5.0 @@ -169,249 +202,256 @@ azure-mgmt-datafactory==1.1.0 azure-mgmt-datalake-nspkg==3.0.1 azure-mgmt-datalake-store==0.5.0 azure-mgmt-nspkg==3.0.2 -azure-mgmt-resource==23.0.0 +azure-mgmt-resource==23.0.1 azure-nspkg==3.0.2 -azure-servicebus==7.10.0 -azure-storage-blob==12.16.0 +azure-servicebus==7.11.1 +azure-storage-blob==12.17.0 azure-storage-common==2.1.0 -azure-storage-file-datalake==12.11.0 +azure-storage-file-datalake==12.12.0 azure-storage-file==2.1.0 azure-synapse-spark==0.7.0 backcall==0.2.0 backoff==1.10.0 bcrypt==4.0.1 beautifulsoup4==4.12.2 -billiard==3.6.4.0 -bitarray==2.7.3 -black==23.1a1 +billiard==4.1.0 +bitarray==2.8.1 +black==23.7.0 bleach==6.0.0 blinker==1.6.2 -boto3==1.26.76 +boto3==1.28.17 boto==2.49.0 -botocore==1.29.76 +botocore==1.31.17 bowler==0.9.0 cachelib==0.9.0 -cachetools==5.3.0 -cassandra-driver==3.27.0 -cattrs==22.2.0 -celery==5.2.7 -certifi==2023.5.7 +cachetools==5.3.1 +cassandra-driver==3.28.0 +cattrs==23.1.2 +celery==5.3.1 +certifi==2023.7.22 cffi==1.15.1 -cfgv==3.3.1 -cfn-lint==0.77.5 +cfgv==3.4.0 +cfn-lint==0.77.10 cgroupspy==0.2.2 -chardet==5.1.0 -charset-normalizer==2.1.1 +chardet==5.2.0 +charset-normalizer==3.2.0 checksumdir==1.2.0 ciso8601==2.3.0 -click-default-group==1.2.2 +click-default-group==1.2.4 click-didyoumean==0.3.0 click-plugins==1.1.1 -click-repl==0.2.0 -click==8.1.3 +click-repl==0.3.0 +click==8.1.7 clickclick==20.10.2 cloudant==2.15.0 cloudpickle==2.2.1 colorama==0.4.6 colorlog==4.8.0 +confluent-kafka==2.2.0 connexion==2.14.2 -coverage==7.2.5 +coverage==7.3.0 crcmod==1.7 -cron-descriptor==1.3.0 -croniter==1.3.14 -cryptography==40.0.2 +cron-descriptor==1.4.0 +croniter==1.4.1 +cryptography==41.0.3 curlify==2.2.1 -dask==2023.4.1 -databricks-sql-connector==2.5.2 -datadog==0.45.0 +dask==2023.8.0 +databricks-sql-connector==2.9.2 +datadog==0.46.0 db-dtypes==1.1.1 decorator==5.1.1 defusedxml==0.7.1 deprecation==2.1.0 dill==0.3.1.1 -distlib==0.3.6 -distributed==2023.4.1 -dnspython==2.3.0 -docker==6.1.2 +distlib==0.3.7 +distributed==2023.8.0 +dnspython==2.4.2 +docker==6.1.3 docopt==0.6.2 -docutils==0.20 +docutils==0.20.1 ecdsa==0.18.0 -elasticsearch-dbapi==0.2.10 -elasticsearch-dsl==7.4.1 -elasticsearch==7.13.4 +elasticsearch==7.14.2 email-validator==1.3.1 entrypoints==0.4 eralchemy2==1.3.7 et-xmlfile==1.1.0 eventlet==0.33.3 -exceptiongroup==1.1.1 -execnet==1.9.0 +exceptiongroup==1.1.3 +execnet==2.0.2 executing==1.2.0 -facebook-business==16.0.2 -fastavro==1.7.4 +facebook-business==17.0.4 +fastavro==1.8.2 fasteners==0.18 -fastjsonschema==2.16.3 -filelock==3.12.0 +fastjsonschema==2.18.0 +filelock==3.12.2 fissix==21.11.13 -flower==1.2.0 -frozenlist==1.3.3 -fsspec==2023.5.0 +flower==2.0.1 +frozenlist==1.4.0 +fsspec==2023.6.0 future==0.18.3 -gcloud-aio-auth==4.2.1 +gcloud-aio-auth==4.2.3 gcloud-aio-bigquery==6.3.0 -gcloud-aio-storage==8.2.0 -gcsfs==2023.5.0 +gcloud-aio-storage==8.3.0 +gcsfs==2023.6.0 geomet==0.2.1.post1 -gevent==22.10.2 +gevent==23.7.0 gitdb==4.0.10 -google-api-core==2.8.2 -google-api-python-client==1.12.11 +google-ads==21.3.0 +google-api-core==2.11.1 +google-api-python-client==2.97.0 google-auth-httplib2==0.1.0 -google-auth-oauthlib==0.8.0 -google-auth==2.18.0 -google-cloud-aiplatform==1.16.1 -google-cloud-appengine-logging==1.1.3 -google-cloud-audit-log==0.2.4 -google-cloud-automl==2.8.0 -google-cloud-bigquery-datatransfer==3.7.0 -google-cloud-bigquery-storage==2.14.1 -google-cloud-bigquery==2.34.4 -google-cloud-bigtable==2.11.1 -google-cloud-build==3.9.0 -google-cloud-compute==0.7.0 
-google-cloud-container==2.11.1 -google-cloud-core==2.3.2 -google-cloud-datacatalog==3.9.0 -google-cloud-dataflow-client==0.5.4 -google-cloud-dataform==0.2.0 -google-cloud-dataplex==1.1.0 -google-cloud-dataproc-metastore==1.6.0 -google-cloud-dataproc==5.0.0 -google-cloud-dlp==3.8.0 -google-cloud-kms==2.12.0 -google-cloud-language==1.3.2 -google-cloud-logging==3.2.1 -google-cloud-memcache==1.4.1 -google-cloud-monitoring==2.11.0 -google-cloud-orchestration-airflow==1.4.1 -google-cloud-os-login==2.7.1 -google-cloud-pubsub==2.13.5 -google-cloud-redis==2.9.0 -google-cloud-resource-manager==1.6.0 -google-cloud-secret-manager==1.0.2 -google-cloud-spanner==1.19.3 -google-cloud-speech==1.3.4 -google-cloud-storage==2.9.0 -google-cloud-tasks==2.10.1 -google-cloud-texttospeech==1.0.3 -google-cloud-translate==1.7.2 -google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 -google-cloud-workflows==1.7.1 +google-auth-oauthlib==1.0.0 +google-auth==2.22.0 +google-cloud-aiplatform==1.30.1 +google-cloud-appengine-logging==1.3.1 +google-cloud-audit-log==0.2.5 +google-cloud-automl==2.11.2 +google-cloud-bigquery-datatransfer==3.12.0 +google-cloud-bigquery-storage==2.22.0 +google-cloud-bigquery==3.11.4 +google-cloud-bigtable==2.21.0 +google-cloud-build==3.20.0 +google-cloud-compute==1.14.0 +google-cloud-container==2.30.0 +google-cloud-core==2.3.3 +google-cloud-datacatalog==3.15.0 +google-cloud-dataflow-client==0.8.4 +google-cloud-dataform==0.5.2 +google-cloud-dataplex==1.6.2 +google-cloud-dataproc-metastore==1.12.0 +google-cloud-dataproc==5.4.3 +google-cloud-dlp==3.12.2 +google-cloud-kms==2.19.1 +google-cloud-language==2.11.0 +google-cloud-logging==3.6.0 +google-cloud-memcache==1.7.2 +google-cloud-monitoring==2.15.1 +google-cloud-orchestration-airflow==1.9.1 +google-cloud-os-login==2.10.0 +google-cloud-pubsub==2.18.2 +google-cloud-redis==2.13.1 +google-cloud-resource-manager==1.10.3 +google-cloud-secret-manager==2.16.3 +google-cloud-spanner==3.40.1 +google-cloud-speech==2.21.0 +google-cloud-storage-transfer==1.9.1 +google-cloud-storage==2.10.0 +google-cloud-tasks==2.14.1 +google-cloud-texttospeech==2.14.1 +google-cloud-translate==3.12.0 +google-cloud-videointelligence==2.11.3 +google-cloud-vision==3.4.4 +google-cloud-workflows==1.11.0 google-crc32c==1.5.0 +google-re2==1.1 google-resumable-media==2.5.0 -googleapis-common-protos==1.56.4 +googleapis-common-protos==1.60.0 graphql-core==3.2.3 graphviz==0.20.1 greenlet==2.0.2 -grpc-google-iam-v1==0.12.4 +grpc-google-iam-v1==0.12.6 grpcio-gcp==0.2.2 -grpcio-status==1.48.2 -grpcio==1.54.2 +grpcio-status==1.57.0 +grpcio==1.57.0 gssapi==1.8.2 -gunicorn==20.1.0 +gunicorn==21.2.0 h11==0.14.0 -hdfs==2.7.0 +hdfs==2.7.2 hmsclient==0.1.1 httpcore==0.16.3 -httplib2==0.21.0 +httplib2==0.22.0 httpx==0.23.3 -humanize==4.6.0 -hvac==1.1.0 -identify==2.5.24 +humanize==4.8.0 +hvac==1.1.1 +identify==2.5.26 idna==3.4 -ijson==3.2.0.post0 +ijson==3.2.3 imagesize==1.4.1 -importlib-metadata==6.6.0 -importlib-resources==5.12.0 +importlib-metadata==6.8.0 +importlib-resources==6.0.1 impyla==0.18.0 incremental==22.10.0 inflection==0.5.1 -influxdb-client==1.36.1 +influxdb-client==1.37.0 iniconfig==2.0.0 ipdb==0.13.13 -ipython==8.13.2 +ipython==8.14.0 isodate==0.6.1 itsdangerous==2.1.2 -jaraco.classes==3.2.3 -jedi==0.18.2 +jaraco.classes==3.3.0 +jedi==0.19.0 jeepney==0.8.0 -jira==3.5.0 +jira==3.5.2 jmespath==0.10.0 jschema-to-python==1.2.3 json-merge-patch==0.2 jsondiff==2.0.0 -jsonpatch==1.32 +jsonpatch==1.33 jsonpath-ng==1.5.3 -jsonpickle==3.0.1 -jsonpointer==2.3 
-jsonschema-spec==0.1.4 -jsonschema==4.17.3 +jsonpickle==3.0.2 +jsonpointer==2.4 +jsonschema-spec==0.2.4 +jsonschema-specifications==2023.7.1 +jsonschema==4.19.0 junit-xml==1.9 -jupyter_client==8.2.0 -jupyter_core==5.3.0 -keyring==23.13.1 -kombu==5.2.4 +jupyter_client==8.3.0 +jupyter_core==5.3.1 +keyring==24.2.0 +kombu==5.3.1 krb5==0.5.0 kubernetes-asyncio==24.2.3 kubernetes==23.6.0 kylinpy==2.8.4 lazy-object-proxy==1.9.0 ldap3==2.9.1 -limits==3.4.0 +limits==3.5.0 linkify-it-py==2.0.2 locket==1.0.0 lockfile==0.12.2 -looker-sdk==23.8.1 -lxml==4.9.2 +looker-sdk==23.14.1 +lxml==4.9.3 lz4==4.3.2 -markdown-it-py==2.2.0 -marshmallow-enum==1.5.1 +markdown-it-py==3.0.0 marshmallow-oneofschema==3.0.1 marshmallow-sqlalchemy==0.26.1 -marshmallow==3.19.0 +marshmallow==3.20.1 matplotlib-inline==0.1.6 -mdit-py-plugins==0.3.5 +mdit-py-plugins==0.4.0 mdurl==0.1.2 mongomock==4.1.2 monotonic==1.6 -more-itertools==9.1.0 +more-itertools==10.1.0 moreorless==0.4.0 -moto==4.1.9 +moto==4.1.14 mpmath==1.3.0 msal-extensions==1.0.0 -msal==1.22.0 +msal==1.23.0 msgpack==1.0.5 msrest==0.7.1 msrestazure==0.6.4 multi-key-dict==2.0.3 multidict==6.0.4 -mypy-boto3-appflow==1.26.125 -mypy-boto3-rds==1.26.132 -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-appflow==1.28.16 +mypy-boto3-rds==1.28.19 +mypy-boto3-redshift-data==1.28.16 +mypy-boto3-s3==1.28.27 mypy-extensions==1.0.0 -mypy==1.0.0 -mysqlclient==2.1.1 -nbclient==0.7.4 -nbformat==5.8.0 -neo4j==5.8.0 +mypy==1.2.0 +mysql-connector-python==8.1.0 +mysqlclient==2.2.0 +nbclient==0.8.0 +nbformat==5.9.2 +neo4j==5.11.0 networkx==3.1 nodeenv==1.8.0 -numpy==1.24.3 +numpy==1.24.4 oauthlib==3.2.2 objsize==0.6.1 -openapi-schema-validator==0.4.4 -openapi-spec-validator==0.5.6 +openapi-schema-validator==0.6.0 +openapi-spec-validator==0.6.0 +openlineage-integration-common==1.0.0 +openlineage-python==1.0.0 +openlineage_sql==1.0.0 openpyxl==3.1.2 opentelemetry-api==1.15.0 opentelemetry-exporter-otlp-proto-grpc==1.15.0 @@ -422,83 +462,83 @@ opentelemetry-proto==1.15.0 opentelemetry-sdk==1.15.0 opentelemetry-semantic-conventions==0.36b0 opsgenie-sdk==2.1.5 -oracledb==1.3.1 +oracledb==1.4.0 ordered-set==4.1.0 -orjson==3.8.12 +orjson==3.9.5 oscrypto==1.3.0 -oss2==2.17.0 -packaging==21.3 -pandas-gbq==0.17.9 -pandas==1.5.3 +oss2==2.18.1 +packaging==23.1 +pandas-gbq==0.19.2 +pandas==2.0.3 papermill==2.4.0 -paramiko==3.1.0 +paramiko==3.3.1 parso==0.8.3 partd==1.4.0 pathable==0.4.3 -pathspec==0.9.0 +pathspec==0.11.2 pbr==5.11.1 -pdpyras==5.0.1 +pdpyras==5.1.1 pendulum==2.1.2 pexpect==4.8.0 pickleshare==0.7.5 -pinotdb==0.4.14 -pipdeptree==2.7.1 +pinotdb==0.5.0 +pipdeptree==2.13.0 pipx==1.2.0 pkginfo==1.9.6 -platformdirs==3.5.1 -pluggy==1.0.0 +platformdirs==3.8.1 +pluggy==1.2.0 ply==3.11 plyvel==1.5.0 portalocker==2.7.0 -pre-commit==3.3.1 +pre-commit==3.3.3 presto-python-client==0.8.3 prison==0.2.1 -prometheus-client==0.16.0 -prompt-toolkit==3.0.38 -proto-plus==1.19.6 -protobuf==3.20.0 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.21.12 psutil==5.9.5 -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.0 +py-partiql-parser==0.3.6 py4j==0.10.9.7 -pyOpenSSL==23.1.1 -pyarrow==9.0.0 +pyOpenSSL==23.2.0 +pyarrow==11.0.0 pyasn1-modules==0.2.8 pyasn1==0.4.8 pycountry==22.3.5 pycparser==2.21 -pycryptodome==3.17 -pycryptodomex==3.17 -pydantic==1.10.7 -pydata-google-auth==1.8.0 +pycryptodome==3.18.0 +pycryptodomex==3.18.0 +pydantic==1.10.12 +pydata-google-auth==1.8.2 pydot==1.4.2 pydruid==0.6.5 
pyenchant==3.2.2 pyexasol==0.25.2 -pygraphviz==1.10 +pygraphviz==1.11 pyhcl==0.4.4 pykerberos==1.2.4 -pymongo==3.13.0 -pymssql==2.2.7 +pymongo==4.4.1 +pymssql==2.2.8 pyodbc==4.0.39 -pyparsing==3.0.9 +pyparsing==3.1.1 pypsrp==0.8.1 -pyrsistent==0.19.3 -pyspark==3.4.0 -pyspnego==0.9.0 -pytest-asyncio==0.21.0 +pyspark==3.4.1 +pyspnego==0.9.1 +pytest-asyncio==0.21.1 pytest-capture-warnings==0.0.4 -pytest-cov==4.0.0 +pytest-cov==4.1.0 pytest-httpx==0.21.3 pytest-instafail==0.5.0 -pytest-rerunfailures==11.1.2 +pytest-mock==3.11.1 +pytest-rerunfailures==12.0 pytest-timeouts==1.2.1 -pytest-xdist==3.3.0 -pytest==7.3.1 -python-arango==7.5.7 +pytest-xdist==3.3.1 +pytest==7.4.0 +python-arango==7.6.0 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 @@ -512,70 +552,72 @@ python-telegram-bot==20.2 pytz==2023.3 pytzdata==2020.1 pywinrm==0.4.3 -pyzmq==25.0.2 +pyzmq==25.1.1 qds-sdk==1.16.1 reactivex==4.0.4 -readme-renderer==37.3 -redis==3.5.3 -redshift-connector==2.0.910 -regex==2023.5.5 +readme-renderer==40.0 +redis==5.0.0 +redshift-connector==2.0.913 +referencing==0.30.2 +regex==2023.8.8 requests-file==1.5.1 requests-kerberos==0.14.0 -requests-mock==1.10.0 +requests-mock==1.11.0 requests-ntlm==1.2.0 requests-oauthlib==1.3.1 requests-toolbelt==1.0.0 -requests==2.30.0 -responses==0.23.1 +requests==2.31.0 +responses==0.23.3 rfc3339-validator==0.1.4 rfc3986==1.5.0 +rich-argparse==1.2.0 rich-click==1.6.1 -rich==13.3.5 -rich_argparse==1.1.0 +rich==13.5.2 +rpds-py==0.9.2 rsa==4.9 -ruff==0.0.267 -s3transfer==0.6.1 +ruff==0.0.285 +s3transfer==0.6.2 sarif-om==1.0.4 sasl==0.3.1 scramp==1.4.4 scrapbook==0.5.0 -semver==3.0.0 +semver==3.0.1 sendgrid==6.10.0 sentinels==1.0.0 -sentry-sdk==1.22.2 +sentry-sdk==1.29.2 setproctitle==1.3.2 -simple-salesforce==1.12.3 +simple-salesforce==1.12.4 six==1.16.0 slack-sdk==3.21.3 smbprotocol==1.10.1 smmap==5.0.0 -snakebite-py3==3.0.5 sniffio==1.3.0 snowballstemmer==2.2.0 -snowflake-connector-python==3.0.3 +snowflake-connector-python==3.1.0 snowflake-sqlalchemy==1.4.7 sortedcontainers==2.4.0 soupsieve==2.4.1 sphinx-airflow-theme==0.0.12 sphinx-argparse==0.4.0 -sphinx-autoapi==2.1.0 +sphinx-autoapi==2.1.1 sphinx-copybutton==0.5.2 sphinx-jinja==2.0.2 -sphinx-rtd-theme==1.2.0 -sphinxcontrib-applehelp==1.0.4 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==2.0.1 +sphinx-rtd-theme==1.2.2 +sphinxcontrib-applehelp==1.0.7 +sphinxcontrib-devhelp==1.0.5 +sphinxcontrib-htmlhelp==2.0.4 sphinxcontrib-httpdomain==1.8.1 sphinxcontrib-jquery==4.1 sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-qthelp==1.0.6 sphinxcontrib-redoc==1.6.0 -sphinxcontrib-serializinghtml==1.1.5 +sphinxcontrib-serializinghtml==1.1.8 sphinxcontrib-spelling==8.0.0 spython==0.3.0 -sqlalchemy-bigquery==1.6.1 +sqlalchemy-bigquery==1.8.0 sqlalchemy-drill==1.1.2 sqlalchemy-redshift==0.8.14 +sqlalchemy-spanner==1.6.2 sqlparse==0.4.4 sshpubkeys==3.3.1 sshtunnel==0.4.0 @@ -583,69 +625,70 @@ stack-data==0.6.2 starkbank-ecdsa==2.2.0 statsd==4.0.1 sympy==1.12 -tableauserverclient==0.24 +tableauserverclient==0.25 tabulate==0.9.0 -tblib==1.7.0 -tenacity==8.2.2 +tblib==2.0.0 +tenacity==8.2.3 termcolor==2.3.0 text-unidecode==1.3 textwrap3==0.9.2 thrift-sasl==0.4.3 thrift==0.16.0 -time-machine==2.9.0 +time-machine==2.12.0 tomli==2.0.1 +tomlkit==0.12.1 toolz==0.12.0 -tornado==6.3.2 -towncrier==22.12.0 -tqdm==4.65.0 +tornado==6.3.3 +towncrier==23.6.0 +tqdm==4.66.1 traitlets==5.9.0 -trino==0.323.0 +trino==0.326.0 twine==4.0.2 -types-Deprecated==1.2.9.2 -types-Markdown==3.4.2.9 
-types-PyMySQL==1.0.19.7 -types-PyYAML==6.0.12.9 -types-boto==2.49.18.8 +types-Deprecated==1.2.9.3 +types-Markdown==3.4.2.10 +types-PyMySQL==1.1.0.1 +types-PyYAML==6.0.12.11 +types-boto==2.49.18.9 types-certifi==2021.10.8.3 -types-croniter==1.3.2.9 -types-docutils==0.20.0.1 -types-paramiko==3.0.0.10 -types-protobuf==4.23.0.1 -types-pyOpenSSL==23.1.0.3 -types-python-dateutil==2.8.19.13 -types-python-slugify==8.0.0.2 -types-pytz==2023.3.0.0 -types-redis==4.5.5.2 -types-requests==2.30.0.0 -types-setuptools==67.7.0.2 -types-tabulate==0.9.0.2 +types-croniter==1.4.0.1 +types-docutils==0.20.0.3 +types-paramiko==3.3.0.0 +types-protobuf==4.24.0.1 +types-pyOpenSSL==23.2.0.2 +types-python-dateutil==2.8.19.14 +types-python-slugify==8.0.0.3 +types-pytz==2023.3.0.1 +types-redis==4.6.0.4 +types-requests==2.31.0.2 +types-setuptools==68.1.0.0 +types-tabulate==0.9.0.3 types-termcolor==1.1.6.2 -types-toml==0.10.8.6 -types-urllib3==1.26.25.13 -typing_extensions==4.5.0 -tzlocal==5.0 +types-toml==0.10.8.7 +types-urllib3==1.26.25.14 +typing_extensions==4.7.1 +tzdata==2023.3 +tzlocal==5.0.1 uc-micro-py==1.0.2 unicodecsv==0.14.1 -uritemplate==3.0.1 -urllib3==1.26.15 -userpath==1.8.0 -vertica-python==1.3.2 +uritemplate==4.1.1 +urllib3==1.26.16 +userpath==1.9.0 +vertica-python==1.3.4 vine==5.0.0 -virtualenv==20.23.0 +virtualenv==20.24.1 volatile==2.1.0 watchtower==2.0.1 wcwidth==0.2.6 webencodings==0.5.1 -websocket-client==1.5.1 +websocket-client==1.6.1 wrapt==1.15.0 xmltodict==0.13.0 -yamllint==1.31.0 -yandexcloud==0.212.0 +yamllint==1.32.0 yarl==1.9.2 zeep==4.2.1 -zenpy==2.0.25 +zenpy==2.0.27 zict==3.0.0 -zipp==3.15.0 -zope.event==4.6 +zipp==3.16.2 +zope.event==5.0 zope.interface==6.0 zstandard==0.21.0 diff --git a/pipeline/requirements/airflow/requirements.txt b/pipeline/requirements/airflow/requirements.txt index febcce28b..bf01bdcac 100644 --- a/pipeline/requirements/airflow/requirements.txt +++ b/pipeline/requirements/airflow/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=requirements/airflow/requirements.txt --resolver=backtracking requirements/airflow/requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-http @@ -12,59 +12,65 @@ aiosignal==1.3.1 # via # -c requirements/airflow/constraints.txt # aiohttp -alembic==1.10.4 +alembic==1.11.3 # via # -c requirements/airflow/constraints.txt # apache-airflow -anyio==3.6.2 +anyio==3.7.1 # via # -c requirements/airflow/constraints.txt # httpcore -apache-airflow[amazon,postgres]==2.6.1 +apache-airflow[amazon,postgres]==2.7.0 # via # -r requirements/airflow/base.in # apache-airflow-providers-amazon + # apache-airflow-providers-common-sql + # apache-airflow-providers-ftp + # apache-airflow-providers-http + # apache-airflow-providers-imap # apache-airflow-providers-postgres -apache-airflow-providers-amazon==8.0.0 + # apache-airflow-providers-sqlite +apache-airflow-providers-amazon==8.5.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-common-sql==1.4.0 +apache-airflow-providers-common-sql==1.7.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # apache-airflow-providers-amazon # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-ftp==3.3.1 +apache-airflow-providers-ftp==3.5.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-http==4.3.0 +apache-airflow-providers-http==4.5.0 # via # -c requirements/airflow/constraints.txt # apache-airflow 
-apache-airflow-providers-imap==3.1.1 + # apache-airflow-providers-amazon +apache-airflow-providers-imap==3.3.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-postgres==5.4.0 +apache-airflow-providers-postgres==5.6.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-sqlite==3.3.2 +apache-airflow-providers-sqlite==3.4.3 # via # -c requirements/airflow/constraints.txt # apache-airflow -apispec[yaml]==5.2.2 +apispec[yaml]==6.3.0 # via # -c requirements/airflow/constraints.txt # flask-appbuilder -argcomplete==3.0.8 +argcomplete==3.1.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -asgiref==3.6.0 +asgiref==3.7.2 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -74,7 +80,7 @@ asn1crypto==1.5.1 # via # -c requirements/airflow/constraints.txt # scramp -async-timeout==4.0.2 +async-timeout==4.0.3 # via # -c requirements/airflow/constraints.txt # aiohttp @@ -85,10 +91,16 @@ attrs==23.1.0 # apache-airflow # cattrs # jsonschema + # referencing babel==2.12.1 # via # -c requirements/airflow/constraints.txt # flask-babel +backoff==1.10.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http beautifulsoup4==4.12.2 # via # -c requirements/airflow/constraints.txt @@ -97,13 +109,13 @@ blinker==1.6.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -boto3==1.26.76 +boto3==1.28.17 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.29.76 +botocore==1.31.17 # via # -c requirements/airflow/constraints.txt # boto3 @@ -114,11 +126,11 @@ cachelib==0.9.0 # -c requirements/airflow/constraints.txt # flask-caching # flask-session -cattrs==22.2.0 +cattrs==23.1.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -certifi==2023.5.7 +certifi==2023.7.22 # via # -c requirements/airflow/constraints.txt # httpcore @@ -128,12 +140,12 @@ cffi==1.15.1 # via # -c requirements/airflow/constraints.txt # cryptography -charset-normalizer==2.1.1 +charset-normalizer==3.2.0 # via # -c requirements/airflow/constraints.txt # aiohttp # requests -click==8.1.3 +click==8.1.7 # via # -c requirements/airflow/constraints.txt # clickclick @@ -159,15 +171,15 @@ connexion[flask]==2.14.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -cron-descriptor==1.3.0 +cron-descriptor==1.4.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -croniter==1.3.14 +croniter==1.4.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -cryptography==40.0.2 +cryptography==41.0.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -175,20 +187,21 @@ decorator==5.1.1 # via # -c requirements/airflow/constraints.txt # jsonpath-ng -deprecated==1.2.13 +deprecated==1.2.14 # via # -c requirements/airflow/constraints.txt # apache-airflow # limits + # opentelemetry-api dill==0.3.1.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -dnspython==2.3.0 +dnspython==2.4.2 # via # -c requirements/airflow/constraints.txt # email-validator -docutils==0.20 +docutils==0.20.1 # via # -c requirements/airflow/constraints.txt # python-daemon @@ -196,9 +209,10 @@ email-validator==1.3.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder -exceptiongroup==1.1.1 +exceptiongroup==1.1.3 # via # -c requirements/airflow/constraints.txt + # anyio # cattrs flask==2.2.5 # via @@ -214,7 +228,7 @@ 
flask==2.2.5 # flask-session # flask-sqlalchemy # flask-wtf -flask-appbuilder==4.3.0 +flask-appbuilder==4.3.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -226,7 +240,7 @@ flask-caching==2.0.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -flask-jwt-extended==4.4.4 +flask-jwt-extended==4.5.2 # via # -c requirements/airflow/constraints.txt # flask-appbuilder @@ -252,11 +266,20 @@ flask-wtf==1.1.1 # -c requirements/airflow/constraints.txt # apache-airflow # flask-appbuilder -frozenlist==1.3.3 +frozenlist==1.4.0 # via # -c requirements/airflow/constraints.txt # aiohttp # aiosignal +google-re2==1.1 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow +googleapis-common-protos==1.60.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http graphviz==0.20.1 # via # -c requirements/airflow/constraints.txt @@ -265,7 +288,11 @@ greenlet==2.0.2 # via # -c requirements/airflow/constraints.txt # sqlalchemy -gunicorn==20.1.0 +grpcio==1.57.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc +gunicorn==21.2.0 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -289,7 +316,7 @@ idna==3.4 # requests # rfc3986 # yarl -importlib-resources==5.12.0 +importlib-resources==6.0.1 # via # -c requirements/airflow/constraints.txt # limits @@ -320,17 +347,21 @@ jsonpath-ng==1.5.3 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -jsonschema==4.17.3 +jsonschema==4.19.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # connexion # flask-appbuilder +jsonschema-specifications==2023.7.1 + # via + # -c requirements/airflow/constraints.txt + # jsonschema lazy-object-proxy==1.9.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -limits==3.4.0 +limits==3.5.0 # via # -c requirements/airflow/constraints.txt # flask-limiter @@ -343,7 +374,7 @@ lockfile==0.12.2 # -c requirements/airflow/constraints.txt # apache-airflow # python-daemon -lxml==4.9.2 +lxml==4.9.3 # via # -c requirements/airflow/constraints.txt # redshift-connector @@ -351,17 +382,17 @@ mako==1.2.4 # via # -c requirements/airflow/constraints.txt # alembic -markdown==3.4.3 +markdown==3.4.4 # via # -c requirements/airflow/constraints.txt # apache-airflow -markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # mdit-py-plugins # rich -markupsafe==2.1.2 +markupsafe==2.1.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -369,17 +400,12 @@ markupsafe==2.1.2 # mako # werkzeug # wtforms -marshmallow==3.19.0 +marshmallow==3.20.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder - # marshmallow-enum # marshmallow-oneofschema # marshmallow-sqlalchemy -marshmallow-enum==1.5.1 - # via - # -c requirements/airflow/constraints.txt - # flask-appbuilder marshmallow-oneofschema==3.0.1 # via # -c requirements/airflow/constraints.txt @@ -388,7 +414,7 @@ marshmallow-sqlalchemy==0.26.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder -mdit-py-plugins==0.3.5 +mdit-py-plugins==0.4.0 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -401,32 +427,71 @@ multidict==6.0.4 # -c requirements/airflow/constraints.txt # aiohttp # yarl -mypy-boto3-appflow==1.26.125 +mypy-boto3-appflow==1.28.16 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -mypy-boto3-rds==1.26.132 
+mypy-boto3-rds==1.28.19 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-redshift-data==1.28.16 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon +mypy-boto3-s3==1.28.27 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow-providers-amazon +opentelemetry-api==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk +opentelemetry-exporter-otlp==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow +opentelemetry-exporter-otlp-proto-grpc==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp +opentelemetry-proto==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.36b0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-sdk ordered-set==4.1.0 # via # -c requirements/airflow/constraints.txt # flask-limiter -packaging==21.3 +packaging==23.1 # via # -c requirements/airflow/constraints.txt # apache-airflow + # apispec # connexion + # gunicorn # limits # marshmallow # redshift-connector # sqlalchemy-redshift -pathspec==0.9.0 +pathspec==0.11.2 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -434,7 +499,7 @@ pendulum==2.1.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -pluggy==1.0.0 +pluggy==1.2.0 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -446,13 +511,18 @@ prison==0.2.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder +protobuf==4.21.12 + # via + # -c requirements/airflow/constraints.txt + # googleapis-common-protos + # opentelemetry-proto psutil==5.9.5 # via # -c requirements/airflow/constraints.txt # apache-airflow -psycopg2==2.9.6 +psycopg2==2.9.7 # via -r requirements/airflow/base.in -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-postgres @@ -460,29 +530,21 @@ pycparser==2.21 # via # -c requirements/airflow/constraints.txt # cffi -pydantic==1.10.7 +pydantic==1.10.12 # via # -c requirements/airflow/constraints.txt # apache-airflow -pygments==2.15.1 +pygments==2.16.1 # via # -c requirements/airflow/constraints.txt # apache-airflow # rich -pyjwt==2.7.0 +pyjwt==2.8.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # flask-appbuilder # flask-jwt-extended -pyparsing==3.0.9 - # via - # -c requirements/airflow/constraints.txt - # packaging -pyrsistent==0.19.3 - # via - # -c requirements/airflow/constraints.txt - # jsonschema python-daemon==3.0.1 # via # -c requirements/airflow/constraints.txt @@ -513,21 +575,27 @@ pytzdata==2020.1 # via # -c requirements/airflow/constraints.txt # pendulum -pyyaml==6.0 +pyyaml==6.0.1 # via # -c requirements/airflow/constraints.txt # apispec # clickclick # connexion -redshift-connector==2.0.910 +redshift-connector==2.0.913 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -requests==2.30.0 +referencing==0.30.2 
+ # via + # -c requirements/airflow/constraints.txt + # jsonschema + # jsonschema-specifications +requests==2.31.0 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-http # connexion + # opentelemetry-exporter-otlp-proto-http # redshift-connector # requests-toolbelt requests-toolbelt==1.0.0 @@ -542,17 +610,22 @@ rfc3986[idna2008]==1.5.0 # via # -c requirements/airflow/constraints.txt # httpx -rich==13.3.5 +rich==13.5.2 # via # -c requirements/airflow/constraints.txt # apache-airflow # flask-limiter # rich-argparse -rich-argparse==1.1.0 +rich-argparse==1.2.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -s3transfer==0.6.1 +rpds-py==0.9.2 + # via + # -c requirements/airflow/constraints.txt + # jsonschema + # referencing +s3transfer==0.6.2 # via # -c requirements/airflow/constraints.txt # boto3 @@ -581,7 +654,7 @@ soupsieve==2.4.1 # via # -c requirements/airflow/constraints.txt # beautifulsoup4 -sqlalchemy==1.4.48 +sqlalchemy==1.4.49 # via # -c requirements/airflow/constraints.txt # alembic @@ -612,7 +685,7 @@ tabulate==0.9.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -tenacity==8.2.2 +tenacity==8.2.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -624,13 +697,16 @@ text-unidecode==1.3 # via # -c requirements/airflow/constraints.txt # python-slugify -typing-extensions==4.5.0 +typing-extensions==4.7.1 # via # -c requirements/airflow/constraints.txt # alembic # apache-airflow + # asgiref + # cattrs # flask-limiter # limits + # opentelemetry-sdk # pydantic uc-micro-py==1.0.2 # via @@ -640,7 +716,7 @@ unicodecsv==0.14.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -urllib3==1.26.15 +urllib3==1.26.16 # via # -c requirements/airflow/constraints.txt # botocore From 7cb5bb05110adb17fb71813384975957108e4f11 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 10:30:46 +0200 Subject: [PATCH 05/34] chore(pipeline): bump dbt --- .../requirements/tasks/dbt/requirements.in | 4 +- .../requirements/tasks/dbt/requirements.txt | 86 ++++++++++++------- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/pipeline/requirements/tasks/dbt/requirements.in b/pipeline/requirements/tasks/dbt/requirements.in index ea174e9af..344fb97f4 100644 --- a/pipeline/requirements/tasks/dbt/requirements.in +++ b/pipeline/requirements/tasks/dbt/requirements.in @@ -1,2 +1,2 @@ -dbt-core -dbt-postgres \ No newline at end of file +dbt-core~=1.6.1 +dbt-postgres~=1.6.1 \ No newline at end of file diff --git a/pipeline/requirements/tasks/dbt/requirements.txt b/pipeline/requirements/tasks/dbt/requirements.txt index c2ef4427e..bd3798323 100644 --- a/pipeline/requirements/tasks/dbt/requirements.txt +++ b/pipeline/requirements/tasks/dbt/requirements.txt @@ -4,86 +4,102 @@ # # pip-compile --output-file=requirements/tasks/dbt/requirements.txt --resolver=backtracking requirements/tasks/dbt/requirements.in # -agate==1.7.0 - # via dbt-core +agate==1.7.1 + # via + # dbt-core + # dbt-postgres attrs==23.1.0 # via jsonschema babel==2.12.1 # via agate -certifi==2023.5.7 +certifi==2023.7.22 # via requests cffi==1.15.1 # via dbt-core -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via requests -click==8.1.3 - # via dbt-core +click==8.1.7 + # via + # dbt-core + # dbt-semantic-interfaces colorama==0.4.6 # via dbt-core -dbt-core==1.5.1 +dbt-core==1.6.1 # via # -r requirements/tasks/dbt/requirements.in # dbt-postgres dbt-extractor==0.4.1 # via dbt-core -dbt-postgres==1.5.1 +dbt-postgres==1.6.1 # via -r 
requirements/tasks/dbt/requirements.in -future==0.18.3 - # via parsedatetime +dbt-semantic-interfaces==0.2.0 + # via dbt-core hologram==0.0.16 # via dbt-core idna==3.4 # via # dbt-core # requests +importlib-metadata==6.8.0 + # via dbt-semantic-interfaces isodate==0.6.1 # via # agate # dbt-core jinja2==3.1.2 - # via dbt-core -jsonschema==4.17.3 - # via hologram + # via + # dbt-core + # dbt-semantic-interfaces +jsonschema==3.2.0 + # via + # dbt-semantic-interfaces + # hologram leather==0.3.4 # via agate logbook==1.5.3 # via dbt-core markupsafe==2.1.3 - # via - # jinja2 - # werkzeug -mashumaro[msgpack]==3.6 + # via jinja2 +mashumaro[msgpack]==3.8.1 # via dbt-core minimal-snowplow-tracker==0.0.2 # via dbt-core +more-itertools==8.14.0 + # via dbt-semantic-interfaces msgpack==1.0.5 # via mashumaro -networkx==2.8.8 +networkx==3.1 # via dbt-core packaging==23.1 # via dbt-core -parsedatetime==2.4 +parsedatetime==2.6 # via agate -pathspec==0.11.1 +pathspec==0.11.2 # via dbt-core -protobuf==4.23.2 +protobuf==4.24.2 # via dbt-core -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via dbt-postgres pycparser==2.21 # via cffi +pydantic==1.10.12 + # via dbt-semantic-interfaces pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 - # via hologram + # via + # dbt-semantic-interfaces + # hologram python-slugify==8.0.1 # via agate pytimeparse==1.1.8 # via agate pytz==2023.3 # via dbt-core -pyyaml==6.0 - # via dbt-core +pyyaml==6.0.1 + # via + # dbt-core + # dbt-semantic-interfaces requests==2.31.0 # via # dbt-core @@ -91,18 +107,26 @@ requests==2.31.0 six==1.16.0 # via # isodate + # jsonschema # leather # minimal-snowplow-tracker # python-dateutil -sqlparse==0.4.3 +sqlparse==0.4.4 # via dbt-core text-unidecode==1.3 # via python-slugify -typing-extensions==4.6.3 +typing-extensions==4.7.1 # via # dbt-core + # dbt-semantic-interfaces # mashumaro -urllib3==2.0.3 - # via requests -werkzeug==2.3.6 - # via dbt-core + # pydantic +urllib3==1.26.16 + # via + # dbt-core + # requests +zipp==3.16.2 + # via importlib-metadata + +# The following packages are considered to be unsafe in a requirements file: +# setuptools From 2a31facb6702be8c34f29746ab19311a1043c7e0 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 10:43:25 +0200 Subject: [PATCH 06/34] chore(pipeline): bump pandas --- pipeline/requirements/tasks/python/requirements.in | 6 +++--- pipeline/requirements/tasks/python/requirements.txt | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipeline/requirements/tasks/python/requirements.in b/pipeline/requirements/tasks/python/requirements.in index f3950fbcd..a3b98ad34 100644 --- a/pipeline/requirements/tasks/python/requirements.in +++ b/pipeline/requirements/tasks/python/requirements.in @@ -2,10 +2,10 @@ apache-airflow-providers-postgres apache-airflow-providers-amazon GeoAlchemy2 -geopandas +geopandas~=0.13.2 openpyxl!=3.1.1 -pandas -psycopg2 +pandas~=2.1.0 +psycopg2~=2.9.7 pyairtable requests SQLAlchemy diff --git a/pipeline/requirements/tasks/python/requirements.txt b/pipeline/requirements/tasks/python/requirements.txt index 8a630a21e..c606b9460 100644 --- a/pipeline/requirements/tasks/python/requirements.txt +++ b/pipeline/requirements/tasks/python/requirements.txt @@ -324,7 +324,7 @@ packaging==23.1 # marshmallow # redshift-connector # sqlalchemy-redshift -pandas==2.0.2 +pandas==2.1.0 # via # -r requirements/tasks/python/requirements.in # geopandas @@ -340,7 +340,7 @@ prison==0.2.1 # via flask-appbuilder psutil==5.9.5 # via apache-airflow -psycopg2==2.9.6 
+psycopg2==2.9.7 # via -r requirements/tasks/python/requirements.in psycopg2-binary==2.9.6 # via apache-airflow-providers-postgres From a7654817cbf8445f70f6f1fb1d034a919be486d8 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 10:58:06 +0200 Subject: [PATCH 07/34] chore(pipeline): bump pyairtable --- .../requirements/tasks/python/requirements.in | 2 +- .../requirements/tasks/python/requirements.txt | 15 ++++++++++----- .../src/data_inclusion/scripts/tasks/mes_aides.py | 3 ++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pipeline/requirements/tasks/python/requirements.in b/pipeline/requirements/tasks/python/requirements.in index a3b98ad34..b2e92ef42 100644 --- a/pipeline/requirements/tasks/python/requirements.in +++ b/pipeline/requirements/tasks/python/requirements.in @@ -6,7 +6,7 @@ geopandas~=0.13.2 openpyxl!=3.1.1 pandas~=2.1.0 psycopg2~=2.9.7 -pyairtable +pyairtable~=2.1 requests SQLAlchemy tenacity diff --git a/pipeline/requirements/tasks/python/requirements.txt b/pipeline/requirements/tasks/python/requirements.txt index c606b9460..39c8a3c0d 100644 --- a/pipeline/requirements/tasks/python/requirements.txt +++ b/pipeline/requirements/tasks/python/requirements.txt @@ -190,7 +190,7 @@ frozenlist==1.3.3 # via # aiohttp # aiosignal -geoalchemy2==0.13.3 +geoalchemy2==0.14.1 # via -r requirements/tasks/python/requirements.in geopandas==0.13.2 # via -r requirements/tasks/python/requirements.in @@ -218,7 +218,9 @@ idna==3.4 importlib-resources==5.12.0 # via limits inflection==0.5.1 - # via connexion + # via + # connexion + # pyairtable itsdangerous==2.1.2 # via # apache-airflow @@ -344,12 +346,14 @@ psycopg2==2.9.7 # via -r requirements/tasks/python/requirements.in psycopg2-binary==2.9.6 # via apache-airflow-providers-postgres -pyairtable==1.5.0 +pyairtable==2.1.0.post1 # via -r requirements/tasks/python/requirements.in pycparser==2.21 # via cffi pydantic==1.10.9 - # via apache-airflow + # via + # apache-airflow + # pyairtable pygments==2.15.1 # via # apache-airflow @@ -439,7 +443,7 @@ sniffio==1.3.0 # httpx soupsieve==2.4.1 # via beautifulsoup4 -sqlalchemy==1.4.48 +sqlalchemy==1.4.49 # via # -r requirements/tasks/python/requirements.in # alembic @@ -483,6 +487,7 @@ typing-extensions==4.6.3 # cattrs # flask-limiter # limits + # pyairtable # pydantic tzdata==2023.3 # via pandas diff --git a/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py b/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py index bbb34ed85..c0c1e9022 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py +++ b/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py @@ -10,7 +10,8 @@ def extract(url: str, token: str, **kwargs) -> bytes: base_id = url.split("/")[-3] table_name = url.split("/")[-2] - table = pyairtable.Table(api_key=token, base_id=base_id, table_name=table_name) + api = pyairtable.Api(api_key=token) + table = api.table(base_id=base_id, table_name=table_name) data = table.all() with io.StringIO() as buf: From d89456e199bfc3d05f48c6705e00c55af7ed1c45 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 11:04:45 +0200 Subject: [PATCH 08/34] chore(pipeline): bump other deps --- .../requirements/tasks/python/requirements.in | 12 +- .../tasks/python/requirements.txt | 182 ++++++++++++------ 2 files changed, 126 insertions(+), 68 deletions(-) diff --git a/pipeline/requirements/tasks/python/requirements.in b/pipeline/requirements/tasks/python/requirements.in index b2e92ef42..61c068304 100644 --- a/pipeline/requirements/tasks/python/requirements.in 
+++ b/pipeline/requirements/tasks/python/requirements.in @@ -3,14 +3,14 @@ apache-airflow-providers-amazon GeoAlchemy2 geopandas~=0.13.2 -openpyxl!=3.1.1 +openpyxl~=3.1.2 pandas~=2.1.0 psycopg2~=2.9.7 pyairtable~=2.1 -requests +requests~=2.31 SQLAlchemy -tenacity -tqdm -trafilatura +tenacity~=8.2 +tqdm~=4.66 +trafilatura~=1.6 urllib3 -xlsxwriter \ No newline at end of file +xlsxwriter~=3.1.2 \ No newline at end of file diff --git a/pipeline/requirements/tasks/python/requirements.txt b/pipeline/requirements/tasks/python/requirements.txt index 39c8a3c0d..32550f884 100644 --- a/pipeline/requirements/tasks/python/requirements.txt +++ b/pipeline/requirements/tasks/python/requirements.txt @@ -4,15 +4,15 @@ # # pip-compile --output-file=requirements/tasks/python/requirements.txt --resolver=backtracking requirements/tasks/python/requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via apache-airflow-providers-http aiosignal==1.3.1 # via aiohttp -alembic==1.11.1 +alembic==1.11.3 # via apache-airflow -anyio==3.7.0 +anyio==4.0.0 # via httpcore -apache-airflow==2.6.1 +apache-airflow==2.7.0 # via # apache-airflow-providers-amazon # apache-airflow-providers-common-sql @@ -21,25 +21,27 @@ apache-airflow==2.6.1 # apache-airflow-providers-imap # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-amazon==8.1.0 +apache-airflow-providers-amazon==8.6.0 # via -r requirements/tasks/python/requirements.in -apache-airflow-providers-common-sql==1.5.1 +apache-airflow-providers-common-sql==1.7.1 # via # apache-airflow # apache-airflow-providers-amazon # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-ftp==3.4.1 +apache-airflow-providers-ftp==3.5.1 # via apache-airflow -apache-airflow-providers-http==4.4.1 - # via apache-airflow -apache-airflow-providers-imap==3.2.1 +apache-airflow-providers-http==4.5.1 + # via + # apache-airflow + # apache-airflow-providers-amazon +apache-airflow-providers-imap==3.3.1 # via apache-airflow -apache-airflow-providers-postgres==5.5.0 +apache-airflow-providers-postgres==5.6.0 # via -r requirements/tasks/python/requirements.in -apache-airflow-providers-sqlite==3.4.1 +apache-airflow-providers-sqlite==3.4.3 # via apache-airflow -apispec[yaml]==5.2.2 +apispec[yaml]==6.3.0 # via flask-appbuilder argcomplete==3.1.1 # via apache-airflow @@ -50,7 +52,7 @@ asgiref==3.7.2 # apache-airflow-providers-http asn1crypto==1.5.1 # via scramp -async-timeout==4.0.2 +async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via @@ -59,19 +61,27 @@ attrs==23.1.0 # cattrs # fiona # jsonschema + # referencing babel==2.12.1 # via flask-babel +backoff==2.2.1 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +backports-datetime-fromisoformat==2.0.0 + # via htmldate beautifulsoup4==4.12.2 # via redshift-connector blinker==1.6.2 # via apache-airflow -boto3==1.26.151 +boto3==1.28.38 # via # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.29.151 +botocore==1.31.38 # via + # apache-airflow-providers-amazon # boto3 # redshift-connector # s3transfer @@ -81,7 +91,7 @@ cachelib==0.9.0 # flask-session cattrs==23.1.2 # via apache-airflow -certifi==2023.5.7 +certifi==2023.7.22 # via # fiona # httpcore @@ -91,13 +101,13 @@ certifi==2023.5.7 # trafilatura cffi==1.15.1 # via cryptography -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via # aiohttp # htmldate # requests # trafilatura -click==8.1.3 +click==8.1.7 # via # click-plugins # clickclick @@ -123,9 +133,9 @@ 
courlan==0.9.3 # via trafilatura cron-descriptor==1.4.0 # via apache-airflow -croniter==1.3.15 +croniter==1.4.1 # via apache-airflow -cryptography==41.0.1 +cryptography==41.0.3 # via apache-airflow dateparser==1.1.8 # via htmldate @@ -135,9 +145,10 @@ deprecated==1.2.14 # via # apache-airflow # limits -dill==0.3.6 + # opentelemetry-api +dill==0.3.7 # via apache-airflow -dnspython==2.3.0 +dnspython==2.4.2 # via email-validator docutils==0.20.1 # via python-daemon @@ -145,7 +156,7 @@ email-validator==1.3.1 # via flask-appbuilder et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.1.1 +exceptiongroup==1.1.3 # via # anyio # cattrs @@ -164,7 +175,7 @@ flask==2.2.5 # flask-session # flask-sqlalchemy # flask-wtf -flask-appbuilder==4.3.0 +flask-appbuilder==4.3.3 # via apache-airflow flask-babel==2.0.0 # via flask-appbuilder @@ -172,7 +183,7 @@ flask-caching==2.0.2 # via apache-airflow flask-jwt-extended==4.5.2 # via flask-appbuilder -flask-limiter==3.3.1 +flask-limiter==3.5.0 # via flask-appbuilder flask-login==0.6.2 # via @@ -186,7 +197,7 @@ flask-wtf==1.1.1 # via # apache-airflow # flask-appbuilder -frozenlist==1.3.3 +frozenlist==1.4.0 # via # aiohttp # aiosignal @@ -194,17 +205,25 @@ geoalchemy2==0.14.1 # via -r requirements/tasks/python/requirements.in geopandas==0.13.2 # via -r requirements/tasks/python/requirements.in +google-re2==1.1 + # via apache-airflow +googleapis-common-protos==1.60.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http graphviz==0.20.1 # via apache-airflow greenlet==2.0.2 # via sqlalchemy -gunicorn==20.1.0 +grpcio==1.57.0 + # via opentelemetry-exporter-otlp-proto-grpc +gunicorn==21.2.0 # via apache-airflow h11==0.14.0 # via httpcore -htmldate==1.4.3 +htmldate==1.5.0 # via trafilatura -httpcore==0.17.2 +httpcore==0.17.3 # via httpx httpx==0.24.1 # via apache-airflow @@ -215,7 +234,7 @@ idna==3.4 # httpx # requests # yarl -importlib-resources==5.12.0 +importlib-resources==6.0.1 # via limits inflection==0.5.1 # via @@ -239,11 +258,13 @@ jmespath==1.0.1 # botocore jsonpath-ng==1.5.3 # via apache-airflow-providers-amazon -jsonschema==4.17.3 +jsonschema==4.19.0 # via # apache-airflow # connexion # flask-appbuilder +jsonschema-specifications==2023.7.1 + # via jsonschema justext==3.0.0 # via trafilatura langcodes==3.3.0 @@ -258,7 +279,7 @@ lockfile==0.12.2 # via # apache-airflow # python-daemon -lxml==4.9.2 +lxml==4.9.3 # via # htmldate # justext @@ -266,9 +287,9 @@ lxml==4.9.2 # trafilatura mako==1.2.4 # via alembic -markdown==3.4.3 +markdown==3.4.4 # via apache-airflow -markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via # apache-airflow # mdit-py-plugins @@ -280,14 +301,11 @@ markupsafe==2.1.3 # mako # werkzeug # wtforms -marshmallow==3.19.0 +marshmallow==3.20.1 # via # flask-appbuilder - # marshmallow-enum # marshmallow-oneofschema # marshmallow-sqlalchemy -marshmallow-enum==1.5.1 - # via flask-appbuilder marshmallow-oneofschema==3.0.1 # via apache-airflow marshmallow-sqlalchemy==0.26.1 @@ -300,28 +318,52 @@ multidict==6.0.4 # via # aiohttp # yarl -mypy-boto3-appflow==1.26.145 +mypy-boto3-appflow==1.28.38 # via apache-airflow-providers-amazon -mypy-boto3-rds==1.26.144 +mypy-boto3-rds==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-redshift-data==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-s3==1.26.127 +mypy-boto3-s3==1.28.36 # via apache-airflow-providers-amazon -numpy==1.24.3 +numpy==1.25.2 # via # pandas # shapely openpyxl==3.1.2 # via -r 
requirements/tasks/python/requirements.in +opentelemetry-api==1.15.0 + # via + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk +opentelemetry-exporter-otlp==1.15.0 + # via apache-airflow +opentelemetry-exporter-otlp-proto-grpc==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-proto==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.36b0 + # via opentelemetry-sdk ordered-set==4.1.0 # via flask-limiter packaging==23.1 # via # apache-airflow + # apispec # connexion # geoalchemy2 # geopandas + # gunicorn # limits # marshmallow # redshift-connector @@ -330,43 +372,45 @@ pandas==2.1.0 # via # -r requirements/tasks/python/requirements.in # geopandas -pathspec==0.9.0 +pathspec==0.11.2 # via apache-airflow pendulum==2.1.2 # via apache-airflow -pluggy==1.0.0 +pluggy==1.3.0 # via apache-airflow ply==3.11 # via jsonpath-ng prison==0.2.1 # via flask-appbuilder +protobuf==4.24.2 + # via + # googleapis-common-protos + # opentelemetry-proto psutil==5.9.5 # via apache-airflow psycopg2==2.9.7 # via -r requirements/tasks/python/requirements.in -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via apache-airflow-providers-postgres pyairtable==2.1.0.post1 # via -r requirements/tasks/python/requirements.in pycparser==2.21 # via cffi -pydantic==1.10.9 +pydantic==1.10.12 # via # apache-airflow # pyairtable -pygments==2.15.1 +pygments==2.16.1 # via # apache-airflow # rich -pyjwt==2.7.0 +pyjwt==2.8.0 # via # apache-airflow # flask-appbuilder # flask-jwt-extended -pyproj==3.5.0 +pyproj==3.6.0 # via geopandas -pyrsistent==0.19.3 - # via jsonschema python-daemon==3.0.1 # via apache-airflow python-dateutil==2.8.2 @@ -393,20 +437,25 @@ pytz==2023.3 # redshift-connector pytzdata==2020.1 # via pendulum -pyyaml==6.0 +pyyaml==6.0.1 # via # apispec # clickclick # connexion -redshift-connector==2.0.911 +redshift-connector==2.0.913 # via apache-airflow-providers-amazon -regex==2023.6.3 +referencing==0.30.2 + # via + # jsonschema + # jsonschema-specifications +regex==2023.8.8 # via dateparser requests==2.31.0 # via # -r requirements/tasks/python/requirements.in # apache-airflow-providers-http # connexion + # opentelemetry-exporter-otlp-proto-http # pyairtable # redshift-connector # requests-toolbelt @@ -414,14 +463,18 @@ requests-toolbelt==1.0.0 # via apache-airflow-providers-http rfc3339-validator==0.1.4 # via apache-airflow -rich==13.4.1 +rich==13.5.2 # via # apache-airflow # flask-limiter # rich-argparse -rich-argparse==1.1.1 +rich-argparse==1.3.0 # via apache-airflow -s3transfer==0.6.1 +rpds-py==0.10.0 + # via + # jsonschema + # referencing +s3transfer==0.6.2 # via boto3 scramp==1.4.4 # via redshift-connector @@ -465,7 +518,7 @@ sqlparse==0.4.4 # via apache-airflow-providers-common-sql tabulate==0.9.0 # via apache-airflow -tenacity==8.2.2 +tenacity==8.2.3 # via # -r requirements/tasks/python/requirements.in # apache-airflow @@ -475,11 +528,11 @@ text-unidecode==1.3 # via python-slugify tld==0.13 # via courlan -tqdm==4.65.0 +tqdm==4.66.1 # via -r requirements/tasks/python/requirements.in trafilatura==1.6.1 # via -r requirements/tasks/python/requirements.in -typing-extensions==4.6.3 +typing-extensions==4.7.1 # via # alembic # apache-airflow @@ -487,6 
+540,11 @@ typing-extensions==4.6.3 # cattrs # flask-limiter # limits + # mypy-boto3-appflow + # mypy-boto3-rds + # mypy-boto3-redshift-data + # mypy-boto3-s3 + # opentelemetry-sdk # pyairtable # pydantic tzdata==2023.3 From 7c8053e67630f8e65bb85a6ef6321300590bce4b Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 11:06:07 +0200 Subject: [PATCH 09/34] docs --- pipeline/CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/CONTRIBUTING.md b/pipeline/CONTRIBUTING.md index 0a790e9cf..c6f893946 100644 --- a/pipeline/CONTRIBUTING.md +++ b/pipeline/CONTRIBUTING.md @@ -30,7 +30,7 @@ You can run dbt commands from your terminal. ```bash # install dbt -pipx install --include-deps dbt-postgres==1.4.5 +pipx install --include-deps dbt-postgres==1.6.1 # install extra dbt packages (e.g. dbt_utils) dbt deps From 2625606cf58b28c9d28b5dedce6099af2efd6fb8 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 11:07:30 +0200 Subject: [PATCH 10/34] chore(pipeline): bump pipx --- pipeline/requirements/tasks/pipx/requirements.in | 2 +- pipeline/requirements/tasks/pipx/requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline/requirements/tasks/pipx/requirements.in b/pipeline/requirements/tasks/pipx/requirements.in index 868cf8b3f..89cd6a74c 100644 --- a/pipeline/requirements/tasks/pipx/requirements.in +++ b/pipeline/requirements/tasks/pipx/requirements.in @@ -1 +1 @@ -pipx \ No newline at end of file +pipx~=1.2 \ No newline at end of file diff --git a/pipeline/requirements/tasks/pipx/requirements.txt b/pipeline/requirements/tasks/pipx/requirements.txt index 20105e3b6..87a8d7b7c 100644 --- a/pipeline/requirements/tasks/pipx/requirements.txt +++ b/pipeline/requirements/tasks/pipx/requirements.txt @@ -6,11 +6,11 @@ # argcomplete==3.1.1 # via pipx -click==8.1.3 +click==8.1.7 # via userpath packaging==23.1 # via pipx pipx==1.2.0 # via -r requirements/tasks/pipx/requirements.in -userpath==1.8.0 +userpath==1.9.0 # via pipx From 2bdf1da3ecdbf8c5b73b6716fcb65af41ca4a7c4 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 12:33:00 +0200 Subject: [PATCH 11/34] tests(dbt): add baseline to all domain int models --- .../intermediate/agefiph/_agefiph__models.yml | 2 +- .../intermediate/cd35/_cd35__models.yml | 20 +++++++++- .../intermediate/cd72/_cd72__models.yml | 20 +++++++++- .../_data_inclusion__models.yml | 2 +- .../intermediate/dora/_dora__models.yml | 38 ++++++++++++++++++- .../_emplois_de_linclusion__models.yml | 20 +++++++++- .../intermediate/finess/_finess__models.yml | 20 +++++++++- .../_immersion_facilitee__models.yml | 20 +++++++++- .../_mediation_numerique_models.yml | 38 ++++++++++++++++++- .../mes_aides/_mes_aides__models.yml | 20 +++++++++- .../monenfant/_monenfant__models.yml | 38 ++++++++++++++++++- .../intermediate/odspep/_odspep__models.yml | 35 ++++++++++++++++- .../intermediate/siao/_siao__models.yml | 20 +++++++++- .../soliguide/_soliguide__models.yml | 31 +++++++++++++++ .../staging/dora/stg_dora__services.sql | 13 ++++++- 15 files changed, 322 insertions(+), 15 deletions(-) diff --git a/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml b/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml index 87c0ce873..8bb6471ef 100644 --- a/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml +++ b/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml @@ -53,4 +53,4 @@ models: - not_null - relationships: to: ref('int_agefiph__adresses') - field: id 
\ No newline at end of file + field: id diff --git a/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml b/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml index c50dda219..f80244d46 100644 --- a/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml +++ b/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_cd35__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_cd35__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml b/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml index 9ba61cf56..f5cf7c449 100644 --- a/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml +++ b/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_cd72__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_cd72__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml b/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml index 6ebd9ba16..e25fb6a0d 100644 --- a/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml +++ b/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml @@ -55,4 +55,4 @@ models: - not_null - relationships: to: ref('int_data_inclusion__adresses') - field: id \ No newline at end of file + field: id diff --git a/pipeline/dbt/models/intermediate/dora/_dora__models.yml b/pipeline/dbt/models/intermediate/dora/_dora__models.yml index d1c4eaa50..622db6004 100644 --- a/pipeline/dbt/models/intermediate/dora/_dora__models.yml +++ b/pipeline/dbt/models/intermediate/dora/_dora__models.yml @@ -6,15 +6,51 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_dora__services tests: - check_service: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_dora__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_dora__adresses') + field: id - name: int_dora__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_dora__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml b/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml index 4a4ae0c77..ee5636969 100644 --- a/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml +++ 
b/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_emplois_de_linclusion__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_emplois_de_linclusion__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/finess/_finess__models.yml b/pipeline/dbt/models/intermediate/finess/_finess__models.yml index 2e0357fe6..db981f1ec 100644 --- a/pipeline/dbt/models/intermediate/finess/_finess__models.yml +++ b/pipeline/dbt/models/intermediate/finess/_finess__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_finess__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_finess__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml b/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml index 180cbb11d..13bcf3bed 100644 --- a/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml +++ b/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_immersion_facilitee__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_immersion_facilitee__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml b/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml index 57cf1db5e..3b6b5966d 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml +++ b/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml @@ -6,15 +6,51 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_mediation_numerique__services tests: - check_service: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_mediation_numerique__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_mediation_numerique__adresses') + field: id - name: int_mediation_numerique__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + 
to: ref('int_mediation_numerique__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml b/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml index d968bcc25..25e577d77 100644 --- a/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml +++ b/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_mes_aides__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_mes_aides__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml b/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml index c1dc583d7..03c007d3e 100644 --- a/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml +++ b/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml @@ -6,15 +6,51 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_monenfant__structures tests: - check_structure: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_monenfant__adresses') + field: id - name: int_monenfant__services tests: - check_service: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_monenfant__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_monenfant__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml b/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml index 64d10c05a..10c487fad 100644 --- a/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml +++ b/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml @@ -6,6 +6,12 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_odspep__services tests: @@ -15,14 +21,39 @@ models: columns: - name: id tests: - - not_null - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_odspep__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_odspep__adresses') + field: id - name: int_odspep__structures tests: - check_structure: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_odspep__adresses') + field: id - name: int_odspep__zones_diffusion description: This model aggregates all the *_RESSOURCE ODSPEP tables in the same table, aligned on a common set of columns @@ -43,4 +74,4 @@ models: - name: group_number description: an identifier for the group to quickly visualize groups tests: - - not_null \ No newline at end of file + - not_null diff --git 
a/pipeline/dbt/models/intermediate/siao/_siao__models.yml b/pipeline/dbt/models/intermediate/siao/_siao__models.yml index 0c79e4e50..1b340c95d 100644 --- a/pipeline/dbt/models/intermediate/siao/_siao__models.yml +++ b/pipeline/dbt/models/intermediate/siao/_siao__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_siao__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_siao__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml b/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml index b91e06336..e3e1c1790 100644 --- a/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml +++ b/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml @@ -6,12 +6,30 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_soliguide__structures tests: - check_structure: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_soliguide__adresses') + field: id - name: int_soliguide__services tests: @@ -23,3 +41,16 @@ models: tests: - unique - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_soliguide__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_soliguide__adresses') + field: id diff --git a/pipeline/dbt/models/staging/dora/stg_dora__services.sql b/pipeline/dbt/models/staging/dora/stg_dora__services.sql index 18da24908..4f5e8c3e4 100644 --- a/pipeline/dbt/models/staging/dora/stg_dora__services.sql +++ b/pipeline/dbt/models/staging/dora/stg_dora__services.sql @@ -2,7 +2,11 @@ WITH source AS ( SELECT * FROM {{ source('dora', 'services') }} ), -final AS ( +structures AS ( + SELECT * FROM {{ ref('stg_dora__structures') }} +), + +services AS ( SELECT _di_source_id AS "_di_source_id", (data ->> 'contact_public')::BOOLEAN AS "contact_public", @@ -43,6 +47,13 @@ final AS ( NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom", data ->> 'zone_diffusion_type' AS "zone_diffusion_type" FROM source +), + +-- dora removes suggested structures from its api, but does not remove the associated services +-- therefore filter these orphan services +final AS ( + SELECT services.* + FROM services INNER JOIN structures ON services.structure_id = structures.id ) SELECT * FROM final From d810574171bc33ef98c74490c737025ef0d1c4cc Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 12:48:45 +0200 Subject: [PATCH 12/34] chore(pipeline): bump dev deps --- .pre-commit-config.yaml | 6 +- .vscode/data-inclusion.code-workspace | 3 + pipeline/requirements/dev/requirements.in | 2 +- pipeline/requirements/dev/requirements.txt | 226 +++++++++++++-------- 4 files changed, 152 insertions(+), 85 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8276c672..9be462f70 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: # api - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 
23.7.0 hooks: - id: black name: api|black @@ -31,7 +31,7 @@ repos: # pipeline - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 23.7.0 hooks: - id: black name: pipeline|black @@ -95,7 +95,7 @@ repos: files: ^siretisation exclude: ^siretisation/django(/.*)*/static/vendor - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 23.7.0 hooks: - id: black name: siretisation|black diff --git a/.vscode/data-inclusion.code-workspace b/.vscode/data-inclusion.code-workspace index aba7014e6..5c2503da2 100644 --- a/.vscode/data-inclusion.code-workspace +++ b/.vscode/data-inclusion.code-workspace @@ -14,6 +14,9 @@ }, { "path": ".." + }, + { + "path": "../../dora-back" } ] } \ No newline at end of file diff --git a/pipeline/requirements/dev/requirements.in b/pipeline/requirements/dev/requirements.in index 27ae5a3e8..165b0fbb6 100644 --- a/pipeline/requirements/dev/requirements.in +++ b/pipeline/requirements/dev/requirements.in @@ -1,7 +1,7 @@ -r ../airflow/base.in -r ../tasks/python/requirements.in -pytest black pre-commit +pytest tox diff --git a/pipeline/requirements/dev/requirements.txt b/pipeline/requirements/dev/requirements.txt index 58bc698f3..bc03dd358 100644 --- a/pipeline/requirements/dev/requirements.txt +++ b/pipeline/requirements/dev/requirements.txt @@ -4,15 +4,15 @@ # # pip-compile --output-file=requirements/dev/requirements.txt --resolver=backtracking requirements/dev/requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via apache-airflow-providers-http aiosignal==1.3.1 # via aiohttp -alembic==1.11.1 +alembic==1.11.3 # via apache-airflow -anyio==3.7.0 +anyio==4.0.0 # via httpcore -apache-airflow[amazon,postgres]==2.6.1 +apache-airflow[amazon,postgres]==2.7.0 # via # -r requirements/dev/../airflow/base.in # apache-airflow-providers-amazon @@ -22,29 +22,31 @@ apache-airflow[amazon,postgres]==2.6.1 # apache-airflow-providers-imap # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-amazon==8.1.0 +apache-airflow-providers-amazon==8.6.0 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow -apache-airflow-providers-common-sql==1.5.1 +apache-airflow-providers-common-sql==1.7.1 # via # apache-airflow # apache-airflow-providers-amazon # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-ftp==3.4.1 +apache-airflow-providers-ftp==3.5.1 # via apache-airflow -apache-airflow-providers-http==4.4.1 - # via apache-airflow -apache-airflow-providers-imap==3.2.1 +apache-airflow-providers-http==4.5.1 + # via + # apache-airflow + # apache-airflow-providers-amazon +apache-airflow-providers-imap==3.3.1 # via apache-airflow -apache-airflow-providers-postgres==5.5.0 +apache-airflow-providers-postgres==5.6.0 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow -apache-airflow-providers-sqlite==3.4.1 +apache-airflow-providers-sqlite==3.4.3 # via apache-airflow -apispec[yaml]==5.2.2 +apispec[yaml]==6.3.0 # via flask-appbuilder argcomplete==3.1.1 # via apache-airflow @@ -55,7 +57,7 @@ asgiref==3.7.2 # apache-airflow-providers-http asn1crypto==1.5.1 # via scramp -async-timeout==4.0.2 +async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via @@ -64,21 +66,29 @@ attrs==23.1.0 # cattrs # fiona # jsonschema + # referencing babel==2.12.1 # via flask-babel +backoff==2.2.1 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +backports-datetime-fromisoformat==2.0.0 + # via htmldate beautifulsoup4==4.12.2 # via redshift-connector -black==23.3.0 
+black==23.7.0 # via -r requirements/dev/requirements.in blinker==1.6.2 # via apache-airflow -boto3==1.26.151 +boto3==1.28.38 # via # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.29.151 +botocore==1.31.38 # via + # apache-airflow-providers-amazon # boto3 # redshift-connector # s3transfer @@ -90,7 +100,7 @@ cachetools==5.3.1 # via tox cattrs==23.1.2 # via apache-airflow -certifi==2023.5.7 +certifi==2023.7.22 # via # fiona # httpcore @@ -100,17 +110,17 @@ certifi==2023.5.7 # trafilatura cffi==1.15.1 # via cryptography -cfgv==3.3.1 +cfgv==3.4.0 # via pre-commit -chardet==5.1.0 +chardet==5.2.0 # via tox -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via # aiohttp # htmldate # requests # trafilatura -click==8.1.3 +click==8.1.7 # via # black # click-plugins @@ -139,9 +149,9 @@ courlan==0.9.3 # via trafilatura cron-descriptor==1.4.0 # via apache-airflow -croniter==1.3.15 +croniter==1.4.1 # via apache-airflow -cryptography==41.0.1 +cryptography==41.0.3 # via apache-airflow dateparser==1.1.8 # via htmldate @@ -151,11 +161,12 @@ deprecated==1.2.14 # via # apache-airflow # limits -dill==0.3.6 + # opentelemetry-api +dill==0.3.7 # via apache-airflow -distlib==0.3.6 +distlib==0.3.7 # via virtualenv -dnspython==2.3.0 +dnspython==2.4.2 # via email-validator docutils==0.20.1 # via python-daemon @@ -163,12 +174,12 @@ email-validator==1.3.1 # via flask-appbuilder et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.1.1 +exceptiongroup==1.1.3 # via # anyio # cattrs # pytest -filelock==3.12.1 +filelock==3.12.3 # via # tox # virtualenv @@ -187,7 +198,7 @@ flask==2.2.5 # flask-session # flask-sqlalchemy # flask-wtf -flask-appbuilder==4.3.0 +flask-appbuilder==4.3.3 # via apache-airflow flask-babel==2.0.0 # via flask-appbuilder @@ -195,7 +206,7 @@ flask-caching==2.0.2 # via apache-airflow flask-jwt-extended==4.5.2 # via flask-appbuilder -flask-limiter==3.3.1 +flask-limiter==3.5.0 # via flask-appbuilder flask-login==0.6.2 # via @@ -209,29 +220,37 @@ flask-wtf==1.1.1 # via # apache-airflow # flask-appbuilder -frozenlist==1.3.3 +frozenlist==1.4.0 # via # aiohttp # aiosignal -geoalchemy2==0.13.3 +geoalchemy2==0.14.1 # via -r requirements/dev/../tasks/python/requirements.in geopandas==0.13.2 # via -r requirements/dev/../tasks/python/requirements.in +google-re2==1.1 + # via apache-airflow +googleapis-common-protos==1.60.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http graphviz==0.20.1 # via apache-airflow greenlet==2.0.2 # via sqlalchemy -gunicorn==20.1.0 +grpcio==1.57.0 + # via opentelemetry-exporter-otlp-proto-grpc +gunicorn==21.2.0 # via apache-airflow h11==0.14.0 # via httpcore -htmldate==1.4.3 +htmldate==1.5.0 # via trafilatura -httpcore==0.17.2 +httpcore==0.17.3 # via httpx httpx==0.24.1 # via apache-airflow -identify==2.5.24 +identify==2.5.27 # via pre-commit idna==3.4 # via @@ -240,10 +259,12 @@ idna==3.4 # httpx # requests # yarl -importlib-resources==5.12.0 +importlib-resources==6.0.1 # via limits inflection==0.5.1 - # via connexion + # via + # connexion + # pyairtable iniconfig==2.0.0 # via pytest itsdangerous==2.1.2 @@ -264,11 +285,13 @@ jmespath==1.0.1 # botocore jsonpath-ng==1.5.3 # via apache-airflow-providers-amazon -jsonschema==4.17.3 +jsonschema==4.19.0 # via # apache-airflow # connexion # flask-appbuilder +jsonschema-specifications==2023.7.1 + # via jsonschema justext==3.0.0 # via trafilatura langcodes==3.3.0 @@ -283,7 +306,7 @@ lockfile==0.12.2 # via # apache-airflow # python-daemon -lxml==4.9.2 +lxml==4.9.3 # 
via # htmldate # justext @@ -291,9 +314,9 @@ lxml==4.9.2 # trafilatura mako==1.2.4 # via alembic -markdown==3.4.3 +markdown==3.4.4 # via apache-airflow -markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via # apache-airflow # mdit-py-plugins @@ -305,14 +328,11 @@ markupsafe==2.1.3 # mako # werkzeug # wtforms -marshmallow==3.19.0 +marshmallow==3.20.1 # via # flask-appbuilder - # marshmallow-enum # marshmallow-oneofschema # marshmallow-sqlalchemy -marshmallow-enum==1.5.1 - # via flask-appbuilder marshmallow-oneofschema==3.0.1 # via apache-airflow marshmallow-sqlalchemy==0.26.1 @@ -325,33 +345,57 @@ multidict==6.0.4 # via # aiohttp # yarl -mypy-boto3-appflow==1.26.145 +mypy-boto3-appflow==1.28.38 # via apache-airflow-providers-amazon -mypy-boto3-rds==1.26.144 +mypy-boto3-rds==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-redshift-data==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-s3==1.26.127 +mypy-boto3-s3==1.28.36 # via apache-airflow-providers-amazon mypy-extensions==1.0.0 # via black nodeenv==1.8.0 # via pre-commit -numpy==1.24.3 +numpy==1.25.2 # via # pandas # shapely openpyxl==3.1.2 # via -r requirements/dev/../tasks/python/requirements.in +opentelemetry-api==1.15.0 + # via + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk +opentelemetry-exporter-otlp==1.15.0 + # via apache-airflow +opentelemetry-exporter-otlp-proto-grpc==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-proto==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.36b0 + # via opentelemetry-sdk ordered-set==4.1.0 # via flask-limiter packaging==23.1 # via # apache-airflow + # apispec # black # connexion # geoalchemy2 # geopandas + # gunicorn # limits # marshmallow # pyproject-api @@ -359,62 +403,66 @@ packaging==23.1 # redshift-connector # sqlalchemy-redshift # tox -pandas==2.0.2 +pandas==2.1.0 # via # -r requirements/dev/../tasks/python/requirements.in # geopandas -pathspec==0.9.0 +pathspec==0.11.2 # via # apache-airflow # black pendulum==2.1.2 # via apache-airflow -platformdirs==3.5.3 +platformdirs==3.10.0 # via # black # tox # virtualenv -pluggy==1.0.0 +pluggy==1.3.0 # via # apache-airflow # pytest # tox ply==3.11 # via jsonpath-ng -pre-commit==3.3.2 +pre-commit==3.3.3 # via -r requirements/dev/requirements.in prison==0.2.1 # via flask-appbuilder +protobuf==4.24.2 + # via + # googleapis-common-protos + # opentelemetry-proto psutil==5.9.5 # via apache-airflow -psycopg2==2.9.6 +psycopg2==2.9.7 # via # -r requirements/dev/../airflow/base.in # -r requirements/dev/../tasks/python/requirements.in -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via apache-airflow-providers-postgres -pyairtable==1.5.0 +pyairtable==2.1.0.post1 # via -r requirements/dev/../tasks/python/requirements.in pycparser==2.21 # via cffi -pydantic==1.10.9 - # via apache-airflow -pygments==2.15.1 +pydantic==1.10.12 + # via + # apache-airflow + # pyairtable +pygments==2.16.1 # via # apache-airflow # rich -pyjwt==2.7.0 +pyjwt==2.8.0 # via # apache-airflow # flask-appbuilder # flask-jwt-extended -pyproj==3.5.0 +pyproj==3.6.0 # via geopandas -pyproject-api==1.5.1 +pyproject-api==1.6.1 # via tox -pyrsistent==0.19.3 - # via 
jsonschema -pytest==7.3.2 +pytest==7.4.0 # via -r requirements/dev/requirements.in python-daemon==3.0.1 # via apache-airflow @@ -442,21 +490,26 @@ pytz==2023.3 # redshift-connector pytzdata==2020.1 # via pendulum -pyyaml==6.0 +pyyaml==6.0.1 # via # apispec # clickclick # connexion # pre-commit -redshift-connector==2.0.911 +redshift-connector==2.0.913 # via apache-airflow-providers-amazon -regex==2023.6.3 +referencing==0.30.2 + # via + # jsonschema + # jsonschema-specifications +regex==2023.8.8 # via dateparser requests==2.31.0 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow-providers-http # connexion + # opentelemetry-exporter-otlp-proto-http # pyairtable # redshift-connector # requests-toolbelt @@ -464,14 +517,18 @@ requests-toolbelt==1.0.0 # via apache-airflow-providers-http rfc3339-validator==0.1.4 # via apache-airflow -rich==13.4.1 +rich==13.5.2 # via # apache-airflow # flask-limiter # rich-argparse -rich-argparse==1.1.1 +rich-argparse==1.3.0 # via apache-airflow -s3transfer==0.6.1 +rpds-py==0.10.0 + # via + # jsonschema + # referencing +s3transfer==0.6.2 # via boto3 scramp==1.4.4 # via redshift-connector @@ -493,7 +550,7 @@ sniffio==1.3.0 # httpx soupsieve==2.4.1 # via beautifulsoup4 -sqlalchemy==1.4.48 +sqlalchemy==1.4.49 # via # -r requirements/dev/../tasks/python/requirements.in # alembic @@ -515,7 +572,7 @@ sqlparse==0.4.4 # via apache-airflow-providers-common-sql tabulate==0.9.0 # via apache-airflow -tenacity==8.2.2 +tenacity==8.2.3 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow @@ -531,20 +588,27 @@ tomli==2.0.1 # pyproject-api # pytest # tox -tox==4.6.0 +tox==4.11.0 # via -r requirements/dev/requirements.in -tqdm==4.65.0 +tqdm==4.66.1 # via -r requirements/dev/../tasks/python/requirements.in trafilatura==1.6.1 # via -r requirements/dev/../tasks/python/requirements.in -typing-extensions==4.6.3 +typing-extensions==4.7.1 # via # alembic # apache-airflow # asgiref # cattrs + # filelock # flask-limiter # limits + # mypy-boto3-appflow + # mypy-boto3-rds + # mypy-boto3-redshift-data + # mypy-boto3-s3 + # opentelemetry-sdk + # pyairtable # pydantic tzdata==2023.3 # via pandas @@ -563,7 +627,7 @@ urllib3==1.26.16 # pyairtable # requests # trafilatura -virtualenv==20.23.0 +virtualenv==20.24.4 # via # pre-commit # tox From 6c8302edd70faa4802102514175719eff91e81e6 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 31 Aug 2023 12:52:20 +0200 Subject: [PATCH 13/34] chore(qa): enforce precommits --- pipeline/src/data_inclusion/scripts/tasks/soliguide.py | 1 - siretisation/django/annotation/migrations/0001_initial.py | 1 - ...02_annotation_closed_annotation_irrelevant_and_more.py | 1 - .../annotation/migrations/0003_annotation_created_by.py | 1 - .../django/annotation/migrations/0004_dataset_slug.py | 1 - .../django/annotation/migrations/0005_enable_unaccent.py | 1 - .../annotation/migrations/0006_ds_priority_settings.py | 1 - .../migrations/0007_datasetrow_similar_address.py | 1 - .../0008_dataset_show_nearby_cnfs_permanences_and_more.py | 1 - siretisation/django/annotation/migrations/0009_source.py | 1 - .../annotation/migrations/0010_data_from_warehouse.py | 1 - .../django/cnfs/migrations/0001_add_cnfs_permanences.py | 1 - siretisation/django/matching/migrations/0001_initial.py | 8 ++++---- .../matching/migrations/0002_add_no_matching_row_flag.py | 1 - siretisation/django/sirene/migrations/0001_initial.py | 1 - ...02_remove_establishment_full_text_trgm_idx_and_more.py | 1 - 
siretisation/django/sirene/migrations/0003_add_postgis.py | 1 - .../django/sirene/migrations/0004_add_codenaf_table.py | 1 - .../django/sirene/migrations/0005_import_codenaf.py | 1 - .../sirene/migrations/0006_alter_establishment_ape.py | 1 - siretisation/django/users/migrations/0001_initial.py | 5 ++--- 21 files changed, 6 insertions(+), 26 deletions(-) diff --git a/pipeline/src/data_inclusion/scripts/tasks/soliguide.py b/pipeline/src/data_inclusion/scripts/tasks/soliguide.py index af3b2150a..029eb3046 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/soliguide.py +++ b/pipeline/src/data_inclusion/scripts/tasks/soliguide.py @@ -110,7 +110,6 @@ def html_to_markdown(s: Optional[str]) -> Optional[str]: def read(path: Path) -> pd.DataFrame: - # utils.read_json is enough # but this adds the conversion of descriptions from html to markdown # should eventually be implemented as a python dbt model diff --git a/siretisation/django/annotation/migrations/0001_initial.py b/siretisation/django/annotation/migrations/0001_initial.py index a020dbc5f..da887ce09 100644 --- a/siretisation/django/annotation/migrations/0001_initial.py +++ b/siretisation/django/annotation/migrations/0001_initial.py @@ -7,7 +7,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [ diff --git a/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py b/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py index 789ca3c0f..8ddad4348 100644 --- a/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py +++ b/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0001_initial"), ] diff --git a/siretisation/django/annotation/migrations/0003_annotation_created_by.py b/siretisation/django/annotation/migrations/0003_annotation_created_by.py index 54e8ae34f..eeca94094 100644 --- a/siretisation/django/annotation/migrations/0003_annotation_created_by.py +++ b/siretisation/django/annotation/migrations/0003_annotation_created_by.py @@ -6,7 +6,6 @@ class Migration(migrations.Migration): - dependencies = [ migrations.swappable_dependency(settings.AUTH_USER_MODEL), ("annotation", "0002_annotation_closed_annotation_irrelevant_and_more"), diff --git a/siretisation/django/annotation/migrations/0004_dataset_slug.py b/siretisation/django/annotation/migrations/0004_dataset_slug.py index 488d5716f..5780835c4 100644 --- a/siretisation/django/annotation/migrations/0004_dataset_slug.py +++ b/siretisation/django/annotation/migrations/0004_dataset_slug.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0003_annotation_created_by"), ] diff --git a/siretisation/django/annotation/migrations/0005_enable_unaccent.py b/siretisation/django/annotation/migrations/0005_enable_unaccent.py index 5072f937c..cb8772e04 100644 --- a/siretisation/django/annotation/migrations/0005_enable_unaccent.py +++ b/siretisation/django/annotation/migrations/0005_enable_unaccent.py @@ -3,7 +3,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0004_dataset_slug"), ] diff --git a/siretisation/django/annotation/migrations/0006_ds_priority_settings.py b/siretisation/django/annotation/migrations/0006_ds_priority_settings.py index 2e5cb16c4..6fffcfdcc 100644 --- a/siretisation/django/annotation/migrations/0006_ds_priority_settings.py 
+++ b/siretisation/django/annotation/migrations/0006_ds_priority_settings.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0005_enable_unaccent"), ] diff --git a/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py b/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py index f1daf4b76..bcd03efd6 100644 --- a/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py +++ b/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0006_ds_priority_settings"), ] diff --git a/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py b/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py index d67faa837..85e03f3e2 100644 --- a/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py +++ b/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0007_datasetrow_similar_address"), ] diff --git a/siretisation/django/annotation/migrations/0009_source.py b/siretisation/django/annotation/migrations/0009_source.py index 77d03fee3..4e932fba3 100644 --- a/siretisation/django/annotation/migrations/0009_source.py +++ b/siretisation/django/annotation/migrations/0009_source.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0008_dataset_show_nearby_cnfs_permanences_and_more"), ] diff --git a/siretisation/django/annotation/migrations/0010_data_from_warehouse.py b/siretisation/django/annotation/migrations/0010_data_from_warehouse.py index 9494f83e7..29e135c3b 100644 --- a/siretisation/django/annotation/migrations/0010_data_from_warehouse.py +++ b/siretisation/django/annotation/migrations/0010_data_from_warehouse.py @@ -15,7 +15,6 @@ def migrate_data(apps, _) -> None: class Migration(migrations.Migration): - dependencies = [ ("annotation", "0009_source"), ] diff --git a/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py b/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py index 5aaf4a798..966cbb631 100644 --- a/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py +++ b/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py @@ -7,7 +7,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [] diff --git a/siretisation/django/matching/migrations/0001_initial.py b/siretisation/django/matching/migrations/0001_initial.py index e44910bf2..ba338a80b 100644 --- a/siretisation/django/matching/migrations/0001_initial.py +++ b/siretisation/django/matching/migrations/0001_initial.py @@ -1,14 +1,14 @@ # Generated by Django 4.1.3 on 2023-01-31 18:34 -from django.conf import settings +import uuid + import django.contrib.postgres.fields -from django.db import migrations, models import django.db.models.deletion -import uuid +from django.conf import settings +from django.db import migrations, models class Migration(migrations.Migration): - initial = True dependencies = [ diff --git a/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py b/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py index 3f939c957..cded5f515 100644 --- a/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py +++ 
b/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("matching", "0001_initial"), ] diff --git a/siretisation/django/sirene/migrations/0001_initial.py b/siretisation/django/sirene/migrations/0001_initial.py index 1e073dfe7..5364f1928 100644 --- a/siretisation/django/sirene/migrations/0001_initial.py +++ b/siretisation/django/sirene/migrations/0001_initial.py @@ -6,7 +6,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [] diff --git a/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py b/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py index 99d39e699..24d5ceb6c 100644 --- a/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py +++ b/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py @@ -5,7 +5,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0001_initial"), ] diff --git a/siretisation/django/sirene/migrations/0003_add_postgis.py b/siretisation/django/sirene/migrations/0003_add_postgis.py index 4efa86317..b699fb484 100644 --- a/siretisation/django/sirene/migrations/0003_add_postgis.py +++ b/siretisation/django/sirene/migrations/0003_add_postgis.py @@ -3,7 +3,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0002_remove_establishment_full_text_trgm_idx_and_more"), ] diff --git a/siretisation/django/sirene/migrations/0004_add_codenaf_table.py b/siretisation/django/sirene/migrations/0004_add_codenaf_table.py index 5e945f03d..8273e5c38 100644 --- a/siretisation/django/sirene/migrations/0004_add_codenaf_table.py +++ b/siretisation/django/sirene/migrations/0004_add_codenaf_table.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0003_add_postgis"), ] diff --git a/siretisation/django/sirene/migrations/0005_import_codenaf.py b/siretisation/django/sirene/migrations/0005_import_codenaf.py index 61c8a5c5c..6d1b2353c 100644 --- a/siretisation/django/sirene/migrations/0005_import_codenaf.py +++ b/siretisation/django/sirene/migrations/0005_import_codenaf.py @@ -78,7 +78,6 @@ def import_naf(apps, _) -> None: class Migration(migrations.Migration): - dependencies = [ ("sirene", "0004_add_codenaf_table"), ] diff --git a/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py b/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py index e80dcd812..33b6f3286 100644 --- a/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py +++ b/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py @@ -6,7 +6,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0005_import_codenaf"), ] diff --git a/siretisation/django/users/migrations/0001_initial.py b/siretisation/django/users/migrations/0001_initial.py index dfdf49fcb..906102260 100644 --- a/siretisation/django/users/migrations/0001_initial.py +++ b/siretisation/django/users/migrations/0001_initial.py @@ -5,7 +5,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [ @@ -41,7 +40,7 @@ class Migration(migrations.Migration): "is_active", models.BooleanField( default=True, - help_text="Designates whether this user should be treated as active. Unselect this instead of deleting accounts.", + help_text="Designates whether this user should be treated as active. 
Unselect this instead of deleting accounts.", # noqa: E501
                     verbose_name="active",
                 ),
             ),
@@ -51,7 +50,7 @@ class Migration(migrations.Migration):
                 "groups",
                 models.ManyToManyField(
                     blank=True,
-                    help_text="The groups this user belongs to. A user will get all permissions granted to each of their groups.",
+                    help_text="The groups this user belongs to. A user will get all permissions granted to each of their groups.", # noqa: E501
                     related_name="user_set",
                     related_query_name="user",
                     to="auth.group",

From 99daf448fabeb24849f32c88bca51e3ec427478d Mon Sep 17 00:00:00 2001
From: Valentin Matton
Date: Thu, 31 Aug 2023 14:41:52 +0200
Subject: [PATCH 14/34] fix: use contact data for di

---
 .../data_inclusion/int_data_inclusion__services.sql | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql
index 75e77a173..34ccef117 100644
--- a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql
+++ b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql
@@ -17,9 +17,9 @@ di_profil_by_dora_profil AS (
 final AS (
     SELECT
         id AS "adresse_id",
-        contact_public AS "contact_public",
-        NULL AS "contact_nom_prenom",
-        NULL AS "courriel",
+        TRUE AS "contact_public",
+        contact_nom_prenom AS "contact_nom_prenom",
+        courriel AS "courriel",
         cumulable AS "cumulable",
         date_creation::DATE AS "date_creation",
         date_maj::DATE AS "date_maj",

From 132ca506e4906bee7c2606b6452323eaaa36ac75 Mon Sep 17 00:00:00 2001
From: Valentin Matton
Date: Fri, 1 Sep 2023 11:43:34 +0200
Subject: [PATCH 15/34] chore(odspep): exclude res before 2021

---
 .../staging/odspep/stg_odspep__res_partenariales.sql | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql b/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql
index 49c08c403..3dde09610 100644
--- a/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql
+++ b/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql
@@ -3,7 +3,7 @@ WITH source AS (
     FROM {{ source('odspep', 'DD009_RES_PARTENARIALE') }}
 ),
 
-final AS (
+ressources_partenariales AS (
     SELECT
         "ID_RES" AS "id",
         "ID_RES" AS "id_res",
@@ -30,6 +30,12 @@
         TO_DATE("DATE_FIN_VALID_RSP", 'YYYY-MM-DD') AS "date_fin_valid",
         TO_DATE("DATE_DERNIERE_MODIF_RSP", 'YYYY-MM-DD') AS "date_derniere_modif"
     FROM source
+),
+
+final AS (
+    SELECT *
+    FROM ressources_partenariales
+    WHERE date_derniere_modif IS NOT NULL AND EXTRACT(YEAR FROM date_derniere_modif) >= 2021
 )
 
 SELECT * FROM final

From 554299efafdf4c67de944cfec044bf345fae73c3 Mon Sep 17 00:00:00 2001
From: Valentin Matton
Date: Fri, 1 Sep 2023 13:04:37 +0200
Subject: [PATCH 16/34] fix(airflow): urlencode conn strings

---
 .template.env | 2 ++
 docker-compose.yml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.template.env b/.template.env
index f4fdc1c39..cd9c1f8d6 100644
--- a/.template.env
+++ b/.template.env
@@ -35,6 +35,8 @@ ANNUAIRE_ENTREPRISES_API_URL=https://recherche-entreprises.api.gouv.fr
 
 ### sources ###
 
+# airflow connection strings *must* be urlencoded (using `urllib.parse.urlencode` for instance)
+
 AGEFIPH_SERVICES_API_URL=https://www.agefiph.fr/jsonapi/node/aide_service
 AGEFIPH_STRUCTURES_FILE_URL=
 AIRFLOW_CONN_S3_SOURCES=
diff --git a/docker-compose.yml b/docker-compose.yml
index 09d027d9f..0c5ebbcc1 
100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,7 +17,7 @@ x-airflow-common: AIRFLOW__WEBSERVER__WORKERS: 1 # Connections - AIRFLOW_CONN_S3: aws://@/data-inclusion-lake?endpoint_url=http://minio:9000&aws_access_key_id=minioadmin&aws_secret_access_key=minioadmin + AIRFLOW_CONN_S3: aws://@/data-inclusion-lake?endpoint_url=http%3A%2F%2Fminio%3A9000&aws_access_key_id=minioadmin&aws_secret_access_key=minioadmin AIRFLOW_CONN_S3_SOURCES: ${AIRFLOW_CONN_S3_SOURCES} AIRFLOW_CONN_PG: postgresql://data-inclusion:data-inclusion@target-db:5432/data-inclusion From be66beb210e4ddcb60a918b7a42a6267d62d3bd0 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Fri, 1 Sep 2023 14:08:11 +0200 Subject: [PATCH 17/34] fix(dbt): contact --- .../data_inclusion/int_data_inclusion__services.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql index 34ccef117..7a6436a58 100644 --- a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql +++ b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql @@ -18,7 +18,7 @@ final AS ( SELECT id AS "adresse_id", TRUE AS "contact_public", - contact_nom_prenom AS "contact_nom_prenom", + contact_nom AS "contact_nom_prenom", courriel AS "courriel", cumulable AS "cumulable", date_creation::DATE AS "date_creation", From a9f35740f09bb4f30fbdebe37fa8af1a4b3d9a3c Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Fri, 1 Sep 2023 15:38:31 +0200 Subject: [PATCH 18/34] fix(dbt): test int mednum uniqueness --- .../_mediation_numerique_models.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml b/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml index 3b6b5966d..0a84c83b7 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml +++ b/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml @@ -3,25 +3,31 @@ version: 2 models: - name: int_mediation_numerique__adresses tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - source + - id - check_adresse: config: severity: warn columns: - name: id tests: - - unique - not_null - dbt_utils.not_empty_string - name: int_mediation_numerique__services tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - source + - id - check_service: config: severity: warn columns: - name: id tests: - - unique - not_null - dbt_utils.not_empty_string - name: structure_id @@ -39,13 +45,16 @@ models: - name: int_mediation_numerique__structures tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - source + - id - check_structure: config: severity: warn columns: - name: id tests: - - unique - not_null - dbt_utils.not_empty_string - name: adresse_id From d0f3907611b0abc6560ed53fbd476b3a855f8ce0 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Wed, 6 Sep 2023 16:37:46 +0200 Subject: [PATCH 19/34] fix(soliguide): open services --- .../models/intermediate/soliguide/int_soliguide__services.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql index 174845b09..550f1cc77 100644 --- 
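For context on the dbt_utils.unique_combination_of_columns tests added in the patch above: each one compiles to a grouped duplicate check, roughly the sketch below (unqualified model name used as the table for brevity):

    -- Rows returned here are test failures: (source, id) pairs seen twice.
    SELECT source, id
    FROM int_mediation_numerique__structures
    GROUP BY source, id
    HAVING COUNT(*) > 1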
a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql +++ b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql @@ -91,6 +91,8 @@ open_services AS ( SELECT * FROM relevant_services WHERE + NOT close__actif + OR (close__date_debut IS NOT NULL OR close__date_fin IS NOT NULL) AND ( From 4591e0b794eee9f35249a3f042c8cd992dd2abd3 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 7 Sep 2023 11:59:53 +0200 Subject: [PATCH 20/34] feat(mednum): modes_orientation* & zone_diffusion* --- .../intermediate/int__union_services__enhanced.sql | 4 ++-- .../int_mediation_numerique__services.sql | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql index 2dfed33a2..f26b0dde5 100644 --- a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql +++ b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql @@ -14,11 +14,11 @@ adresses AS ( services_with_zone_diffusion AS ( SELECT {{ dbt_utils.star(from=ref('int__union_services'), relation_alias='services', except=["zone_diffusion_code", "zone_diffusion_nom"]) }}, - CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) + CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' WHEN TRUE THEN adresses.result_citycode ELSE services.zone_diffusion_code END AS "zone_diffusion_code", - CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) + CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' WHEN TRUE THEN adresses.commune ELSE services.zone_diffusion_nom END AS "zone_diffusion_nom" diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql index 225f4026a..91a4f742f 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql +++ b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql @@ -124,11 +124,17 @@ final AS ( TRUE AS "contact_public", NULL AS "contact_nom_prenom", CAST(structures.date_maj AS DATE) AS "date_maj", - NULL AS "zone_diffusion_type", + 'departement' AS "zone_diffusion_type", NULL AS "zone_diffusion_code", NULL AS "zone_diffusion_nom", - CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", - CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN structures.telephone IS NOT NULL THEN 'telephoner' END, + CASE WHEN structures.courriel IS NOT NULL THEN 'envoyer-un-mail' END + ], + NULL + ) AS "modes_orientation_accompagnateur", + ARRAY_REMOVE(ARRAY[CASE WHEN structures.telephone IS NOT NULL THEN 'telephoner' END], NULL) AS "modes_orientation_beneficiaire", CAST(NULL AS TEXT) AS "frais_autres", CASE WHEN CARDINALITY(services.types) > 0 THEN services.types ELSE ARRAY['accompagnement'] END AS "types", ARRAY['en-presentiel'] AS "modes_accueil", From 7c64040136df091e59a4b279c525682bcf1d2ae2 Mon Sep 17 00:00:00 2001 From: vmttn Date: Tue, 12 Sep 2023 10:49:05 +0200 Subject: [PATCH 21/34] feat: upgrade schema to v0.10.0 (#117) --- api/requirements/dev-requirements.txt | 2 +- api/requirements/requirements.txt | 2 +- api/requirements/test-requirements.txt | 2 +- api/setup.py | 2 +- .../alembic/versions/06e3e22e0541_v0_10_0.py | 42 +++++++++++++ 
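The modes_orientation_* arrays introduced in the mediation-numerique patch above rely on a conditional-array idiom: each CASE contributes its value or NULL, and ARRAY_REMOVE then strips the NULLs. With illustrative literal conditions:

    -- Sketch of the ARRAY_REMOVE(ARRAY[CASE ...], NULL) idiom.
    SELECT ARRAY_REMOVE(
        ARRAY[
            CASE WHEN TRUE THEN 'telephoner' END,       -- kept
            CASE WHEN FALSE THEN 'envoyer-un-mail' END  -- NULL, stripped
        ],
        NULL
    );
    -- => {telephoner}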
.../data_inclusion/api/entrypoints/fastapi.py | 20 ++++--- api/src/data_inclusion/api/models.py | 6 +- api/src/data_inclusion/api/schema.py | 59 ++++++++----------- api/tests/inclusion/factories.py | 6 +- api/tests/inclusion/test_api.py | 26 ++++---- .../macros/domain/checks/check_service.sql | 10 +++- .../agefiph/int_agefiph__services.sql | 6 +- .../int_data_inclusion__services.sql | 2 + .../intermediate/dora/int_dora__services.sql | 2 + .../intermediate/int__union_services.sql | 2 + .../int__union_services__enhanced.sql | 2 + .../int_mediation_numerique__services.sql | 6 +- .../monenfant/int_monenfant__services.sql | 12 ++-- .../odspep/int_odspep__services.sql | 6 +- .../soliguide/int_soliguide__services.sql | 6 +- pipeline/dbt/models/marts/api/_api_models.yml | 8 ++- .../stg_data_inclusion__services.sql | 4 +- .../staging/dora/stg_dora__services.sql | 4 +- .../dbt/seeds/schema/labels_nationaux.csv | 1 + pipeline/dbt/seeds/schema/profils.csv | 4 +- pipeline/dbt/seeds/schema/thematiques.csv | 14 ++--- ...s_cog.csv => zones_de_diffusion_types.csv} | 0 27 files changed, 164 insertions(+), 92 deletions(-) create mode 100644 api/src/alembic/versions/06e3e22e0541_v0_10_0.py rename pipeline/dbt/seeds/schema/{types_cog.csv => zones_de_diffusion_types.csv} (100%) diff --git a/api/requirements/dev-requirements.txt b/api/requirements/dev-requirements.txt index d71fbd407..182748962 100644 --- a/api/requirements/dev-requirements.txt +++ b/api/requirements/dev-requirements.txt @@ -43,7 +43,7 @@ cryptography==41.0.2 # via # data-inclusion-api (setup.py) # python-jose -data-inclusion-schema==0.9.1 +data-inclusion-schema==0.10.0 # via data-inclusion-api (setup.py) distlib==0.3.7 # via virtualenv diff --git a/api/requirements/requirements.txt b/api/requirements/requirements.txt index 906689481..a4224777b 100644 --- a/api/requirements/requirements.txt +++ b/api/requirements/requirements.txt @@ -32,7 +32,7 @@ cryptography==41.0.2 # via # data-inclusion-api (setup.py) # python-jose -data-inclusion-schema==0.9.1 +data-inclusion-schema==0.10.0 # via data-inclusion-api (setup.py) dnspython==2.4.1 # via email-validator diff --git a/api/requirements/test-requirements.txt b/api/requirements/test-requirements.txt index 9210fc276..897e641a3 100644 --- a/api/requirements/test-requirements.txt +++ b/api/requirements/test-requirements.txt @@ -43,7 +43,7 @@ cryptography==41.0.2 # via # data-inclusion-api (setup.py) # python-jose -data-inclusion-schema==0.9.1 +data-inclusion-schema==0.10.0 # via data-inclusion-api (setup.py) dnspython==2.4.1 # via email-validator diff --git a/api/setup.py b/api/setup.py index 1e1b413c1..47525acd0 100644 --- a/api/setup.py +++ b/api/setup.py @@ -30,7 +30,7 @@ "sentry-sdk[fastapi]", "sqlalchemy", "uvicorn[standard]", - "data-inclusion-schema==0.9.1", + "data-inclusion-schema==0.10.0", ], extras_require={ "test": [ diff --git a/api/src/alembic/versions/06e3e22e0541_v0_10_0.py b/api/src/alembic/versions/06e3e22e0541_v0_10_0.py new file mode 100644 index 000000000..dc3b1a4bf --- /dev/null +++ b/api/src/alembic/versions/06e3e22e0541_v0_10_0.py @@ -0,0 +1,42 @@ +"""v0.10.0 + +Revision ID: 06e3e22e0541 +Revises: 7f177bfb0108 +Create Date: 2023-09-11 15:34:37.042108 + +""" +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. 
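The new migration below switches pre_requis and justificatifs from text to text arrays by dropping and re-adding the columns rather than casting in place (the API tables are presumably repopulated from the pipeline afterwards, so no data is migrated). As a sketch, the equivalent raw DDL is roughly:

    -- Rough DDL equivalent of the upgrade() below (sketch).
    ALTER TABLE service ADD COLUMN modes_orientation_accompagnateur_autres TEXT;
    ALTER TABLE service ADD COLUMN modes_orientation_beneficiaire_autres TEXT;
    ALTER TABLE service DROP COLUMN pre_requis;
    ALTER TABLE service DROP COLUMN justificatifs;
    ALTER TABLE service ADD COLUMN pre_requis TEXT[];
    ALTER TABLE service ADD COLUMN justificatifs TEXT[];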
+revision = "06e3e22e0541" +down_revision = "7f177bfb0108" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "service", + sa.Column("modes_orientation_accompagnateur_autres", sa.Text(), nullable=True), + ) + op.add_column( + "service", + sa.Column("modes_orientation_beneficiaire_autres", sa.Text(), nullable=True), + ) + op.drop_column("service", "pre_requis") + op.drop_column("service", "justificatifs") + op.add_column( + "service", + sa.Column("pre_requis", postgresql.ARRAY(sa.Text()), nullable=True), + ) + op.add_column( + "service", + sa.Column("justificatifs", postgresql.ARRAY(sa.Text()), nullable=True), + ) + + +def downgrade() -> None: + pass diff --git a/api/src/data_inclusion/api/entrypoints/fastapi.py b/api/src/data_inclusion/api/entrypoints/fastapi.py index e0c32ab42..90107100a 100644 --- a/api/src/data_inclusion/api/entrypoints/fastapi.py +++ b/api/src/data_inclusion/api/entrypoints/fastapi.py @@ -278,7 +278,7 @@ def list_services( thematique: Optional[schema.Thematique] = None, departement: Optional[schema.DepartementCOG] = None, departement_slug: Optional[schema.DepartementSlug] = None, - code_insee: Optional[schema.CodeInsee] = None, + code_insee: Optional[schema.CodeCommune] = None, ): query = ( sqla.select(models.Service) @@ -364,7 +364,7 @@ def list_services_endpoint( schema.DepartementSlug | SkipJsonSchema[None], fastapi.Query() ] = None, code_insee: Annotated[ - schema.CodeInsee | SkipJsonSchema[None], fastapi.Query() + schema.CodeCommune | SkipJsonSchema[None], fastapi.Query() ] = None, ): return list_services( @@ -428,24 +428,28 @@ def search_services( query = query.filter( sqla.or_( models.Service.zone_diffusion_type.is_(None), - models.Service.zone_diffusion_type == schema.TypeCOG.PAYS.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.PAYS.value, sqla.and_( - models.Service.zone_diffusion_type == schema.TypeCOG.COMMUNE.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.COMMUNE.value, models.Service.zone_diffusion_code == commune_instance.code, ), sqla.and_( - models.Service.zone_diffusion_type == schema.TypeCOG.EPCI.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.EPCI.value, sqla.literal(commune_instance.siren_epci).contains( models.Service.zone_diffusion_code ), ), sqla.and_( models.Service.zone_diffusion_type - == schema.TypeCOG.DEPARTEMENT.value, + == schema.ZoneDiffusionType.DEPARTEMENT.value, models.Service.zone_diffusion_code == commune_instance.departement, ), sqla.and_( - models.Service.zone_diffusion_type == schema.TypeCOG.REGION.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.REGION.value, models.Service.zone_diffusion_code == commune_instance.region, ), ) @@ -598,7 +602,7 @@ def search_services_endpoint( ), ] = None, code_insee: Annotated[ - schema.CodeInsee | SkipJsonSchema[None], + schema.CodeCommune | SkipJsonSchema[None], fastapi.Query( description="""Code insee de la commune considérée. Si fourni, les résultats inclus également les services proches de cette commune. 
diff --git a/api/src/data_inclusion/api/models.py b/api/src/data_inclusion/api/models.py index c2a7949a1..40fe283b3 100644 --- a/api/src/data_inclusion/api/models.py +++ b/api/src/data_inclusion/api/models.py @@ -112,9 +112,9 @@ class Service(Base): frais = sqla.Column(ARRAY(sqla.Text), default=list) frais_autres = sqla.Column(sqla.Text, nullable=True) profils = sqla.Column(ARRAY(sqla.Text), default=list) - pre_requis = sqla.Column(sqla.Text, nullable=True) + pre_requis = sqla.Column(ARRAY(sqla.Text), default=list) cumulable = sqla.Column(sqla.Boolean, default=False) - justificatifs = sqla.Column(sqla.Text, nullable=True) + justificatifs = sqla.Column(ARRAY(sqla.Text), default=list) formulaire_en_ligne = sqla.Column(sqla.Text, nullable=True) commune = sqla.Column(sqla.Text, nullable=True) code_postal = sqla.Column(sqla.Text, nullable=True) @@ -136,7 +136,9 @@ class Service(Base): date_maj = sqla.Column(sqla.Date(), nullable=True) modes_accueil = sqla.Column(ARRAY(sqla.Text), default=list) modes_orientation_accompagnateur = sqla.Column(ARRAY(sqla.Text), default=list) + modes_orientation_accompagnateur_autres = sqla.Column(sqla.Text, nullable=True) modes_orientation_beneficiaire = sqla.Column(ARRAY(sqla.Text), default=list) + modes_orientation_beneficiaire_autres = sqla.Column(sqla.Text, nullable=True) zone_diffusion_type = sqla.Column(sqla.Text, nullable=True) zone_diffusion_code = sqla.Column(sqla.Text, nullable=True) zone_diffusion_nom = sqla.Column(sqla.Text, nullable=True) diff --git a/api/src/data_inclusion/api/schema.py b/api/src/data_inclusion/api/schema.py index f7f763368..40ace967d 100644 --- a/api/src/data_inclusion/api/schema.py +++ b/api/src/data_inclusion/api/schema.py @@ -1,12 +1,19 @@ from dataclasses import dataclass from datetime import date, datetime from enum import Enum -from typing import Optional, TypeAlias +from typing import Optional from pydantic import BaseModel, ConfigDict, EmailStr, Field, StringConstraints from typing_extensions import Annotated -from data_inclusion.schema.models import ( +from data_inclusion.schema import ( + CodeCommune, + CodeDepartement, + CodeEPCI, + CodePostal, + CodeRegion, + CodeRna, + CodeSiret, Frais, LabelNational, ModeAccueil, @@ -14,9 +21,9 @@ ModeOrientationBeneficiaire, Profil, Thematique, - TypeCOG, Typologie, TypologieService, + ZoneDiffusionType, ) @@ -146,19 +153,14 @@ class _Departement: {k: departement.cog for k, departement in _departements_dict.items()}, ) -CodePostal: TypeAlias = Annotated[ - str, StringConstraints(min_length=5, max_length=5, pattern=r"^\d{5}$") -] -CodeInsee: TypeAlias = Annotated[str, StringConstraints(min_length=5, max_length=5)] - class Service(BaseModel): model_config = ConfigDict(from_attributes=True, populate_by_name=True) # internal metadata - di_geocodage_code_insee: Optional[ - Annotated[str, StringConstraints(min_length=5, max_length=5)] - ] = Field(alias="_di_geocodage_code_insee") + di_geocodage_code_insee: Optional[CodeCommune] = Field( + alias="_di_geocodage_code_insee" + ) di_geocodage_score: Optional[Annotated[float, Field(ge=0, le=1)]] = Field( alias="_di_geocodage_score" ) @@ -178,13 +180,13 @@ class Service(BaseModel): frais: Optional[list[Frais]] = None frais_autres: Optional[str] = None profils: Optional[list[Profil]] = None - pre_requis: Optional[str] = None + pre_requis: Optional[list[str]] = None cumulable: Optional[bool] = None - justificatifs: Optional[str] = None + justificatifs: Optional[list[str]] = None formulaire_en_ligne: Optional[str] = None commune: Optional[str] = None 
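The Code* types imported from data_inclusion.schema above replace inline aliases like the CodePostal/CodeInsee definitions removed here. Their real definitions live in the data-inclusion-schema package; based on the constraints this patch removes, they plausibly look like the following hypothetical reconstruction (patterns inferred, may differ from the library):

    # Sketch only: inferred shapes of the imported aliases.
    from pydantic import StringConstraints
    from typing_extensions import Annotated

    CodePostal = Annotated[str, StringConstraints(pattern=r"^\d{5}$")]
    CodeCommune = Annotated[str, StringConstraints(pattern=r"^\w{5}$")]
    CodeEPCI = Annotated[str, StringConstraints(pattern=r"^\d{9}$")]
    CodeDepartement = Annotated[str, StringConstraints(pattern=r"^\w{2,3}$")]
    CodeRegion = Annotated[str, StringConstraints(pattern=r"^\d{2}$")]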
code_postal: Optional[CodePostal] = None - code_insee: Optional[CodeInsee] = None + code_insee: Optional[CodeCommune] = None adresse: Optional[str] = None complement_adresse: Optional[str] = None longitude: Optional[float] = None @@ -202,13 +204,12 @@ class Service(BaseModel): modes_orientation_accompagnateur: Optional[ list[ModeOrientationAccompagnateur] ] = None + modes_orientation_accompagnateur_autres: Optional[str] = None modes_orientation_beneficiaire: Optional[list[ModeOrientationBeneficiaire]] = None - zone_diffusion_type: Optional[TypeCOG] = None + modes_orientation_beneficiaire_autres: Optional[str] = None + zone_diffusion_type: Optional[ZoneDiffusionType] = None zone_diffusion_code: Optional[ - Annotated[str, StringConstraints(pattern=r"^\w{5}$")] # code commune - | Annotated[str, StringConstraints(pattern=r"^\d{9}$")] # code epci - | Annotated[str, StringConstraints(pattern=r"^\w{2,3}$")] # code departement - | Annotated[str, StringConstraints(pattern=r"^\d{2}$")] # code region + CodeCommune | CodeEPCI | CodeDepartement | CodeRegion ] = None zone_diffusion_nom: Optional[str] = None @@ -217,29 +218,21 @@ class Structure(BaseModel): model_config = ConfigDict(from_attributes=True, populate_by_name=True) # internal metadata - di_geocodage_code_insee: Optional[ - Annotated[str, StringConstraints(min_length=5, max_length=5)] - ] = Field(alias="_di_geocodage_code_insee") + di_geocodage_code_insee: Optional[CodeCommune] = Field( + alias="_di_geocodage_code_insee" + ) di_geocodage_score: Optional[Annotated[float, Field(ge=0, le=1)]] = Field( alias="_di_geocodage_score" ) # structure data id: str - siret: Optional[ - Annotated[ - str, StringConstraints(min_length=14, max_length=14, pattern=r"^\d{14}$") - ] - ] = None - rna: Optional[ - Annotated[ - str, StringConstraints(min_length=10, max_length=10, pattern=r"^W\d{9}$") - ] - ] = None + siret: Optional[CodeSiret] = None + rna: Optional[CodeRna] = None nom: str commune: Optional[str] = None code_postal: Optional[CodePostal] = None - code_insee: Optional[CodeInsee] = None + code_insee: Optional[CodeCommune] = None adresse: Optional[str] = None complement_adresse: Optional[str] = None longitude: Optional[float] = None diff --git a/api/tests/inclusion/factories.py b/api/tests/inclusion/factories.py index c25cb963c..7149d87bd 100644 --- a/api/tests/inclusion/factories.py +++ b/api/tests/inclusion/factories.py @@ -126,9 +126,9 @@ class Meta: ), getter=lambda l: list(map(lambda t: t.value, l)), ) - pre_requis = None + pre_requis = [] cumulable = False - justificatifs = None + justificatifs = [] formulaire_en_ligne = None commune = factory.Faker("city", locale="fr_FR") code_postal = factory.Faker("postcode") @@ -162,12 +162,14 @@ class Meta: [schema.ModeOrientationAccompagnateur.ENVOYER_UN_MAIL.value], ] ) + modes_orientation_accompagnateur_autres = None modes_orientation_beneficiaire = factory.Iterator( [ [schema.ModeOrientationBeneficiaire.TELEPHONER.value], [schema.ModeOrientationBeneficiaire.SE_PRESENTER.value], ] ) + modes_orientation_beneficiaire_autres = None zone_diffusion_type = None zone_diffusion_code = None zone_diffusion_nom = None diff --git a/api/tests/inclusion/test_api.py b/api/tests/inclusion/test_api.py index 3810bc13e..0c47a7357 100644 --- a/api/tests/inclusion/test_api.py +++ b/api/tests/inclusion/test_api.py @@ -2,7 +2,7 @@ import pytest -from data_inclusion.api import schema +from data_inclusion import schema def test_list_structures_unauthenticated(api_client): @@ -329,9 +329,9 @@ def test_list_services_all(api_client, 
service_factory): "frais": ["gratuit", "gratuit-sous-conditions"], "frais_autres": "Camarade il.", "profils": ["femmes", "jeunes-16-26"], - "pre_requis": None, + "pre_requis": [], "cumulable": False, - "justificatifs": None, + "justificatifs": [], "formulaire_en_ligne": None, "commune": "Sainte Jacquelineboeuf", "code_postal": "25454", @@ -351,7 +351,9 @@ def test_list_services_all(api_client, service_factory): "date_maj": "2023-01-01", "modes_accueil": ["a-distance"], "modes_orientation_accompagnateur": ["telephoner"], + "modes_orientation_accompagnateur_autres": None, "modes_orientation_beneficiaire": ["telephoner"], + "modes_orientation_beneficiaire_autres": None, "zone_diffusion_type": None, "zone_diffusion_code": None, "zone_diffusion_nom": None, @@ -813,7 +815,7 @@ def test_search_services_with_zone_diffusion_pays( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.A_DISTANCE.value], - zone_diffusion_type=schema.TypeCOG.PAYS.value, + zone_diffusion_type=schema.ZoneDiffusionType.PAYS.value, zone_diffusion_code=None, zone_diffusion_nom=None, ) @@ -844,7 +846,7 @@ def test_search_services_with_zone_diffusion_commune( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.COMMUNE.value, + zone_diffusion_type=schema.ZoneDiffusionType.COMMUNE.value, zone_diffusion_code="59183", zone_diffusion_nom="Dunkerque", ) @@ -854,7 +856,7 @@ def test_search_services_with_zone_diffusion_commune( latitude=50.633333, longitude=3.066667, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.COMMUNE.value, + zone_diffusion_type=schema.ZoneDiffusionType.COMMUNE.value, zone_diffusion_code="59350", zone_diffusion_nom="Lille", ) @@ -885,7 +887,7 @@ def test_search_services_with_zone_diffusion_epci( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.EPCI.value, + zone_diffusion_type=schema.ZoneDiffusionType.EPCI.value, zone_diffusion_code="245900428", zone_diffusion_nom="CU de Dunkerque", ) @@ -895,7 +897,7 @@ def test_search_services_with_zone_diffusion_epci( latitude=50.633333, longitude=3.066667, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.EPCI.value, + zone_diffusion_type=schema.ZoneDiffusionType.EPCI.value, zone_diffusion_code="200093201", zone_diffusion_nom="Métropole Européenne de Lille", ) @@ -926,7 +928,7 @@ def test_search_services_with_zone_diffusion_departement( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.DEPARTEMENT.value, + zone_diffusion_type=schema.ZoneDiffusionType.DEPARTEMENT.value, zone_diffusion_code="59", zone_diffusion_nom="Nord", ) @@ -936,7 +938,7 @@ def test_search_services_with_zone_diffusion_departement( latitude=50.633333, longitude=3.066667, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.DEPARTEMENT.value, + zone_diffusion_type=schema.ZoneDiffusionType.DEPARTEMENT.value, zone_diffusion_code="62", zone_diffusion_nom="Pas-de-Calais", ) @@ -967,7 +969,7 @@ def test_search_services_with_zone_diffusion_region( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.REGION.value, + zone_diffusion_type=schema.ZoneDiffusionType.REGION.value, zone_diffusion_code="32", zone_diffusion_nom="Nord", ) @@ -977,7 +979,7 @@ 
def test_search_services_with_zone_diffusion_region( latitude=50.277500, longitude=3.973400, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.REGION.value, + zone_diffusion_type=schema.ZoneDiffusionType.REGION.value, zone_diffusion_code="44", zone_diffusion_nom="Grand Est", ) diff --git a/pipeline/dbt/macros/domain/checks/check_service.sql b/pipeline/dbt/macros/domain/checks/check_service.sql index 66431b9dc..d4b2d6978 100644 --- a/pipeline/dbt/macros/domain/checks/check_service.sql +++ b/pipeline/dbt/macros/domain/checks/check_service.sql @@ -11,11 +11,13 @@ CREATE OR REPLACE FUNCTION LIST_SERVICE_ERRORS( frais TEXT[], frais_autres TEXT, id TEXT, - justificatifs TEXT, + justificatifs TEXT[], lien_source TEXT, modes_accueil TEXT[], modes_orientation_accompagnateur TEXT[], + modes_orientation_accompagnateur_autres TEXT, modes_orientation_beneficiaire TEXT[], + modes_orientation_beneficiaire_autres TEXT, nom TEXT, presentation_detail TEXT, presentation_resume TEXT, @@ -30,7 +32,7 @@ CREATE OR REPLACE FUNCTION LIST_SERVICE_ERRORS( zone_diffusion_code TEXT, zone_diffusion_nom TEXT, zone_diffusion_type TEXT, - pre_requis TEXT + pre_requis TEXT[] ) RETURNS TABLE (field TEXT, value TEXT) AS $$ DECLARE @@ -50,7 +52,7 @@ BEGIN ("modes_orientation_accompagnateur", "modes_orientation_accompagnateur IS NULL OR modes_orientation_accompagnateur <@ ARRAY(SELECT m.value FROM " ~ ref('modes_orientation_accompagnateur') ~ "AS m)"), ("modes_orientation_beneficiaire", "modes_orientation_beneficiaire IS NULL OR modes_orientation_beneficiaire <@ ARRAY(SELECT m.value FROM " ~ ref('modes_orientation_beneficiaire') ~ "AS m)"), ("zone_diffusion_code", "zone_diffusion_code IS NULL OR zone_diffusion_code ~ '^(\d{9}|\w{5}|\w{2,3}|\d{2})$'"), - ("zone_diffusion_type", "zone_diffusion_type IS NULL OR zone_diffusion_type IN (SELECT t.value FROM " ~ ref('types_cog') ~ "AS t)"), + ("zone_diffusion_type", "zone_diffusion_type IS NULL OR zone_diffusion_type IN (SELECT t.value FROM " ~ ref('zones_de_diffusion_types') ~ "AS t)"), ] %} @@ -99,7 +101,9 @@ WITH final AS ( lien_source, modes_accueil, modes_orientation_accompagnateur, + modes_orientation_accompagnateur_autres, modes_orientation_beneficiaire, + modes_orientation_beneficiaire_autres, nom, presentation_detail, presentation_resume, diff --git a/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql b/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql index 308340b26..6e35397a0 100644 --- a/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql +++ b/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql @@ -56,7 +56,6 @@ final AS ( structures.courriel AS "courriel", NULL AS "formulaire_en_ligne", NULL AS "frais_autres", - NULL AS "justificatifs", services.attributes__title AS "nom", services.attributes__field_titre_card_employeur AS "presentation_resume", NULL AS "prise_rdv", @@ -67,7 +66,10 @@ final AS ( regions."REG" AS "zone_diffusion_code", regions."LIBELLE" AS "zone_diffusion_nom", 'region' AS "zone_diffusion_type", - NULL AS "pre_requis", + NULL AS "modes_orientation_accompagnateur_autres", + NULL AS "modes_orientation_beneficiaire_autres", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", CAST(NULL AS BOOLEAN) AS "cumulable", CAST(NULL AS DATE) AS "date_suspension", 'https://www.agefiph.fr' || services.attributes__path__alias AS "lien_source", diff --git 
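The LIST_SERVICE_ERRORS checks updated above validate the enumerated array fields with PostgreSQL's containment operator; for reference:

    -- `<@` means "is contained by": every element of the left array
    -- must appear in the right array.
    SELECT ARRAY['telephoner'] <@ ARRAY['telephoner', 'envoyer-un-mail'];  -- true
    SELECT ARRAY['autre'] <@ ARRAY['telephoner', 'envoyer-un-mail'];       -- false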
a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql index 7a6436a58..0785071a0 100644 --- a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql +++ b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql @@ -31,7 +31,9 @@ final AS ( NULL AS "lien_source", -- ignored modes_accueil AS "modes_accueil", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", nom AS "nom", presentation_resume AS "presentation_resume", presentation_detail AS "presentation_detail", diff --git a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql index 2a08c81cb..006009097 100644 --- a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql +++ b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql @@ -31,7 +31,9 @@ final AS ( lien_source AS "lien_source", modes_accueil AS "modes_accueil", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", nom AS "nom", presentation_resume AS "presentation_resume", presentation_detail AS "presentation_detail", diff --git a/pipeline/dbt/models/intermediate/int__union_services.sql b/pipeline/dbt/models/intermediate/int__union_services.sql index c9a5f4bee..9fc05d4b4 100644 --- a/pipeline/dbt/models/intermediate/int__union_services.sql +++ b/pipeline/dbt/models/intermediate/int__union_services.sql @@ -17,9 +17,11 @@ WITH services AS ( "date_maj": "DATE", "date_suspension": "DATE", "frais": "TEXT[]", + "justificatifs": "TEXT[]", "modes_accueil": "TEXT[]", "modes_orientation_accompagnateur": "TEXT[]", "modes_orientation_beneficiaire": "TEXT[]", + "pre_requis": "TEXT[]", "profils": "TEXT[]", "thematiques": "TEXT[]", "types": "TEXT[]", diff --git a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql index f26b0dde5..7ad393e4d 100644 --- a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql +++ b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql @@ -52,7 +52,9 @@ valid_services AS ( lien_source, modes_accueil, modes_orientation_accompagnateur, + modes_orientation_accompagnateur_autres, modes_orientation_beneficiaire, + modes_orientation_beneficiaire_autres, nom, presentation_detail, presentation_resume, diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql index 91a4f742f..a6e828a02 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql +++ b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql @@ -111,9 +111,9 @@ final AS ( services.structure_id AS "structure_id", services.thematiques AS "thematiques", services._di_source_id AS "source", - NULL AS "pre_requis", + CAST(NULL AS TEXT []) AS "pre_requis", CAST(NULL AS BOOLEAN) AS "cumulable", - NULL AS "justificatifs", + CAST(NULL AS TEXT []) AS "justificatifs", NULL AS "formulaire_en_ligne", NULL AS "recurrence", CAST(NULL AS DATE) AS 
"date_creation", @@ -134,7 +134,9 @@ final AS ( ], NULL ) AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", ARRAY_REMOVE(ARRAY[CASE WHEN structures.telephone IS NOT NULL THEN 'telephoner' END], NULL) AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", CAST(NULL AS TEXT) AS "frais_autres", CASE WHEN CARDINALITY(services.types) > 0 THEN services.types ELSE ARRAY['accompagnement'] END AS "types", ARRAY['en-presentiel'] AS "modes_accueil", diff --git a/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql b/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql index 64e7a17de..f2ce461ed 100644 --- a/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql +++ b/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql @@ -26,9 +26,7 @@ final AS ( NULL::TEXT [] AS "profils", id AS "structure_id", _di_source_id AS "source", - NULL AS "pre_requis", TRUE AS "cumulable", - NULL AS "justificatifs", NULL AS "formulaire_en_ligne", details_infos_pratiques_jour_horaire AS "recurrence", NULL::DATE AS "date_creation", @@ -39,10 +37,14 @@ final AS ( NULL AS "contact_nom_prenom", derniere_modif_date AS "date_maj", 'commune' AS "zone_diffusion_type", - NULL AS "zone_diffusion_code", -- will be overridden after geocoding - NULL AS "zone_diffusion_nom", -- will be overridden after geocoding - NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", + NULL::TEXT [] AS "modes_orientation_accompagnateur", -- will be overridden after geocoding + NULL AS "modes_orientation_accompagnateur_autres", -- will be overridden after geocoding NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", + NULL::TEXT [] AS "pre_requis", + NULL::TEXT [] AS "justificatifs", CASE WHEN avip THEN 'Crèche À Vocation d''Insertion Professionnelle' ELSE nom END AS "nom", ARRAY['payant'] AS "frais", ARRAY['famille--garde-denfants'] AS "thematiques", diff --git a/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql b/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql index 2c6885131..c992235d9 100644 --- a/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql +++ b/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql @@ -24,9 +24,9 @@ final AS ( NULL AS "prise_rdv", NULL::TEXT [] AS "frais", NULL AS "frais_autres", - NULL AS "pre_requis", + NULL::TEXT [] AS "pre_requis", NULL::BOOLEAN AS "cumulable", - NULL AS "justificatifs", + NULL::TEXT [] AS "justificatifs", NULL AS "formulaire_en_ligne", NULL AS "recurrence", NULL::DATE AS "date_creation", @@ -39,7 +39,9 @@ final AS ( date_derniere_modif AS "date_maj", NULL::TEXT [] AS "modes_accueil", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", zone_diffusion_code AS "zone_diffusion_code", zone_diffusion_type AS "zone_diffusion_type", zone_diffusion_libelle AS "zone_diffusion_nom", diff --git a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql index 550f1cc77..5f0d965b0 100644 --- a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql +++ b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql @@ -117,9 +117,9 @@ final AS ( 
NULL::TEXT [] AS "frais", NULL AS "frais_autres", NULL::TEXT [] AS "profils", - NULL AS "pre_requis", + NULL::TEXT [] AS "pre_requis", TRUE AS "cumulable", - NULL AS "justificatifs", + NULL::TEXT [] AS "justificatifs", NULL::DATE AS "date_creation", NULL::DATE AS "date_suspension", filtered_phones.phone_number AS "telephone", @@ -133,7 +133,9 @@ final AS ( NULL AS "formulaire_en_ligne", open_services.lieu_id AS "structure_id", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", ( SELECT di_thematique_by_soliguide_categorie_code.thematique FROM di_thematique_by_soliguide_categorie_code diff --git a/pipeline/dbt/models/marts/api/_api_models.yml b/pipeline/dbt/models/marts/api/_api_models.yml index f0f532ded..2c9052dab 100644 --- a/pipeline/dbt/models/marts/api/_api_models.yml +++ b/pipeline/dbt/models/marts/api/_api_models.yml @@ -143,11 +143,11 @@ models: - name: profils data_type: text[] - name: pre_requis - data_type: text + data_type: text[] - name: cumulable data_type: boolean - name: justificatifs - data_type: text + data_type: text[] - name: formulaire_en_ligne data_type: text - name: commune @@ -192,8 +192,12 @@ models: data_type: text[] - name: modes_orientation_accompagnateur data_type: text[] + - name: modes_orientation_accompagnateur_autres + data_type: text - name: modes_orientation_beneficiaire data_type: text[] + - name: modes_orientation_beneficiaire_autres + data_type: text - name: zone_diffusion_type data_type: text - name: zone_diffusion_code diff --git a/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql b/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql index e3f6ef3fa..4f6fee093 100644 --- a/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql +++ b/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql @@ -16,8 +16,8 @@ final AS ( ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", - NULLIF(TRIM(data ->> 'justificatifs'), '') AS "justificatifs", - NULLIF(TRIM(data ->> 'pre_requis'), '') AS "pre_requis", + STRING_TO_ARRAY(NULLIF(TRIM(data ->> 'justificatifs'), ''), ',') AS "justificatifs", + STRING_TO_ARRAY(NULLIF(TRIM(data ->> 'pre_requis'), ''), ',') AS "pre_requis", data ->> 'adresse' AS "adresse", data ->> 'code_insee' AS "code_insee", data ->> 'code_postal' AS "code_postal", diff --git a/pipeline/dbt/models/staging/dora/stg_dora__services.sql b/pipeline/dbt/models/staging/dora/stg_dora__services.sql index 4f5e8c3e4..96a31ccf3 100644 --- a/pipeline/dbt/models/staging/dora/stg_dora__services.sql +++ b/pipeline/dbt/models/staging/dora/stg_dora__services.sql @@ -20,8 +20,8 @@ services AS ( ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", - NULLIF(TRIM(data ->> 'justificatifs'), '') AS "justificatifs", - NULLIF(TRIM(data ->> 'pre_requis'), '') AS "pre_requis", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs'))::TEXT [] AS "justificatifs", + ARRAY(SELECT * FROM 
JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis'))::TEXT [] AS "pre_requis", data ->> 'adresse' AS "adresse", data ->> 'code_insee' AS "code_insee", data ->> 'code_postal' AS "code_postal", diff --git a/pipeline/dbt/seeds/schema/labels_nationaux.csv b/pipeline/dbt/seeds/schema/labels_nationaux.csv index 5ef4eb7f0..e432d1c86 100644 --- a/pipeline/dbt/seeds/schema/labels_nationaux.csv +++ b/pipeline/dbt/seeds/schema/labels_nationaux.csv @@ -34,6 +34,7 @@ emmaus,Emmaus, envie,Envie, epide,EPIDE, espace-emploi-agric-arrco,Espace Emploi Agirc Arrco, +etcld,Expérimentation territoriale contre le chômage de longue durée, fabrique-de-territoire,Fabrique de Territoire, face,Fondation FACE, fede-pro-fem,Federation Professionnelle Pour les Femmes, diff --git a/pipeline/dbt/seeds/schema/profils.csv b/pipeline/dbt/seeds/schema/profils.csv index 19c25b405..11de061b9 100644 --- a/pipeline/dbt/seeds/schema/profils.csv +++ b/pipeline/dbt/seeds/schema/profils.csv @@ -1,8 +1,8 @@ value,label,description adultes,Adultes, -beneficiaire-rsa,Bénéficiaire du Revenu de Solidarité Active (RSA), +beneficiaires-rsa,Bénéficiaires du Revenu de Solidarité Active (RSA), deficience-visuelle,Déficience visuelle, -demandeur-demploi,Demandeur ou demandeuse d’emploi, +demandeurs-demploi,Demandeurs ou demandeuses d’emploi, familles-enfants,Familles/enfants, femmes,Femmes,Le lieu propose des accompagnements réservés aux femmes. handicaps-mentaux,Handicaps mentaux : déficiences limitant les activités d’une personne, diff --git a/pipeline/dbt/seeds/schema/thematiques.csv b/pipeline/dbt/seeds/schema/thematiques.csv index 674f64cb0..015227a50 100644 --- a/pipeline/dbt/seeds/schema/thematiques.csv +++ b/pipeline/dbt/seeds/schema/thematiques.csv @@ -3,7 +3,7 @@ acces-aux-droits-et-citoyennete,Accès aux droits & citoyenneté, acces-aux-droits-et-citoyennete--accompagnement-dans-les-demarches-administratives,Accompagnement dans les démarches administratives, acces-aux-droits-et-citoyennete--accompagnement-juridique,Accompagnement juridique, acces-aux-droits-et-citoyennete--aide-aux-victimes,Aide aux victimes, -acces-aux-droits-et-citoyennete--connaitre-ses-droits,Connaitre ses droits, +acces-aux-droits-et-citoyennete--connaitre-ses-droits,Connaître ses droits, acces-aux-droits-et-citoyennete--demandeurs-dasile-et-naturalisation,Demandeurs d’asile et naturalisation, acces-aux-droits-et-citoyennete--developpement-durable,Développement durable, acces-aux-droits-et-citoyennete--faciliter-laction-citoyenne,Faciliter l’action citoyenne, @@ -26,11 +26,11 @@ creation-activite--developper-son-entreprise,Développer son entreprise, creation-activite--financer-son-projet,Financer son projet, creation-activite--reseautage-pour-createurs-dentreprise,Réseautage pour créateurs d’entreprise, creation-activite--structurer-son-projet-de-creation-dentreprise,Structurer son projet de création d’entreprise, -equipement-et-alimentation,Equipement et alimentation, +equipement-et-alimentation,Équipement et alimentation, equipement-et-alimentation--acces-a-du-materiel-informatique,Accès à du matériel informatique, equipement-et-alimentation--acces-a-un-telephone-et-un-abonnement,Accès à un téléphone et un abonnement, equipement-et-alimentation--alimentation,Alimentation, -equipement-et-alimentation--electromenager,Electroménager, +equipement-et-alimentation--electromenager,Électroménager, equipement-et-alimentation--habillement,Habillement, famille,Famille, famille--accompagnement-femme-enceinte-bebe-jeune-enfant,"Accompagnement femme enceinte, bébé, jeune 
enfant", @@ -53,7 +53,7 @@ handicap--accompagnement-par-une-structure-specialisee,Accompagnement par une st handicap--adaptation-au-poste-de-travail,Adaptation au poste de travail, handicap--adapter-son-logement,Adapter son logement, handicap--connaissance-des-droits-des-travailleurs,Connaissance des droits des travailleurs, -handicap--faire-reconnaitre-un-handicap,Faire reconnaitre un handicap, +handicap--faire-reconnaitre-un-handicap,Faire reconnaître un handicap, handicap--favoriser-le-retour-et-le-maintien-dans-lemploi,Favoriser le retour et le maintien dans l’emploi, handicap--gerer-le-depart-a-la-retraite-des-personnes-en-situation-de-handicap,Gérer le départ à la retraite des personnes en situation de handicap, handicap--mobilite-des-personnes-en-situation-de-handicap,Mobilité des personnes en situation de handicap, @@ -72,7 +72,7 @@ logement-hebergement,Logement et hébergement, logement-hebergement--besoin-dadapter-mon-logement,Besoin d’adapter mon logement, logement-hebergement--connaissance-de-ses-droits-et-interlocuteurs,Connaissance de ses droits et interlocuteurs, logement-hebergement--demenagement,Déménagement, -logement-hebergement--etre-accompagne-pour-se-loger,Etre accompagné(e) pour se loger, +logement-hebergement--etre-accompagne-pour-se-loger,Être accompagné(e) pour se loger, logement-hebergement--gerer-son-budget,Gérer son budget, logement-hebergement--mal-loges-sans-logis,Mal logé/sans logis, logement-hebergement--probleme-avec-son-logement,Problème avec son logement, @@ -84,10 +84,10 @@ mobilite--aides-a-la-reprise-demploi-ou-a-la-formation,Aides à la reprise d’e mobilite--apprendre-a-utiliser-un-deux-roues,Apprendre à utiliser un deux roues, mobilite--comprendre-et-utiliser-les-transports-en-commun,Comprendre et utiliser les transports en commun, mobilite--entretenir-reparer-son-vehicule,Entretenir ou réparer son véhicule, -mobilite--etre-accompagne-dans-son-parcours-mobilite,Etre accompagné(e) dans son parcours mobilité, +mobilite--etre-accompagne-dans-son-parcours-mobilite,Être accompagné(e) dans son parcours mobilité, mobilite--financer-mon-projet-mobilite,Financer mon projet mobilité, mobilite--louer-un-vehicule,"Louer un véhicule (voiture, vélo, scooter..)", -mobilite--preparer-son-permis-de-conduire-se-reentrainer-a-la-conduite,"Préparer son permis de conduire, se réentrainer à la conduite", +mobilite--preparer-son-permis-de-conduire-se-reentrainer-a-la-conduite,"Préparer son permis de conduire, se réentraîner à la conduite", numerique,Numérique, numerique--acceder-a-du-materiel,Accéder à du matériel, numerique--acceder-a-une-connexion-internet,Accéder à une connexion internet, diff --git a/pipeline/dbt/seeds/schema/types_cog.csv b/pipeline/dbt/seeds/schema/zones_de_diffusion_types.csv similarity index 100% rename from pipeline/dbt/seeds/schema/types_cog.csv rename to pipeline/dbt/seeds/schema/zones_de_diffusion_types.csv From 68bdae5b72bb313a976a97ed04beed36852d6a50 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 12 Sep 2023 11:55:37 +0200 Subject: [PATCH 22/34] test(dbt): fix dora staging --- pipeline/dbt/models/staging/dora/_dora__models.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipeline/dbt/models/staging/dora/_dora__models.yml b/pipeline/dbt/models/staging/dora/_dora__models.yml index f0a5d4450..eec3408b4 100644 --- a/pipeline/dbt/models/staging/dora/_dora__models.yml +++ b/pipeline/dbt/models/staging/dora/_dora__models.yml @@ -87,7 +87,6 @@ models: - name: justificatifs tests: - dbt_utils.at_least_one - - dbt_utils.not_empty_string 
- name: latitude tests: - dbt_utils.at_least_one @@ -109,7 +108,6 @@ models: - name: pre_requis tests: - dbt_utils.at_least_one - - dbt_utils.not_empty_string - name: presentation_resume tests: - not_null From b3c7c2fe83bbfbdd2809174692c785278d28cb9c Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 12 Sep 2023 12:40:37 +0200 Subject: [PATCH 23/34] fix(api): v0.10.0 migration --- .../alembic/versions/06e3e22e0541_v0_10_0.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/api/src/alembic/versions/06e3e22e0541_v0_10_0.py b/api/src/alembic/versions/06e3e22e0541_v0_10_0.py index dc3b1a4bf..c6c51f5ca 100644 --- a/api/src/alembic/versions/06e3e22e0541_v0_10_0.py +++ b/api/src/alembic/versions/06e3e22e0541_v0_10_0.py @@ -17,15 +17,14 @@ depends_on = None +def column_exists(table_name, column_name): + bind = op.get_context().bind + insp = sa.inspect(bind) + columns = insp.get_columns(table_name) + return any(c["name"] == column_name for c in columns) + + def upgrade() -> None: - op.add_column( - "service", - sa.Column("modes_orientation_accompagnateur_autres", sa.Text(), nullable=True), - ) - op.add_column( - "service", - sa.Column("modes_orientation_beneficiaire_autres", sa.Text(), nullable=True), - ) op.drop_column("service", "pre_requis") op.drop_column("service", "justificatifs") op.add_column( @@ -37,6 +36,22 @@ def upgrade() -> None: sa.Column("justificatifs", postgresql.ARRAY(sa.Text()), nullable=True), ) + # these columns might have already been created by dbt + if not column_exists("service", "modes_orientation_accompagnateur_autres"): + op.add_column( + "service", + sa.Column( + "modes_orientation_accompagnateur_autres", sa.Text(), nullable=True + ), + ) + if not column_exists("service", "modes_orientation_beneficiaire_autres"): + op.add_column( + "service", + sa.Column( + "modes_orientation_beneficiaire_autres", sa.Text(), nullable=True + ), + ) + def downgrade() -> None: pass From 77dccfa12c6e5d431ea9609e67565aacae559080 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 12 Sep 2023 13:28:23 +0200 Subject: [PATCH 24/34] feat(dora): use modes_orientation_* --- .../intermediate/dora/int_dora__services.sql | 68 ++++++++-------- .../staging/dora/stg_dora__services.sql | 80 ++++++++++--------- 2 files changed, 76 insertions(+), 72 deletions(-) diff --git a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql index 006009097..75a5bc042 100644 --- a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql +++ b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql @@ -16,44 +16,44 @@ di_profil_by_dora_profil AS ( final AS ( SELECT - id AS "adresse_id", - contact_public AS "contact_public", - NULL AS "contact_nom_prenom", -- ignored for now - NULL AS "courriel", -- ignored for now - cumulable AS "cumulable", - date_creation::DATE AS "date_creation", - date_maj::DATE AS "date_maj", - date_suspension::DATE AS "date_suspension", - formulaire_en_ligne AS "formulaire_en_ligne", - frais_autres AS "frais_autres", - id AS "id", - justificatifs AS "justificatifs", - lien_source AS "lien_source", - modes_accueil AS "modes_accueil", - NULL::TEXT [] AS "modes_orientation_accompagnateur", - NULL AS "modes_orientation_accompagnateur_autres", - NULL::TEXT [] AS "modes_orientation_beneficiaire", - NULL AS "modes_orientation_beneficiaire_autres", - nom AS "nom", - presentation_resume AS "presentation_resume", - presentation_detail AS "presentation_detail", - prise_rdv AS 
"prise_rdv", + id AS "adresse_id", + contact_public AS "contact_public", + NULL AS "contact_nom_prenom", -- ignored for now + NULL AS "courriel", -- ignored for now + cumulable AS "cumulable", + date_creation::DATE AS "date_creation", + date_maj::DATE AS "date_maj", + date_suspension::DATE AS "date_suspension", + formulaire_en_ligne AS "formulaire_en_ligne", + frais_autres AS "frais_autres", + id AS "id", + justificatifs AS "justificatifs", + lien_source AS "lien_source", + modes_accueil AS "modes_accueil", + modes_orientation_accompagnateur AS "modes_orientation_accompagnateur", + modes_orientation_accompagnateur_autres AS "modes_orientation_accompagnateur_autres", + modes_orientation_beneficiaire AS "modes_orientation_beneficiaire", + modes_orientation_beneficiaire_autres AS "modes_orientation_beneficiaire_autres", + nom AS "nom", + presentation_resume AS "presentation_resume", + presentation_detail AS "presentation_detail", + prise_rdv AS "prise_rdv", ARRAY( SELECT di_profil_by_dora_profil.di_profil FROM di_profil_by_dora_profil WHERE di_profil_by_dora_profil.dora_profil = ANY(services.profils) - )::TEXT [] AS "profils", - recurrence AS "recurrence", - _di_source_id AS "source", - structure_id AS "structure_id", - NULL AS "telephone", -- ignored for now - thematiques AS "thematiques", - types AS "types", - zone_diffusion_code AS "zone_diffusion_code", - zone_diffusion_nom AS "zone_diffusion_nom", - zone_diffusion_type AS "zone_diffusion_type", - pre_requis AS "pre_requis", - ARRAY[frais] AS "frais" + )::TEXT [] AS "profils", + recurrence AS "recurrence", + _di_source_id AS "source", + structure_id AS "structure_id", + NULL AS "telephone", -- ignored for now + thematiques AS "thematiques", + types AS "types", + zone_diffusion_code AS "zone_diffusion_code", + zone_diffusion_nom AS "zone_diffusion_nom", + zone_diffusion_type AS "zone_diffusion_type", + pre_requis AS "pre_requis", + ARRAY[frais] AS "frais" FROM services ) diff --git a/pipeline/dbt/models/staging/dora/stg_dora__services.sql b/pipeline/dbt/models/staging/dora/stg_dora__services.sql index 96a31ccf3..fa196b6dc 100644 --- a/pipeline/dbt/models/staging/dora/stg_dora__services.sql +++ b/pipeline/dbt/models/staging/dora/stg_dora__services.sql @@ -8,44 +8,48 @@ structures AS ( services AS ( SELECT - _di_source_id AS "_di_source_id", - (data ->> 'contact_public')::BOOLEAN AS "contact_public", - (data ->> 'cumulable')::BOOLEAN AS "cumulable", - (data ->> 'date_creation')::TIMESTAMP WITH TIME ZONE AS "date_creation", - (data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj", - (data ->> 'date_suspension')::TIMESTAMP WITH TIME ZONE AS "date_suspension", - (data ->> 'latitude')::FLOAT AS "latitude", - (data ->> 'longitude')::FLOAT AS "longitude", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil'))::TEXT [] AS "modes_accueil", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs'))::TEXT [] AS "justificatifs", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis'))::TEXT [] AS "pre_requis", - data ->> 'adresse' AS "adresse", - data ->> 'code_insee' AS "code_insee", - data ->> 'code_postal' AS "code_postal", - data ->> 'commune' AS "commune", - data ->> 'complement_adresse' AS "complement_adresse", - 
NULLIF(TRIM(data ->> 'contact_nom'), '') AS "contact_nom", - NULLIF(TRIM(data ->> 'contact_prenom'), '') AS "contact_prenom", - NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", - data ->> 'formulaire_en_ligne' AS "formulaire_en_ligne", - data ->> 'frais_autres' AS "frais_autres", - data ->> 'frais' AS "frais", - data ->> 'id' AS "id", - data ->> 'lien_source' AS "lien_source", - data ->> 'nom' AS "nom", - data ->> 'presentation_resume' AS "presentation_resume", - data ->> 'presentation_detail' AS "presentation_detail", - data ->> 'prise_rdv' AS "prise_rdv", - data ->> 'recurrence' AS "recurrence", - data ->> 'source' AS "source", - data ->> 'structure_id' AS "structure_id", - NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", - NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code", - NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom", - data ->> 'zone_diffusion_type' AS "zone_diffusion_type" + _di_source_id AS "_di_source_id", + (data ->> 'contact_public')::BOOLEAN AS "contact_public", + (data ->> 'cumulable')::BOOLEAN AS "cumulable", + (data ->> 'date_creation')::TIMESTAMP WITH TIME ZONE AS "date_creation", + (data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj", + (data ->> 'date_suspension')::TIMESTAMP WITH TIME ZONE AS "date_suspension", + (data ->> 'latitude')::FLOAT AS "latitude", + (data ->> 'longitude')::FLOAT AS "longitude", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil'))::TEXT [] AS "modes_accueil", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs'))::TEXT [] AS "justificatifs", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis'))::TEXT [] AS "pre_requis", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_accompagnateur'))::TEXT [] AS "modes_orientation_accompagnateur", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_beneficiaire'))::TEXT [] AS "modes_orientation_beneficiaire", + data ->> 'modes_orientation_accompagnateur_autres' AS "modes_orientation_accompagnateur_autres", + data ->> 'modes_orientation_beneficiaire_autres' AS "modes_orientation_beneficiaire_autres", + data ->> 'adresse' AS "adresse", + data ->> 'code_insee' AS "code_insee", + data ->> 'code_postal' AS "code_postal", + data ->> 'commune' AS "commune", + data ->> 'complement_adresse' AS "complement_adresse", + NULLIF(TRIM(data ->> 'contact_nom'), '') AS "contact_nom", + NULLIF(TRIM(data ->> 'contact_prenom'), '') AS "contact_prenom", + NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", + data ->> 'formulaire_en_ligne' AS "formulaire_en_ligne", + data ->> 'frais_autres' AS "frais_autres", + data ->> 'frais' AS "frais", + data ->> 'id' AS "id", + data ->> 'lien_source' AS "lien_source", + data ->> 'nom' AS "nom", + data ->> 'presentation_resume' AS "presentation_resume", + data ->> 'presentation_detail' AS "presentation_detail", + data ->> 'prise_rdv' AS "prise_rdv", + data ->> 'recurrence' AS "recurrence", + data ->> 'source' AS "source", + data ->> 'structure_id' AS "structure_id", + NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", + NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code", + NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS 
"zone_diffusion_nom", + data ->> 'zone_diffusion_type' AS "zone_diffusion_type" FROM source ), From 798aa900d27b9264d596f4344cf55701300eb830 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Wed, 13 Sep 2023 11:28:34 +0200 Subject: [PATCH 25/34] refactor: use dl url as extract input --- .template.env | 4 +- analyse/notebooks/grist/template.ipynb | 199 ++++++++++++++++++ docker-compose.yml | 4 +- pipeline/dags/dags/settings.py | 31 +-- pipeline/dags/import_sources.py | 5 +- .../src/data_inclusion/scripts/tasks/grist.py | 119 +++++++++++ 6 files changed, 346 insertions(+), 16 deletions(-) create mode 100644 analyse/notebooks/grist/template.ipynb create mode 100644 pipeline/src/data_inclusion/scripts/tasks/grist.py diff --git a/.template.env b/.template.env index cd9c1f8d6..b55c2f623 100644 --- a/.template.env +++ b/.template.env @@ -42,7 +42,8 @@ AGEFIPH_STRUCTURES_FILE_URL= AIRFLOW_CONN_S3_SOURCES= BAN_API_URL=https://api-adresse.data.gouv.fr CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/8b781e9d-e11d-486c-98cf-0f63abfae8ed/download/annuaire_sociale_fixe.csv -CD72_FILE_URL= +CD72_STRUCTURES_FILE_URL=https://grist.incubateur.net/o/datainclusion/api/docs/dFpXXzs2fug9Kb7zZhyWyn/download/csv?tableId=Structures +CD72_SERVICES_FILE_URL=https://grist.incubateur.net/o/datainclusion/api/docs/dFpXXzs2fug9Kb7zZhyWyn/download/csv?tableId=Services DI_EXTRA_SERVICES_FILE_URL=https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/data-inclusion/2023-08-16/services.json DI_EXTRA_STRUCTURES_FILE_URL=https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/data-inclusion/2023-08-16/structures.json DORA_API_TOKEN= @@ -51,6 +52,7 @@ EMPLOIS_API_TOKEN= EMPLOIS_API_URL=https://emplois.inclusion.beta.gouv.fr/api/v1/structures/ ETAB_PUB_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/73302880-e4df-4d4c-8676-1a61bb997f3d FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c894fcfdfd45 +GRIST_API_TOKEN= IGN_ADMIN_EXPRESS_FILE_URL=http://files.opendatarchives.fr/professionnels.ign.fr/adminexpress/ADMIN-EXPRESS-COG_3-0__SHP__FRA_WM_2021-05-19.7z IMMERSION_FACILITEE_S3_KEY_PREFIX=sources/immersion-facilitee/2023-03-06/after-siretisation-auto/ INSEE_FIRSTNAME_FILE_URL=https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip diff --git a/analyse/notebooks/grist/template.ipynb b/analyse/notebooks/grist/template.ipynb new file mode 100644 index 000000000..71d160ede --- /dev/null +++ b/analyse/notebooks/grist/template.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -e ../../../pipeline\n", + "%pip install -e ../../../../data-inclusion-schema\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import dotenv\n", + "import pandas as pd\n", + "\n", + "from data_inclusion.scripts.tasks import grist\n", + "from data_inclusion import schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dotenv.load_dotenv(dotenv.find_dotenv())\n", + "\n", + "GRIST_API_TOKEN = os.environ[\"GRIST_API_TOKEN\"]\n", + "GRIST_API_URL = \"https://grist.incubateur.net/api\"\n", + "WORKSPACE_ID = \"27\"\n", + "DOCUMENT_NAME = \"template\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grist_client = 
grist.GristClient(base_url=GRIST_API_URL, token=GRIST_API_TOKEN)\n",
+    "\n",
+    "document_id = grist_client.create_document(\n",
+    "    workspace_id=WORKSPACE_ID, document_name=DOCUMENT_NAME\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for referentiel in [\n",
+    "    \"frais\",\n",
+    "    \"labels_nationaux\",\n",
+    "    \"modes_accueil\",\n",
+    "    \"modes_orientation_accompagnateur\",\n",
+    "    \"modes_orientation_beneficiaire\",\n",
+    "    \"profils\",\n",
+    "    \"thematiques\",\n",
+    "    \"typologies_de_services\",\n",
+    "    \"typologies_de_structures\",\n",
+    "    \"zones_de_diffusion_types\",\n",
+    "]:\n",
+    "    table_id = grist_client.create_table(\n",
+    "        document_id=document_id,\n",
+    "        table_name=referentiel.capitalize(),\n",
+    "        columns=[\n",
+    "            {\"id\": \"value\", \"fields\": {\"label\": \"valeur\", \"type\": \"Text\"}},\n",
+    "            {\"id\": \"label\", \"fields\": {\"label\": \"label\", \"type\": \"Text\"}},\n",
+    "        ],\n",
+    "    )\n",
+    "\n",
+    "    referentiel_df = pd.read_csv(\n",
+    "        f\"../../../pipeline/dbt/seeds/schema/{referentiel}.csv\",\n",
+    "        dtype=str,\n",
+    "    )\n",
+    "\n",
+    "    # attention: pas idempotent\n",
+    "\n",
+    "    grist_client.add_records(\n",
+    "        document_id=document_id,\n",
+    "        table_id=table_id,\n",
+    "        records=[\n",
+    "            {\"fields\": value_dict}\n",
+    "            for value_dict in referentiel_df[[\"value\", \"label\"]].to_dict(\n",
+    "                orient=\"records\"\n",
+    "            )\n",
+    "        ],\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "\n",
+    "def get_column_type(field) -> str:\n",
+    "    # map a pydantic field annotation to a grist column type\n",
+    "    match_referentiel = re.search(\n",
+    "        r\"data_inclusion.schema.(?P<referentiel>\w+)\", str(field.annotation)\n",
+    "    )\n",
+    "\n",
+    "    if match_referentiel is not None:\n",
+    "        return \"Ref:\" + match_referentiel.group(\"referentiel\").capitalize()\n",
+    "    elif \"float\" in str(field.annotation):\n",
+    "        return \"Numeric\"\n",
+    "    elif \"bool\" in str(field.annotation):\n",
+    "        return \"Bool\"\n",
+    "    elif \"date\" in str(field.annotation):\n",
+    "        return \"DateTime:Europe/Paris\"\n",
+    "\n",
+    "    return \"Text\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grist_columns = [\n",
+    "    {\n",
+    "        \"id\": field_name,\n",
+    "        \"fields\": {\n",
+    "            \"label\": field_name,\n",
+    "            \"type\": get_column_type(field_info),\n",
+    "            # \"visibleCol\": TODO\n",
+    "        },\n",
+    "    }\n",
+    "    for field_name, field_info in schema.Structure.model_fields.items()\n",
+    "]\n",
+    "\n",
+    "grist_client.create_table(\n",
+    "    document_id=document_id,\n",
+    "    table_name=\"Structures\",\n",
+    "    columns=grist_columns,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grist_columns = [\n",
+    "    {\n",
+    "        \"id\": field_name,\n",
+    "        \"fields\": {\n",
+    "            \"label\": field_name,\n",
+    "            \"type\": get_column_type(field_info),\n",
+    "            # \"visibleCol\": TODO\n",
+    "        },\n",
+    "    }\n",
+    "    for field_name, field_info in schema.Service.model_fields.items()\n",
+    "]\n",
+    "\n",
+    "grist_client.create_table(\n",
+    "    document_id=document_id,\n",
+    "    table_name=\"Services\",\n",
+    "    columns=grist_columns,\n",
+    ")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+ "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docker-compose.yml b/docker-compose.yml index 0c5ebbcc1..14050c125 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,7 +27,8 @@ x-airflow-common: AIRFLOW_VAR_DBT_PROJECT_DIR: /opt/airflow/dbt AIRFLOW_VAR_BAN_API_URL: ${BAN_API_URL} AIRFLOW_VAR_CD35_FILE_URL: ${CD35_FILE_URL} - AIRFLOW_VAR_CD72_FILE_URL: ${CD72_FILE_URL} + AIRFLOW_VAR_CD72_STRUCTURES_FILE_URL: ${CD72_STRUCTURES_FILE_URL} + AIRFLOW_VAR_CD72_SERVICES_FILE_URL: ${CD72_SERVICES_FILE_URL} AIRFLOW_VAR_DATAGOUV_API_KEY: ${DATAGOUV_API_KEY} AIRFLOW_VAR_DATAGOUV_API_URL: ${DATAGOUV_API_URL} AIRFLOW_VAR_DATAGOUV_DI_DATASET_ID: ${DATAGOUV_DI_DATASET_ID} @@ -40,6 +41,7 @@ x-airflow-common: AIRFLOW_VAR_EMPLOIS_API_URL: ${EMPLOIS_API_URL} AIRFLOW_VAR_ETAB_PUB_FILE_URL: ${ETAB_PUB_FILE_URL} AIRFLOW_VAR_FINESS_FILE_URL: ${FINESS_FILE_URL} + AIRFLOW_VAR_GRIST_API_TOKEN: ${GRIST_API_TOKEN} AIRFLOW_VAR_IGN_ADMIN_EXPRESS_FILE_URL: ${IGN_ADMIN_EXPRESS_FILE_URL} AIRFLOW_VAR_IMMERSION_FACILITEE_S3_KEY_PREFIX: ${IMMERSION_FACILITEE_S3_KEY_PREFIX} AIRFLOW_VAR_INSEE_FIRSTNAME_FILE_URL: ${INSEE_FIRSTNAME_FILE_URL} diff --git a/pipeline/dags/dags/settings.py b/pipeline/dags/dags/settings.py index 865aea880..1648b3bce 100644 --- a/pipeline/dags/dags/settings.py +++ b/pipeline/dags/dags/settings.py @@ -82,18 +82,6 @@ }, ], }, - { - "id": "cd72", - "schedule_interval": "@once", - "snapshot": False, - "streams": [ - { - "id": "rows", - "filename": "rows.xlsx", - "url": Variable.get("CD72_FILE_URL", None), - }, - ], - }, { "id": "emplois-de-linclusion", "schedule_interval": "@daily", @@ -257,4 +245,23 @@ }, ], }, + { + "id": "cd72", + "schedule_interval": "@daily", + "snapshot": False, + "streams": [ + { + "id": "structures", + "filename": "structures.csv", + "url": Variable.get("CD72_STRUCTURES_FILE_URL", None), + "token": Variable.get("GRIST_API_TOKEN", None), + }, + { + "id": "services", + "filename": "services.csv", + "url": Variable.get("CD72_SERVICES_FILE_URL", None), + "token": Variable.get("GRIST_API_TOKEN", None), + }, + ], + }, ] diff --git a/pipeline/dags/import_sources.py b/pipeline/dags/import_sources.py index 3a3a8c1f4..036773ac5 100644 --- a/pipeline/dags/import_sources.py +++ b/pipeline/dags/import_sources.py @@ -58,6 +58,7 @@ def _extract( from data_inclusion.scripts.tasks import ( dora, emplois_de_linclusion, + grist, mediation_numerique, mes_aides, soliguide, @@ -72,7 +73,7 @@ def _extract( "agefiph": utils.extract_http_content, "annuaire-du-service-public": utils.extract_http_content, "cd35": utils.extract_http_content, - "cd72": utils.extract_http_content, + "cd72": grist.extract, "data-inclusion": utils.extract_http_content, "dora": dora.extract, "emplois-de-linclusion": emplois_de_linclusion.extract, @@ -138,7 +139,7 @@ def _load( READ_FN_BY_SOURCE_ID = { "annuaire-du-service-public": annuaire_du_service_public.read, "cd35": lambda path: utils.read_csv(path, sep=";"), - "cd72": lambda path: utils.read_excel(path, sheet_name="Structures"), + "cd72": lambda path: utils.read_csv(path, sep=","), "data-inclusion": utils.read_json, "dora": utils.read_json, "emplois-de-linclusion": utils.read_json, diff --git a/pipeline/src/data_inclusion/scripts/tasks/grist.py b/pipeline/src/data_inclusion/scripts/tasks/grist.py new file mode 100644 index 000000000..9abe72a5d --- /dev/null +++ b/pipeline/src/data_inclusion/scripts/tasks/grist.py @@ -0,0 +1,119 @@ +import 
logging +import re + +import requests + +logger = logging.getLogger(__name__) + + +def log_and_raise(resp: requests.Response, *args, **kwargs): + try: + resp.raise_for_status() + except requests.HTTPError as err: + logger.error(resp.json()) + raise err + + +class GristClient: + def __init__(self, base_url: str, token: str) -> None: + self.base_url = base_url.rstrip("/") + self.session = requests.Session() + self.session.hooks["response"] = [log_and_raise] + self.session.headers.update({"Authorization": f"Bearer {token}"}) + + def _create_document(self, workspace_id: str, document_name: str) -> str: + return self.session.post( + self.base_url + f"/workspaces/{workspace_id}/docs", + json={"name": document_name}, + ).json() + + def create_document(self, workspace_id: str, document_name: str) -> str: + workspace_dict = self.describe_workspace(workspace_id=workspace_id) + + existing_document_dict = next( + ( + document_dict + for document_dict in workspace_dict["docs"] + if document_dict["name"] == document_name + ), + None, + ) + + if existing_document_dict is not None: + logger.warning( + f"A document with name '{document_name}' already exists in workspace." + ) + return existing_document_dict["id"] + + return self._create_document( + workspace_id=workspace_id, document_name=document_name + ) + + def _create_table(self, document_id: str, table_name: str, columns: list) -> str: + return self.session.post( + self.base_url + f"/docs/{document_id}/tables", + json={"tables": [{"id": table_name, "columns": columns}]}, + ).json()["tables"][0]["id"] + + def list_tables(self, document_id: str) -> list: + return self.session.get( + self.base_url + f"/docs/{document_id}/tables", + ).json()["tables"] + + def create_table(self, document_id: str, table_name: str, columns: list) -> str: + tables_list = self.list_tables(document_id=document_id) + + existing_table_dict = next( + ( + table_dict + for table_dict in tables_list + if table_dict["id"] == table_name + ), + None, + ) + + if existing_table_dict is not None: + logger.warning( + f"A table with name '{table_name}' already exists in document." 
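+                # NB: get-or-create behaviour, the existing table is reused
+                # as-is and its columns are not reconciled with `columns`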
+            )
+            return existing_table_dict["id"]
+
+        return self._create_table(
+            document_id=document_id, table_name=table_name, columns=columns
+        )
+
+    def describe_workspace(self, workspace_id: str):
+        # https://support.getgrist.com/api/#tag/workspaces/paths/~1workspaces~1%7BworkspaceId%7D/get
+        return self.session.get(self.base_url + f"/workspaces/{workspace_id}").json()
+
+    def download_table_content_as_csv(self, document_id: str, table_id: str) -> bytes:
+        # https://support.getgrist.com/api/#tag/docs/paths/~1docs~1%7BdocId%7D~1download~1csv/get
+        return self.session.get(
+            self.base_url + f"/docs/{document_id}/download/csv",
+            params={"tableId": table_id},
+        ).content
+
+    def add_records(self, document_id: str, table_id: str, records: list):
+        # https://support.getgrist.com/api/#tag/records/paths/~1docs~1%7BdocId%7D~1tables~1%7BtableId%7D~1records/post
+        return self.session.post(
+            self.base_url + f"/docs/{document_id}/tables/{table_id}/records",
+            json={"records": records},
+        )
+
+
+def extract(url: str, token: str, **kwargs) -> bytes:
+    match = re.search(
+        r"(?P<base_url>.+)/docs/(?P<document_id>\w+)/download/csv\?.*tableId=(?P<table_id>\w+)",  # noqa: E501
+        url,
+    )
+
+    if match is None:
+        raise Exception("Invalid url")
+
+    base_url, document_id, table_id = match.groups()
+
+    grist_client = GristClient(base_url=base_url, token=token)
+
+    return grist_client.download_table_content_as_csv(
+        document_id=document_id, table_id=table_id
+    )
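+
+
+# Usage sketch (illustration only), assuming GRIST_API_TOKEN grants read
+# access to the document behind the download URL, e.g. the
+# CD72_STRUCTURES_FILE_URL value from .template.env:
+#
+#   csv_bytes = extract(
+#       url=(
+#           "https://grist.incubateur.net/o/datainclusion/api/docs"
+#           "/dFpXXzs2fug9Kb7zZhyWyn/download/csv?tableId=Structures"
+#       ),
+#       token="<GRIST_API_TOKEN>",
+#   )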
From 907e403e424ca6c99fc304d91a924c4329b4ece9 Mon Sep 17 00:00:00 2001
From: Valentin Matton
Date: Wed, 13 Sep 2023 12:36:05 +0200
Subject: [PATCH 26/34] fix(dora): update profils mapping

---
 pipeline/dbt/models/intermediate/dora/int_dora__services.sql | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql
index 75a5bc042..81aaa8d37 100644
--- a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql
+++ b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql
@@ -9,8 +9,8 @@ di_profil_by_dora_profil AS (
         VALUES
         ('Adultes', 'adultes'),
         ('Femmes', 'femmes'),
-        ('Public bénéficiaire du Revenu de Solidarité Active (RSA)', 'beneficiaire-rsa'),
-        ('Demandeur d''emploi', 'demandeur-demploi')
+        ('Public bénéficiaire du Revenu de Solidarité Active (RSA)', 'beneficiaires-rsa'),
+        ('Demandeur d''emploi', 'demandeurs-demploi')
     ) AS x (dora_profil, di_profil)
 ),
 
From 9dc0bc3c841df0a106ad4debc9ebf948c66dd637 Mon Sep 17 00:00:00 2001
From: Valentin Matton
Date: Wed, 13 Sep 2023 12:56:21 +0200
Subject: [PATCH 27/34] feat(cd72): update with grist data

---
 pipeline/dbt/models/_sources.yml              |  3 +-
 pipeline/dbt/models/datalake.sql              |  3 +-
 .../intermediate/cd72/_cd72__models.yml       | 24 +++++++
 .../intermediate/cd72/int_cd72__adresses.sql  | 52 +++++++++++-----
 .../intermediate/cd72/int_cd72__services.sql  | 44 +++++++++++++
 .../cd72/int_cd72__structures.sql             | 62 ++++++++-----------
 .../intermediate/int__union_services.sql      |  1 +
 .../dbt/models/staging/cd72/_cd72__models.yml | 26 +++++++-
 .../models/staging/cd72/stg_cd72__rows.sql    | 26 --------
 .../staging/cd72/stg_cd72__services.sql       | 44 +++++++++++++
 .../staging/cd72/stg_cd72__structures.sql     | 24 +++++++
 11 files changed, 225 insertions(+), 84 deletions(-)
 create mode 100644 pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql
 delete mode 100644 pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql
 create mode 100644 pipeline/dbt/models/staging/cd72/stg_cd72__services.sql
 create mode 100644 pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql

diff --git a/pipeline/dbt/models/_sources.yml b/pipeline/dbt/models/_sources.yml
index 8d1214a43..ffae9f7ef 100644
--- a/pipeline/dbt/models/_sources.yml
+++ b/pipeline/dbt/models/_sources.yml
@@ -180,7 +180,8 @@ sources:
   - name: cd72
     schema: cd72
     tables:
-      - name: rows
+      - name: structures
+      - name: services
 
   - name: emplois_de_linclusion
     schema: emplois_de_linclusion
diff --git a/pipeline/dbt/models/datalake.sql b/pipeline/dbt/models/datalake.sql
index c77d4d451..628e09ab2 100644
--- a/pipeline/dbt/models/datalake.sql
+++ b/pipeline/dbt/models/datalake.sql
@@ -12,7 +12,8 @@ WITH source AS (
             relations=[
                 source('annuaire_du_service_public', 'etablissements'),
                 source('cd35', 'organisations'),
-                source('cd72', 'rows'),
+                source('cd72', 'structures'),
+                source('cd72', 'services'),
                 source('dora', 'structures'),
                 source('dora', 'services'),
                 source('emplois_de_linclusion', 'siaes'),
diff --git a/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml b/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml
index f5cf7c449..8b1df2eb2 100644
--- a/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml
+++ b/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml
@@ -13,6 +13,30 @@ models:
           - not_null
           - dbt_utils.not_empty_string
 
+  - name: int_cd72__services
+    tests:
+      - check_service:
+          config:
+            severity: warn
+    columns:
+      - name: id
+        tests:
+          - unique
+          - not_null
+          - dbt_utils.not_empty_string
+      - name: structure_id
+        tests:
+          - not_null
+          - relationships:
+              to: ref('int_cd72__structures')
+              field: id
+      - name: adresse_id
+        tests:
+          - not_null
+          - relationships:
+              to: ref('int_cd72__adresses')
+              field: id
+
   - name: int_cd72__structures
     tests:
       - check_structure:
diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql
index ffaa90b03..9be94f983 100644
--- a/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql
+++ b/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql
@@ -1,25 +1,43 @@
-WITH raw_rows AS (
-    SELECT * FROM {{ ref('stg_cd72__rows') }}
+WITH structures AS (
+    SELECT * FROM {{ ref('stg_cd72__structures') }}
 ),
 
-rows_with_id AS (
-    SELECT *
-    FROM raw_rows
-    WHERE id IS NOT NULL
+services AS (
+    SELECT * FROM {{ ref('stg_cd72__services') }}
 ),
 
-final AS (
+structure_adresses AS (
+    SELECT
+        id AS "id",
+        commune AS "commune",
+        code_postal AS "code_postal",
+        NULL AS "code_insee",
+        adresse AS "adresse",
+        NULL AS "complement_adresse",
+        _di_source_id AS "source",
+        CAST(NULL AS FLOAT) AS "longitude",
+        CAST(NULL AS FLOAT) AS "latitude"
+    FROM structures
+),
+
+service_adresses AS (
     SELECT
-        id AS "id",
-        ville AS "commune",
-        code_postal AS "code_postal",
-        NULL AS "code_insee",
-        adresse AS "adresse",
-        NULL AS "complement_adresse",
-        NULL::FLOAT AS "longitude",
-        NULL::FLOAT AS "latitude",
-        _di_source_id AS "source"
-    FROM rows_with_id
+        id AS "id",
+        commune AS "commune",
+        code_postal AS "code_postal",
+        NULL AS "code_insee",
+        adresse AS "adresse",
+        NULL AS "complement_adresse",
+        _di_source_id AS "source",
+        CAST(NULL AS FLOAT) AS "longitude",
+        CAST(NULL AS FLOAT) AS "latitude"
+    FROM services
+),
+
+final AS (
+    SELECT * FROM structure_adresses
+    UNION ALL
+    SELECT * FROM service_adresses
 )
 
 SELECT * FROM final
diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql
new file mode 100644
index 000000000..09443d0dc
--- /dev/null
+++ 
b/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql @@ -0,0 +1,44 @@ +WITH services AS ( + SELECT * FROM {{ ref('stg_cd72__services') }} +), + +final AS ( + SELECT + id AS "adresse_id", + TRUE AS "contact_public", + contact_nom_prenom AS "contact_nom_prenom", -- ignored for now + courriel AS "courriel", -- ignored for now + date_creation AS "date_creation", + date_maj AS "date_maj", + date_suspension AS "date_suspension", + NULL AS "formulaire_en_ligne", + frais_autres AS "frais_autres", + id AS "id", + NULL AS "lien_source", + NULL AS "modes_orientation_accompagnateur_autres", + modes_orientation_beneficiaire_autres AS "modes_orientation_beneficiaire_autres", + nom AS "nom", + presentation_resume AS "presentation_resume", + presentation_detail AS "presentation_detail", + NULL AS "prise_rdv", + profils AS "profils", + recurrence AS "recurrence", + _di_source_id AS "source", + structure_id AS "structure_id", + telephone AS "telephone", + thematiques AS "thematiques", + zone_diffusion_code AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", + zone_diffusion_type AS "zone_diffusion_type", + CAST(NULL AS BOOLEAN) AS "cumulable", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "modes_accueil", + CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", + CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", + CAST(NULL AS TEXT []) AS "types", + ARRAY[pre_requis] AS "pre_requis", + CAST(NULL AS TEXT []) AS "frais" + FROM services +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql index f790627a2..1e0eb123f 100644 --- a/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql +++ b/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql @@ -1,45 +1,33 @@ -WITH raw_rows AS ( - SELECT * FROM {{ ref('stg_cd72__rows') }} -), - -rows_with_id AS ( - SELECT * - FROM raw_rows - WHERE id IS NOT NULL +WITH structures AS ( + SELECT * FROM {{ ref('stg_cd72__structures') }} ), final AS ( SELECT - id AS "id", - id AS "adresse_id", - siret AS "siret", - NULL::BOOLEAN AS "antenne", - NULL AS "rna", - nom_structure AS "nom", - email_accueil AS "courriel", - site_internet AS "site_web", - _di_source_id AS "source", - NULL AS "lien_source", - horaires AS "horaires_ouverture", - NULL AS "accessibilite", - NULL::TEXT [] AS "labels_autres", - NULL::TEXT [] AS "thematiques", - NULL AS "typologie", - mise_a_jour_le::DATE AS "date_maj", - COALESCE(telephone_accueil, telephone_principal) AS "telephone", + NULL AS "accessibilite", + id AS "adresse_id", + courriel AS "courriel", + date_maj AS "date_maj", + horaires_ouverture AS "horaires_ouverture", + id AS "id", + NULL AS "lien_source", + nom AS "nom", + presentation_detail AS "presentation_detail", + NULL AS "presentation_resume", + NULL AS "rna", + siret AS "siret", + site_web AS "site_web", + _di_source_id AS "source", + telephone AS "telephone", + typologie AS "typologie", + CAST(NULL AS BOOLEAN) AS "antenne", + CAST(NULL AS TEXT []) AS "labels_autres", + CAST(NULL AS TEXT []) AS "thematiques", CASE - WHEN typologie_structure ~ 'AFPA' THEN ARRAY['afpa'] - WHEN typologie_structure ~ 'Mission Locale' THEN ARRAY['mission-locale'] - END AS "labels_nationaux", - CASE LENGTH(description) <= 280 - WHEN TRUE THEN description - WHEN FALSE THEN LEFT(description, 279) || '…' - END AS "presentation_resume", - CASE LENGTH(description) <= 280 - WHEN TRUE THEN NULL - WHEN FALSE THEN description - END AS 
"presentation_detail" - FROM rows_with_id + WHEN typologie = 'AFPA' THEN ARRAY['afpa'] + WHEN typologie = 'ML' THEN ARRAY['mission-locale'] + END AS "labels_nationaux" + FROM structures ) SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/int__union_services.sql b/pipeline/dbt/models/intermediate/int__union_services.sql index 9fc05d4b4..537324960 100644 --- a/pipeline/dbt/models/intermediate/int__union_services.sql +++ b/pipeline/dbt/models/intermediate/int__union_services.sql @@ -3,6 +3,7 @@ WITH services AS ( dbt_utils.union_relations( relations=[ ref('int_agefiph__services'), + ref('int_cd72__services'), ref('int_data_inclusion__services'), ref('int_dora__services'), ref('int_mediation_numerique__services'), diff --git a/pipeline/dbt/models/staging/cd72/_cd72__models.yml b/pipeline/dbt/models/staging/cd72/_cd72__models.yml index f14fcb3d8..001070c7d 100644 --- a/pipeline/dbt/models/staging/cd72/_cd72__models.yml +++ b/pipeline/dbt/models/staging/cd72/_cd72__models.yml @@ -1,6 +1,28 @@ version: 2 models: - - name: stg_cd72__rows + - name: stg_cd72__structures config: - tags: cd72 \ No newline at end of file + tags: cd72 + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + + - name: stg_cd72__services + config: + tags: cd72 + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('stg_cd72__structures') + field: id diff --git a/pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql b/pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql deleted file mode 100644 index c3a6dbf6a..000000000 --- a/pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql +++ /dev/null @@ -1,26 +0,0 @@ -WITH source AS ( - SELECT * FROM {{ source('cd72', 'rows') }} -), - -final AS ( - SELECT - _di_source_id AS "_di_source_id", - data ->> 'ID Structure' AS "id", - data ->> 'ID Structure' AS "id_structure", - data ->> 'SIRET' AS "siret", - data ->> 'Nom Structure' AS "nom_structure", - data ->> 'Ville' AS "ville", - data ->> 'Code postal' AS "code_postal", - data ->> 'Adresse' AS "adresse", - data ->> 'Typologie Structure' AS "typologie_structure", - data ->> 'Téléphone accueil' AS "telephone_accueil", - data ->> 'Téléphone principal' AS "telephone_principal", - data ->> 'E-mail accueil' AS "email_accueil", - data ->> 'Site Internet' AS "site_internet", - data ->> 'Description' AS "description", - data ->> 'Mis à jour le :' AS "mise_a_jour_le", - data ->> 'Horaires' AS "horaires" - FROM source -) - -SELECT * FROM final diff --git a/pipeline/dbt/models/staging/cd72/stg_cd72__services.sql b/pipeline/dbt/models/staging/cd72/stg_cd72__services.sql new file mode 100644 index 000000000..4b68cf302 --- /dev/null +++ b/pipeline/dbt/models/staging/cd72/stg_cd72__services.sql @@ -0,0 +1,44 @@ +WITH source AS ( + SELECT * FROM {{ source('cd72', 'services') }} +), + +structures AS ( + SELECT * FROM {{ ref('stg_cd72__structures') }} +), + +final AS ( + SELECT + _di_source_id AS "_di_source_id", + data ->> 'id' AS "id", + data ->> 'nom' AS "nom", + data ->> 'lieu' AS "lieu", + data ->> 'siret' AS "siret", + -- TODO: frais, change column type from bool to ref list on grist + data ->> 'adresse' AS "adresse", + data ->> 'commune' AS "commune", + (SELECT ARRAY_AGG(TRIM(p)) FROM UNNEST(STRING_TO_ARRAY(data ->> 'profils', ',')) AS "p") AS "profils", + data ->> 'courriel' AS "courriel", + TO_DATE(data ->> 'date_maj', 'YYYY-MM-DD') AS "date_maj", + data ->> 'telephone' AS "telephone", + data 
->> 'pre_requis' AS "pre_requis", + data ->> 'recurrence' AS "recurrence", + data ->> 'code_postal' AS "code_postal", + data ->> 'contact_nom_prenom' AS "contact_nom_prenom", + data ->> 'frais_autres' AS "frais_autres", + (SELECT ARRAY_AGG(TRIM(t)) FROM UNNEST(STRING_TO_ARRAY(data ->> 'thematiques', ',')) AS "t") AS "thematiques", + data ->> 'structure_id' AS "structure_id", + TO_DATE(data ->> 'date_creation', 'YYYY-MM-DD') AS "date_creation", + TO_DATE(data ->> 'date_suspension', 'YYYY-MM-DD') AS "date_suspension", + data ->> 'zone_diffusion_nom' AS "zone_diffusion_nom", + data ->> 'presentation_detail' AS "presentation_detail", + data ->> 'presentation_resume' AS "presentation_resume", + data ->> 'zone_diffusion_code' AS "zone_diffusion_code", + data ->> 'zone_diffusion_type' AS "zone_diffusion_type", + data ->> 'modes_orientation_beneficiaire_autres' AS "modes_orientation_beneficiaire_autres" + FROM source + WHERE + data ->> 'structure_id' IS NOT NULL + AND data ->> 'structure_id' IN (SELECT id FROM structures) +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql b/pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql new file mode 100644 index 000000000..b786b8427 --- /dev/null +++ b/pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql @@ -0,0 +1,24 @@ +WITH source AS ( + SELECT * FROM {{ source('cd72', 'structures') }} +), + +final AS ( + SELECT + _di_source_id AS "_di_source_id", + data ->> 'id' AS "id", + data ->> 'nom' AS "nom", + data ->> 'siret' AS "siret", + data ->> 'adresse' AS "adresse", + data ->> 'commune' AS "commune", + data ->> 'courriel' AS "courriel", + CAST(data ->> 'date_maj' AS DATE) AS "date_maj", + data ->> 'site_web' AS "site_web", + data ->> 'telephone' AS "telephone", + data ->> 'typologie' AS "typologie", + data ->> 'code_postal' AS "code_postal", + data ->> 'horaires_ouverture' AS "horaires_ouverture", + data ->> 'presentation_detail' AS "presentation_detail" + FROM source +) + +SELECT * FROM final From da1e51995f6bd7a0e8abfa56ef90933ac7969d67 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Wed, 13 Sep 2023 13:15:48 +0200 Subject: [PATCH 28/34] chore(cd72): schedule once --- pipeline/dags/dags/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/dags/dags/settings.py b/pipeline/dags/dags/settings.py index 1648b3bce..dd2264700 100644 --- a/pipeline/dags/dags/settings.py +++ b/pipeline/dags/dags/settings.py @@ -247,7 +247,7 @@ }, { "id": "cd72", - "schedule_interval": "@daily", + "schedule_interval": "@once", "snapshot": False, "streams": [ { From 93ba39d54ff67fd2febc9381c5aaa1eb0e50e400 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 14 Sep 2023 09:43:19 +0200 Subject: [PATCH 29/34] chore(mednum): update extracted schema --- .../src/data_inclusion/scripts/tasks/mediation_numerique.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py b/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py index 45a102f00..8a570c5de 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py +++ b/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py @@ -13,7 +13,7 @@ def get_resources_url_from_dataset_url(dataset_url: str) -> dict[str, str]: data_inclusion_resources = [ resource_data for resource_data in dataset_data["resources"] - if resource_data["schema"]["name"] == "betagouv/data-inclusion-schema" + if resource_data["schema"]["name"] == 
"gip-inclusion/data-inclusion-schema" ] # identify urls based on resource titles From 66575038dac7650ea6809fa17e7cc913650898ed Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Thu, 14 Sep 2023 09:46:54 +0200 Subject: [PATCH 30/34] chore: update repo refs --- README.md | 2 +- api/src/data_inclusion/api/utils/code_officiel_geographique.py | 3 ++- pipeline/scripts/update_schema_seeds.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 63b295050..86d75a6e5 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Les données collectées sont: * enrichies via les outils développés par data·inclusion: * l'outil de correspondance, qui permet de faire correspondre 2 jeux de données brutes, * l'outil de sirétisation, qui permet d'attribuer un siret aux structures, afin de croiser, -* alignées sur le [schéma de données de data·inclusion](https://schema.data.gouv.fr/betagouv/data-inclusion-schema/) +* alignées sur le [schéma de données de data·inclusion](https://schema.data.gouv.fr/gip-inclusion/data-inclusion-schema/) * publiées régulièrement en [open data sur data.gouv](https://www.data.gouv.fr/fr/datasets/referentiel-de-loffre-dinsertion-liste-des-structures-et-services-dinsertion/), la plateforme de données publiques, * consultables via une api. diff --git a/api/src/data_inclusion/api/utils/code_officiel_geographique.py b/api/src/data_inclusion/api/utils/code_officiel_geographique.py index 3eaf03ed6..1564afa59 100644 --- a/api/src/data_inclusion/api/utils/code_officiel_geographique.py +++ b/api/src/data_inclusion/api/utils/code_officiel_geographique.py @@ -1,4 +1,5 @@ -# based on https://github.com/betagouv/dora-back/blob/main/dora/admin_express/utils.py +# based on +# https://github.com/gip-inclusion/dora-back/blob/main/dora/admin_express/utils.py CODES_ARRONDISSEMENTS_BY_CODE_COMMUNE = { # Paris diff --git a/pipeline/scripts/update_schema_seeds.py b/pipeline/scripts/update_schema_seeds.py index 89f96dcb0..5805f8bb6 100644 --- a/pipeline/scripts/update_schema_seeds.py +++ b/pipeline/scripts/update_schema_seeds.py @@ -7,7 +7,7 @@ BASE_URL = os.environ.get( "BASE_URL", - "https://raw.githubusercontent.com/betagouv/data-inclusion-schema/latest/schemas/extra/", # noqa: E501 + "https://raw.githubusercontent.com/gip-inclusion/data-inclusion-schema/latest/schemas/extra/", # noqa: E501 ) OUTPUT_DIR = Path(__file__).parent.parent / "dbt" / "seeds" / "schema" From 4d2fdd87e06e62f6e7b7fa3d5a10252fe4a6ae0c Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Mon, 18 Sep 2023 09:10:18 +0200 Subject: [PATCH 31/34] fix(cd72): pre_requis --- .../intermediate/cd72/int_cd72__services.sql | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql index 09443d0dc..f59032148 100644 --- a/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql +++ b/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql @@ -4,40 +4,40 @@ WITH services AS ( final AS ( SELECT - id AS "adresse_id", - TRUE AS "contact_public", - contact_nom_prenom AS "contact_nom_prenom", -- ignored for now - courriel AS "courriel", -- ignored for now - date_creation AS "date_creation", - date_maj AS "date_maj", - date_suspension AS "date_suspension", - NULL AS "formulaire_en_ligne", - frais_autres AS "frais_autres", - id AS "id", - NULL AS "lien_source", - NULL AS "modes_orientation_accompagnateur_autres", - 
modes_orientation_beneficiaire_autres AS "modes_orientation_beneficiaire_autres", - nom AS "nom", - presentation_resume AS "presentation_resume", - presentation_detail AS "presentation_detail", - NULL AS "prise_rdv", - profils AS "profils", - recurrence AS "recurrence", - _di_source_id AS "source", - structure_id AS "structure_id", - telephone AS "telephone", - thematiques AS "thematiques", - zone_diffusion_code AS "zone_diffusion_code", - NULL AS "zone_diffusion_nom", - zone_diffusion_type AS "zone_diffusion_type", - CAST(NULL AS BOOLEAN) AS "cumulable", - CAST(NULL AS TEXT []) AS "justificatifs", - CAST(NULL AS TEXT []) AS "modes_accueil", - CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", - CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", - CAST(NULL AS TEXT []) AS "types", - ARRAY[pre_requis] AS "pre_requis", - CAST(NULL AS TEXT []) AS "frais" + id AS "adresse_id", + TRUE AS "contact_public", + contact_nom_prenom AS "contact_nom_prenom", -- ignored for now + courriel AS "courriel", -- ignored for now + date_creation AS "date_creation", + date_maj AS "date_maj", + date_suspension AS "date_suspension", + NULL AS "formulaire_en_ligne", + frais_autres AS "frais_autres", + id AS "id", + NULL AS "lien_source", + NULL AS "modes_orientation_accompagnateur_autres", + modes_orientation_beneficiaire_autres AS "modes_orientation_beneficiaire_autres", + nom AS "nom", + presentation_resume AS "presentation_resume", + presentation_detail AS "presentation_detail", + NULL AS "prise_rdv", + profils AS "profils", + recurrence AS "recurrence", + _di_source_id AS "source", + structure_id AS "structure_id", + telephone AS "telephone", + thematiques AS "thematiques", + zone_diffusion_code AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", + zone_diffusion_type AS "zone_diffusion_type", + CAST(NULL AS BOOLEAN) AS "cumulable", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "modes_accueil", + CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", + CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", + CAST(NULL AS TEXT []) AS "types", + CASE WHEN pre_requis IS NOT NULL THEN ARRAY[pre_requis] END AS "pre_requis", + CAST(NULL AS TEXT []) AS "frais" FROM services ) From 9d4bbb598c27f1356bc24d483301bd16d3de9682 Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 19 Sep 2023 11:05:55 +0200 Subject: [PATCH 32/34] docs: update pip-compile --- api/CONTRIBUTING.md | 12 ++++++------ pipeline/CONTRIBUTING.md | 16 +++++++++------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/api/CONTRIBUTING.md b/api/CONTRIBUTING.md index 248c738d4..169c5596a 100644 --- a/api/CONTRIBUTING.md +++ b/api/CONTRIBUTING.md @@ -45,16 +45,16 @@ tox # 1. add/remove packages from the requirements in setup.py # 2. compile dependencies -pip-compile --resolver=backtracking --output-file=requirements/requirements.txt -pip-compile --resolver=backtracking --extra=dev --output-file=requirements/dev-requirements.txt -pip-compile --resolver=backtracking --extra=test --output-file=requirements/test-requirements.txt +pip-compile --output-file=requirements/requirements.txt +pip-compile --extra=dev --output-file=requirements/dev-requirements.txt +pip-compile --extra=test --output-file=requirements/test-requirements.txt ``` ### 2. Upgrading packages ```bash # 1. 
compile dependencies with the upgrade flag set -pip-compile --resolver=backtracking --upgrade --output-file=requirements/requirements.txt && \ - pip-compile --resolver=backtracking --upgrade --extra=dev --output-file=requirements/dev-requirements.txt && \ - pip-compile --resolver=backtracking --upgrade --extra=test --output-file=requirements/test-requirements.txt +pip-compile --upgrade --output-file=requirements/requirements.txt && \ + pip-compile --upgrade --extra=dev --output-file=requirements/dev-requirements.txt && \ + pip-compile --upgrade --extra=test --output-file=requirements/test-requirements.txt ``` \ No newline at end of file diff --git a/pipeline/CONTRIBUTING.md b/pipeline/CONTRIBUTING.md index c6f893946..a4f0f5b22 100644 --- a/pipeline/CONTRIBUTING.md +++ b/pipeline/CONTRIBUTING.md @@ -67,6 +67,8 @@ python scripts/update_schema_seeds.py ## Project requirements +* `pip-compile~=7.3` + ### airflow These requirements are mainly used for the deployment on scalingo. @@ -83,7 +85,7 @@ To update the constraints and upgrade the requirements: AIRFLOW_VERSION= PYTHON_VERSION=3.10 curl https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt > requirements/airflow/constraints.txt -pip-compile --resolver=backtracking --upgrade requirements/airflow/requirements.in --output-file requirements/airflow/requirements.txt +pip-compile --upgrade requirements/airflow/requirements.in ``` ### tasks @@ -99,15 +101,15 @@ To add or delete a dependency to these requirements: ```bash # 1. edit the target requirements/tasks/...../requirements.in # 2. compile the dependencies -pip-compile --resolver=backtracking requirements/tasks/dbt/requirements.in --output-file requirements/tasks/dbt/requirements.txt -pip-compile --resolver=backtracking requirements/tasks/python/requirements.in --output-file requirements/tasks/python/requirements.txt +pip-compile requirements/tasks/dbt/requirements.in +pip-compile requirements/tasks/python/requirements.in ``` To upgrade these requirements: ```bash -pip-compile --resolver=backtracking --upgrade requirements/tasks/dbt/requirements.in --output-file requirements/tasks/dbt/requirements.txt -pip-compile --resolver=backtracking --upgrade requirements/tasks/python/requirements.in --output-file requirements/tasks/python/requirements.txt +pip-compile --upgrade requirements/tasks/dbt/requirements.in +pip-compile --upgrade requirements/tasks/python/requirements.in ``` Then you should update the dev requirements. @@ -122,11 +124,11 @@ To add or delete a dependency to these dev requirements: ```bash # 1. edit the target requirements/dev/requirements.in # 2. 
compile the dependencies -pip-compile --resolver=backtracking requirements/dev/requirements.in --output-file requirements/dev/requirements.txt +pip-compile requirements/dev/requirements.in ``` To upgrade these requirements: ```bash -pip-compile --resolver=backtracking --upgrade requirements/dev/requirements.in --output-file requirements/dev/requirements.txt +pip-compile --upgrade requirements/dev/requirements.in ``` \ No newline at end of file From 427131c4400cd727c2854d14f5fdcc0e943f401d Mon Sep 17 00:00:00 2001 From: Colin Maudry Date: Tue, 26 Sep 2023 17:13:15 +0200 Subject: [PATCH 33/34] =?UTF-8?q?feat(pipeline):=20r=C3=A9seau=20alpha=20(?= =?UTF-8?q?#115)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .template.env | 1 + analyse/.template.env | 3 +- analyse/notebooks/reseau-alpha/.gitignore | 3 + analyse/notebooks/reseau-alpha/extract.ipynb | 331 ++++++++++++++++++ analyse/requirements.in | 3 + analyse/requirements.txt | 169 +++++++-- docker-compose.yml | 1 + pipeline/dags/dags/settings.py | 17 + pipeline/dags/import_sources.py | 12 + pipeline/dbt/models/_sources.yml | 8 +- .../intermediate/int__union_adresses.sql | 1 + .../intermediate/int__union_services.sql | 1 + .../int__union_services__enhanced.sql | 14 +- .../intermediate/int__union_structures.sql | 1 + .../reseau_alpha/_reseau_alpha__models.yml | 56 +++ .../int_reseau_alpha__adresses.sql | 43 +++ .../int_reseau_alpha__services.sql | 64 ++++ .../int_reseau_alpha__structures.sql | 30 ++ .../reseau_alpha/_reseau_alpha__models.yml | 24 ++ .../stg_reseau_alpha__formations.sql | 71 ++++ .../stg_reseau_alpha__structures.sql | 67 ++++ pipeline/requirements/dev/requirements.txt | 26 +- .../requirements/tasks/python/requirements.in | 1 + .../tasks/python/requirements.txt | 20 +- .../scripts/tasks/reseau_alpha.py | 205 +++++++++++ 25 files changed, 1123 insertions(+), 49 deletions(-) create mode 100644 analyse/notebooks/reseau-alpha/.gitignore create mode 100644 analyse/notebooks/reseau-alpha/extract.ipynb create mode 100644 pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml create mode 100644 pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql create mode 100644 pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql create mode 100644 pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql create mode 100644 pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml create mode 100644 pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql create mode 100644 pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql create mode 100644 pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py diff --git a/.template.env b/.template.env index b55c2f623..9bd28547d 100644 --- a/.template.env +++ b/.template.env @@ -89,6 +89,7 @@ MES_AIDES_AIRTABLE_KEY= MES_AIDES_GARAGES_URL=https://airtable.com/appEvva5gyqqoQRnr/tblnGf4Y5EUEeVHtJ/viw9ZZAUkexq6uDaI MONENFANT_CRECHES_FILE_URL= ODSPEP_S3_KEY_PREFIX=sources/odspep/2023-01-23/denormalized/Exports/ +RESEAU_ALPHA_URL=https://www.reseau-alpha.org SIAO_FILE_URL= SIRENE_STOCK_ETAB_GEOCODE_FILE_URL=https://data.cquest.org/geo_sirene/v2019/last/StockEtablissementActif_utf8_geo.csv.gz SIRENE_STOCK_ETAB_HIST_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/88fbb6b4-0320-443e-b739-b4376a012c32 diff --git a/analyse/.template.env b/analyse/.template.env index bb5187f29..bdd2044a1 100644 --- a/analyse/.template.env +++ 
b/analyse/.template.env @@ -8,4 +8,5 @@ FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c CD72_FILE_URL= CD93_FILE_URL= CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/665776ae-fa25-46ab-9bfd-c4241866f03f/download/annuaire_sociale_fixe.csv -CD62_FILE_URL= \ No newline at end of file +CD62_FILE_URL= +RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 \ No newline at end of file diff --git a/analyse/notebooks/reseau-alpha/.gitignore b/analyse/notebooks/reseau-alpha/.gitignore new file mode 100644 index 000000000..d1452f15d --- /dev/null +++ b/analyse/notebooks/reseau-alpha/.gitignore @@ -0,0 +1,3 @@ +structures +services +*.html \ No newline at end of file diff --git a/analyse/notebooks/reseau-alpha/extract.ipynb b/analyse/notebooks/reseau-alpha/extract.ipynb new file mode 100644 index 000000000..07722a2c9 --- /dev/null +++ b/analyse/notebooks/reseau-alpha/extract.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scraping des structures et services publiés sur le site Web de Réseau alpha\n", + "\n", + "Le scraping commence sur cette page pour l'Essonne : https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D=\n", + "\n", + "Cette page est générée dynamiquement et Scrapy ne peut donc pas en extraire le contenu. Le HTML doit donc être extrait à la main et sauvegardé dans le même dossier que ce notebook sous le nom `structure_list.html`.\n", + "\n", + "Le script permet de scraper une copie locale du HTML pour les formations et les structures. C'est utile pour tester le script sans envoyer de requêtes au site Web original. Pour ce faire :\n", + "\n", + "1. Faire tourner au moins une fois le scrap avec RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 pour télécharger le HTML depuis le site live sur l'ordinateur dans les dossiers `./structures` et `./services`\n", + "2. Set RESEAU_ALPHA_TEST_W_LOCAL_FILES=1\n", + "\n", + "### Structure du script\n", + "\n", + "1. `start_requests` démarre le scraping à partir de la page de résultats de rechercher\n", + "2. `parse` parse cette page pour extraire la liste des formations (pas encore les permanences)\n", + "3. `parse_formation` scrape le contenu de la page de chaque formation et passe le dictionnaire item à la fonction suivante\n", + "4. `parse_structure` scrape la page de la structure liée à la formation en enrichissant le dictionnaire item. Cette fonction est appelée autant de fois qu'il y a de lieux pour la formation\n", + "5. 
à la fin de \`parse_structure\`, le dictionnaire item est \"yield\" pour former une ligne du CSV (ou un objet dans l'array JSON)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scrapy\n",
+    "from scrapy.crawler import CrawlerProcess\n",
+    "from pathlib import Path\n",
+    "import re\n",
+    "import dateparser\n",
+    "import os\n",
+    "import dotenv\n",
+    "import trafilatura\n",
+    "\n",
+    "dotenv.load_dotenv(dotenv.find_dotenv())\n",
+    "TESTING_WITH_LOCAL_FILES = os.getenv(\"RESEAU_ALPHA_TEST_W_LOCAL_FILES\", 'False').lower() in ('true', '1', 't')\n",
+    "\n",
+    "# Local HTML\n",
+    "base_path = 'file://' + os.path.abspath('')\n",
+    "structure_base_path = base_path + '/structures'\n",
+    "formation_base_path = base_path + '/services'\n",
+    "\n",
+    "\n",
+    "\n",
+    "URL = f\"{base_path}/structure_list.html\"\n",
+    "if TESTING_WITH_LOCAL_FILES is False:\n",
+    "    # strip the file:// prefix, os.makedirs expects a filesystem path\n",
+    "    os.makedirs(structure_base_path[7:], exist_ok=True)\n",
+    "    os.makedirs(formation_base_path[7:], exist_ok=True)\n",
+    "\n",
+    "# Live HTML (don't use too much to avoid being banned!)\n",
+    "# structure_base_url = 'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'\n",
+    "\n",
+    "\n",
+    "# Structure avec antennes et formations : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries\n",
+    "# Structure sans antenne et sans formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/acafi\n",
+    "# Formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries/formation/francais-a-visee-professionnelle/b8a73-francais-a-visee-sociale-et-ou-professionnelle\n",
+    "\n",
+    "def html_to_markdown(s: str):\n",
+    "    if s is None or s == \"\" :\n",
+    "        return s\n",
+    "    if type(s) == list:\n",
+    "        s = \"\".join(s)\n",
+    "    return trafilatura.extract(trafilatura.load_html(\"<html>\" + s + \"</html>\"), no_fallback=True, max_tree_size=1000)\n",
+    "\n",
+    "def clean_adresse(adresses: list | scrapy.Selector) -> list[dict]:\n",
+    "    lieux = []\n",
+    "    for adresse in adresses:\n",
+    "        adresse_text_chunks = adresse.xpath('text()').getall()\n",
+    "        clean_lieu = {\n",
+    "            \"structure_service_adresse_entiere\": \"\",\n",
+    "            \"structure_service_adresse\": \"\",\n",
+    "            \"structure_service_code_postal\": \"\",\n",
+    "            \"structure_service_commune\": \"\"\n",
+    "        }\n",
+    "        for part in adresse_text_chunks:\n",
+    "            part = part.strip()\n",
+    "            if re.match(r'^\\d', part):\n",
+    "                if re.match(r'^\\d{5}', part):\n",
+    "                    split_address = part.split(\" - \")\n",
+    "                    clean_lieu[\"structure_service_code_postal\"] = split_address[0]\n",
+    "                    clean_lieu[\"structure_service_commune\"] = split_address[1]\n",
+    "                else:\n",
+    "                    clean_lieu[\"structure_service_adresse\"] = part\n",
+    "                clean_lieu[\"structure_service_adresse_entiere\"] += part + \", \"\n",
+    "        lieux.append(clean_lieu)\n",
+    "    return lieux\n",
+    "\n",
+    "def strip(maybe_string):\n",
+    "    if type(maybe_string) == str:\n",
+    "        return maybe_string.strip()\n",
+    "    if maybe_string is None:\n",
+    "        return \"\"\n",
+    "    else:\n",
+    "        return maybe_string"
+   ]
+  },
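+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick sanity check for clean_adresse on a hand-built fragment; the\n",
+    "# markup below is hypothetical and only mirrors the shape of the\n",
+    "# 'div.adresse' blocks found on the live pages:\n",
+    "demo = scrapy.Selector(text=\"<div class='adresse'>12 rue de la Gare<br>91000 - Évry</div>\")\n",
+    "clean_adresse(demo.css('div.adresse'))\n",
+    "# -> [{'structure_service_adresse_entiere': '12 rue de la Gare, 91000 - Évry, ',\n",
+    "#      'structure_service_adresse': '12 rue de la Gare',\n",
+    "#      'structure_service_code_postal': '91000',\n",
+    "#      'structure_service_commune': 'Évry'}]"
+   ]
+  },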
\".join(s)\n", + " return trafilatura.extract(trafilatura.load_html(\"\" + s + \"\"), no_fallback=True, max_tree_size=1000)\n", + "\n", + "def clean_adresse(adresses: list or scrapy.Selector) -> {} or []:\n", + " lieux = []\n", + " for adresse in adresses:\n", + " adresse_text_chunks = adresse.xpath('text()').getall()\n", + " clean_lieu = {\n", + " \"structure_service_adresse_entiere\": \"\",\n", + " \"structure_service_adresse\": \"\",\n", + " \"structure_service_code_postal\": \"\",\n", + " \"structure_service_commune\": \"\"\n", + " }\n", + " for part in adresse_text_chunks:\n", + " part = part.strip()\n", + " if re.match(r'^\\d', part):\n", + " if re.match(r'^\\d{5}', part):\n", + " split_address = part.split(\" - \")\n", + " clean_lieu[\"structure_service_code_postal\"] = split_address[0]\n", + " clean_lieu[\"structure_service_commune\"] = split_address[1]\n", + " else:\n", + " clean_lieu[\"structure_service_adresse\"] = part\n", + " clean_lieu[\"structure_service_adresse_entiere\"] += part + \", \"\n", + " lieux.append(clean_lieu)\n", + " return lieux\n", + "\n", + "def strip(maybe_string):\n", + " if type(maybe_string) == str:\n", + " return maybe_string.strip()\n", + " if maybe_string == None:\n", + " return \"\"\n", + " else:\n", + " return maybe_string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class AlphaSpider(scrapy.Spider):\n", + " name = \"alpha\"\n", + " custom_settings = {\n", + " \"DOWNLOAD_DELAY\": 0 if TESTING_WITH_LOCAL_FILES else 0.5\n", + " }\n", + "\n", + " def start_requests(self):\n", + " urls = [\n", + " URL\n", + " ]\n", + " for url in urls:\n", + " yield scrapy.Request(url=url, callback=self.parse)\n", + "\n", + " def parse(self, response):\n", + " \n", + " formations_links = response.css('div#div-accordion-formation > div.contact-content a.readon')\n", + " \n", + " if TESTING_WITH_LOCAL_FILES:\n", + " for slug in formations_links.xpath('@href').getall():\n", + " next_page = f\"{formation_base_path}/{slug.split('/')[-1]}\"\n", + " yield scrapy.Request(next_page, callback=self.parse_formation)\n", + " else:\n", + " for a in formations_links:\n", + " yield response.follow(a, callback=self.parse_formation)\n", + "\n", + "\n", + " def parse_formation(self, response):\n", + "\n", + " if TESTING_WITH_LOCAL_FILES is False:\n", + " # Downloading HTML content\n", + " page = response.url.split(\"/\")[-1]\n", + " # Path doesn't deal with file:// URIs\n", + " filepath = Path(formation_base_path[7:]) / page\n", + " filepath.write_bytes(response.body)\n", + "\n", + " formation_entete = response.css('div.entete')\n", + " formation_contenu = response.css('div.entete + div')\n", + " formation_contenu_col1 = response.css('div.entete + div > div:nth-child(1)')\n", + " formation_contenu_col2 = response.css('div.entete + div > div:nth-child(2)')\n", + " formation_inscription_info = formation_contenu_col2.css('div:nth-of-type(1)')\n", + " formation_inscription_contact = formation_contenu_col2.css('div:nth-of-type(2)')\n", + " formation_informations_pratiques = formation_contenu_col2.css('div:nth-of-type(3)')\n", + " formation_lieux_horaires = response.css('div#lieux-formation')\n", + "\n", + "\n", + " # SERVICE\n", + " item = {}\n", + "\n", + " # Nom\n", + " service_nom_1 = strip(response.css(\"div.titre-element > strong::text\").get())\n", + " service_nom_2 = strip(response.css(\"a.underline.red-alpha + div::text\").get())\n", + " item[\"nom\"] = f\"{service_nom_1} ({service_nom_2})\"\n", + "\n", + " # Date de 
màj\n", + " date_maj_fr = strip(response.css(\"a.underline.red-alpha + div + div::text\").get().split(\":\")[-1])\n", + " item[\"date_maj\"] = dateparser.parse(date_maj_fr).isoformat()\n", + " \n", + " # Description\n", + " contenu_objectif_public = formation_contenu_col1.css(\".row div\").getall()\n", + " contenu_objectif_public += formation_informations_pratiques.get()\n", + " # les descriptions sont très longues et rendent difficiles le test des autres champs\n", + " # item[\"presentation_detail\"] = html_to_markdown(contenu_objectif_public)\n", + "\n", + " # Lien vers la source\n", + " item[\"lien_source\"] = response.url\n", + "\n", + " # Courriel\n", + " item[\"courriel\"] = strip(formation_inscription_contact.css('div.email.red-alpha > a::attr(href)').get()).split(\":\")[-1]\n", + "\n", + " # Adresse\n", + " clean_lieux = clean_adresse(formation_lieux_horaires.css(\"div.adresse\"))\n", + "\n", + " # Téléphone\n", + " item[\"telephone\"] = \"\"\n", + " \n", + " # Contact nom prénom\n", + " item[\"contact_nom_prenom\"] = \"\"\n", + "\n", + " # Thématiques\n", + " item[\"thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n", + " if service_nom_2 == \"Français à visée professionnelle\":\n", + " item[\"thematiques\"].append(\"apprendre-francais--accompagnement-insertion-pro\")\n", + " if service_nom_2 == \"Français à visée sociale et communicative\":\n", + " item[\"thematiques\"].append(\"apprendre-francais--communiquer-vie-tous-les-jours\")\n", + "\n", + " # Hard coded fields\n", + " item[\"zone_diffusion_type\"] = \"departement\"\n", + " item[\"zone_diffusion_code\"] = \"91\"\n", + " item[\"zone_diffusion_nom\"] = \"Essonne\"\n", + " item[\"types\"] = [\"formation\"]\n", + " item[\"cumulable\"] = True\n", + " item[\"contact_public\"] = True\n", + " item[\"modes_accueil\"] = [\"en-presentiel\"]\n", + "\n", + " \n", + " # STRUCTURE\n", + " # ID de la structure\n", + " structure_link_element = formation_entete.css(\"div.titre-element ~ a.underline.red-alpha\")\n", + " item[\"structure_id\"] = structure_link_element.xpath(\"@href\").get().split(\"/\")[-1]\n", + " if TESTING_WITH_LOCAL_FILES:\n", + " structure_link = f\"{structure_base_path}/{item['structure_id']}\"\n", + " else:\n", + " structure_link = structure_link_element.xpath(\"@href\").get()\n", + " \n", + " \n", + "\n", + " # Une ligne/record de service et une structure par lieu\n", + " service_id_suffix = 1\n", + " for lieu in clean_lieux:\n", + " # Id\n", + " item[\"id\"] = f\"{response.url.split('/')[-1]}_{str(service_id_suffix)}\"\n", + " service_id_suffix += 1\n", + " print(lieu)\n", + " item = item | lieu\n", + " yield scrapy.Request(structure_link, callback=self.parse_structure, meta={\"item\": item}, dont_filter=True)\n", + " \n", + " def parse_structure(self, response):\n", + " if TESTING_WITH_LOCAL_FILES is False:\n", + " # Downloading HTML content\n", + " page = response.url.split(\"/\")[-1]\n", + " # Path doesn't deal with file:// URIs\n", + " filepath = Path(structure_base_path[7:]) / page\n", + " filepath.write_bytes(response.body)\n", + "\n", + " item = response.meta.get(\"item\")\n", + " \n", + "\n", + " # Nom\n", + " item[\"structure_nom\"] = strip(response.css('div#structure > strong::text').get())\n", + "\n", + " # Data màj\n", + " item[\"structure_date_maj\"] = strip(response.css('div.structures-dates > div:nth-child(2)').xpath('text()').get())\n", + " item[\"structure_date_maj\"] = item[\"structure_date_maj\"].split(\" : \")[-1]\n", + " item[\"structure_date_maj\"] = 
dateparser.parse(item[\"structure_date_maj\"]).isoformat()\n", + "\n", + " # Adresse\n", + " # Sur le site Web, une structure a autant d'adresses qu'elle a de lieux pour ses services\n", + " # Certains services sont proposés sur toutes les adresses de la structure, certains non.\n", + "\n", + " # Téléphone\n", + " telephone = response.css('div.lieu div.telephone > a::attr(href)').get()\n", + " if type(telephone) == str:\n", + " # Les numéro de téléphone sont préfixés par tel:\n", + " telephone = telephone.strip()[4:]\n", + " else:\n", + " telephone = \"\"\n", + " item[\"structure_telephone\"] = telephone\n", + " \n", + " # Site Web\n", + " item[\"structure_site_web\"] = strip(response.css('div.lieu div.facebook::text').get())\n", + "\n", + " # Lien source\n", + " item[\"structure_lien_source\"] = response.url\n", + "\n", + " # Labels\n", + " item[\"structure_labels_autres\"] = [\"reseau-alpha\"]\n", + "\n", + " # Thématiques\n", + " item[\"structure_thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n", + "\n", + "\n", + " yield item\n", + "\n", + " \n", + "process = CrawlerProcess(settings={\n", + " \"FEEDS\": {\n", + " # Seul le JSON est utilisable dans le pipeline car le CSV imprime les listes sans square brackets ([])\n", + " # Le CSV est pratique pour tester\n", + " \"alpha.json\": {\n", + " \"format\": \"json\",\n", + " \"overwrite\": True,\n", + " \"ensure_ascii\": False,\n", + " 'encoding': 'utf8',\n", + " 'store_empty': False,\n", + " },\n", + " \"alpha.csv\": {\n", + " \"format\": \"csv\",\n", + " \"overwrite\": True,\n", + " 'encoding': 'utf8',\n", + " },\n", + " },\n", + "})\n", + "process.crawl(AlphaSpider)\n", + "process.start()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('./alpha.csv', dtype = str, index_col=None)\n", + "df.info()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv-analyse", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analyse/requirements.in b/analyse/requirements.in index 2bf5e3f63..c40fdaa5f 100644 --- a/analyse/requirements.in +++ b/analyse/requirements.in @@ -12,3 +12,6 @@ seaborn pyairtable pyproj minio +scrapy +dateparser +trafilatura \ No newline at end of file diff --git a/analyse/requirements.txt b/analyse/requirements.txt index 676867f34..59dc4322d 100644 --- a/analyse/requirements.txt +++ b/analyse/requirements.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with python 3.10 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: # -# pip-compile +# pip-compile --output-file=analyse/requirements.txt analyse/requirements.in # anyio==3.6.2 # via jupyter-server @@ -18,13 +18,19 @@ arrow==1.2.3 asttokens==2.2.1 # via stack-data attrs==22.2.0 - # via jsonschema + # via + # automat + # jsonschema + # service-identity + # twisted +automat==22.10.0 + # via twisted backcall==0.2.0 # via ipython beautifulsoup4==4.11.2 # via nbconvert black[jupyter]==23.1.0 - # via -r requirements.in + # via -r analyse/requirements.in bleach==6.0.0 # via nbconvert certifi==2022.12.7 @@ -32,18 +38,41 @@ certifi==2022.12.7 # minio # pyproj 
# requests + # trafilatura cffi==1.15.1 - # via argon2-cffi-bindings + # via + # argon2-cffi-bindings + # cryptography charset-normalizer==3.0.1 - # via requests + # via + # htmldate + # requests + # trafilatura click==8.1.3 # via black comm==0.1.2 # via ipykernel +constantly==15.1.0 + # via twisted contourpy==1.0.7 # via matplotlib +courlan==0.9.3 + # via trafilatura +cryptography==41.0.3 + # via + # pyopenssl + # scrapy + # service-identity +cssselect==1.2.0 + # via + # parsel + # scrapy cycler==0.11.0 # via matplotlib +dateparser==1.1.8 + # via + # -r analyse/requirements.in + # htmldate debugpy==1.6.6 # via ipykernel decorator==5.1.1 @@ -56,17 +85,27 @@ executing==1.2.0 # via stack-data fastjsonschema==2.16.2 # via nbformat +filelock==3.12.2 + # via tldextract fonttools==4.38.0 # via matplotlib fqdn==1.5.1 # via jsonschema greenlet==2.0.2 # via sqlalchemy +htmldate==1.4.1 + # via trafilatura +hyperlink==21.0.0 + # via twisted idna==3.4 # via # anyio + # hyperlink # jsonschema # requests + # tldextract +incremental==22.10.0 + # via twisted ipykernel==6.21.2 # via # nbclassic @@ -81,15 +120,25 @@ ipython-genutils==0.2.0 # notebook isoduration==20.11.0 # via jsonschema +itemadapter==0.8.0 + # via + # itemloaders + # scrapy +itemloaders==1.1.0 + # via scrapy jedi==0.18.2 # via ipython jinja2==3.1.2 # via - # -r requirements.in + # -r analyse/requirements.in # jupyter-server # nbclassic # nbconvert # notebook +jmespath==1.0.1 + # via + # itemloaders + # parsel jsonpointer==2.3 # via jsonschema jsonschema[format-nongpl]==4.17.3 @@ -123,22 +172,33 @@ jupyter-server-terminals==0.4.4 # via jupyter-server jupyterlab-pygments==0.2.2 # via nbconvert +justext==3.0.0 + # via trafilatura kiwisolver==1.4.4 # via matplotlib +langcodes==3.3.0 + # via courlan +lxml==4.9.3 + # via + # htmldate + # justext + # parsel + # scrapy + # trafilatura markupsafe==2.1.2 # via # jinja2 # nbconvert matplotlib==3.7.0 # via - # -r requirements.in + # -r analyse/requirements.in # seaborn matplotlib-inline==0.1.6 # via # ipykernel # ipython minio==7.1.13 - # via -r requirements.in + # via -r analyse/requirements.in mistune==2.0.5 # via nbconvert mypy-extensions==1.0.0 @@ -165,7 +225,7 @@ nest-asyncio==1.5.6 # nbclassic # notebook notebook==6.5.2 - # via -r requirements.in + # via -r analyse/requirements.in notebook-shim==0.2.2 # via nbclassic numpy==1.24.2 @@ -175,7 +235,7 @@ numpy==1.24.2 # pandas # seaborn openpyxl==3.1.1 - # via -r requirements.in + # via -r analyse/requirements.in packaging==23.0 # via # black @@ -183,12 +243,18 @@ packaging==23.0 # jupyter-server # matplotlib # nbconvert + # parsel + # scrapy pandas==1.5.3 # via - # -r requirements.in + # -r analyse/requirements.in # seaborn pandocfilters==1.5.0 # via nbconvert +parsel==1.8.1 + # via + # itemloaders + # scrapy parso==0.8.3 # via jedi pathspec==0.11.0 @@ -210,10 +276,12 @@ prometheus-client==0.16.0 # notebook prompt-toolkit==3.0.37 # via ipython +protego==0.2.1 + # via scrapy psutil==5.9.4 # via ipykernel psycopg2==2.9.5 - # via -r requirements.in + # via -r analyse/requirements.in ptyprocess==0.7.0 # via # pexpect @@ -221,30 +289,45 @@ ptyprocess==0.7.0 pure-eval==0.2.2 # via stack-data pyairtable==1.4.0 - # via -r requirements.in + # via -r analyse/requirements.in +pyasn1==0.5.0 + # via + # pyasn1-modules + # service-identity +pyasn1-modules==0.3.0 + # via service-identity pycparser==2.21 # via cffi +pydispatcher==2.0.7 + # via scrapy pygments==2.14.0 # via # ipython # nbconvert +pyopenssl==23.2.0 + # via scrapy pyparsing==3.0.9 # via matplotlib 
pyproj==3.4.1 - # via -r requirements.in + # via -r analyse/requirements.in pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via + # arrow + # dateparser + # htmldate # jupyter-client # matplotlib # pandas python-dotenv==0.21.1 - # via -r requirements.in + # via -r analyse/requirements.in python-json-logger==2.0.7 # via jupyter-events pytz==2022.7.1 - # via pandas + # via + # dateparser + # pandas pyyaml==6.0 # via jupyter-events pyzmq==25.0.0 @@ -254,10 +337,18 @@ pyzmq==25.0.0 # jupyter-server # nbclassic # notebook +queuelib==1.6.2 + # via scrapy +regex==2023.8.8 + # via dateparser requests==2.28.2 # via - # -r requirements.in + # -r analyse/requirements.in # pyairtable + # requests-file + # tldextract +requests-file==1.5.1 + # via tldextract rfc3339-validator==0.1.4 # via # jsonschema @@ -266,25 +357,32 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events +scrapy==2.10.0 + # via -r analyse/requirements.in seaborn==0.12.2 - # via -r requirements.in + # via -r analyse/requirements.in send2trash==1.8.0 # via # jupyter-server # nbclassic # notebook +service-identity==23.1.0 + # via scrapy six==1.16.0 # via # asttokens + # automat # bleach + # protego # python-dateutil + # requests-file # rfc3339-validator sniffio==1.3.0 # via anyio soupsieve==2.4 # via beautifulsoup4 sqlalchemy==2.0.4 - # via -r requirements.in + # via -r analyse/requirements.in stack-data==0.6.2 # via ipython terminado==0.17.1 @@ -295,6 +393,10 @@ terminado==0.17.1 # notebook tinycss2==1.2.1 # via nbconvert +tld==0.13 + # via courlan +tldextract==3.4.4 + # via scrapy tokenize-rt==5.0.0 # via black tomli==2.0.1 @@ -307,6 +409,8 @@ tornado==6.2 # nbclassic # notebook # terminado +trafilatura==1.4.1 + # via -r analyse/requirements.in traitlets==5.9.0 # via # comm @@ -322,14 +426,28 @@ traitlets==5.9.0 # nbconvert # nbformat # notebook +twisted==22.10.0 + # via scrapy typing-extensions==4.5.0 - # via sqlalchemy + # via + # sqlalchemy + # twisted +tzlocal==5.0.1 + # via dateparser uri-template==1.2.0 # via jsonschema urllib3==1.26.14 # via + # courlan + # htmldate # minio # requests + # trafilatura +w3lib==2.1.2 + # via + # itemloaders + # parsel + # scrapy wcwidth==0.2.6 # via prompt-toolkit webcolors==1.12 @@ -340,3 +458,10 @@ webencodings==0.5.1 # tinycss2 websocket-client==1.5.1 # via jupyter-server +zope-interface==6.0 + # via + # scrapy + # twisted + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/docker-compose.yml b/docker-compose.yml index 14050c125..8340683a7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -86,6 +86,7 @@ x-airflow-common: AIRFLOW_VAR_SOLIGUIDE_API_TOKEN: ${SOLIGUIDE_API_TOKEN} AIRFLOW_VAR_SOLIGUIDE_API_URL: ${SOLIGUIDE_API_URL} AIRFLOW_VAR_UN_JEUNE_UNE_SOLUTION_API_URL: ${UN_JEUNE_UNE_SOLUTION_API_URL} + AIRFLOW_VAR_RESEAU_ALPHA_URL: ${RESEAU_ALPHA_URL} # make the data_inclusion package available in editable mode PYTHONPATH: $${PYTHONPATH}:/opt/airflow/data-inclusion/src diff --git a/pipeline/dags/dags/settings.py b/pipeline/dags/dags/settings.py index dd2264700..dbabc1218 100644 --- a/pipeline/dags/dags/settings.py +++ b/pipeline/dags/dags/settings.py @@ -211,6 +211,23 @@ }, ], }, + { + "id": "reseau-alpha", + "schedule_interval": "@once", + "snapshot": False, + "streams": [ + { + "id": "structures", + "filename": "structures.tar.gz", + "url": Variable.get("RESEAU_ALPHA_URL", None), + }, + { + "id": "formations", + "filename": "formations.tar.gz", + "url": Variable.get("RESEAU_ALPHA_URL", None), + }, + ], + 
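+        # Both streams deliberately point at the same RESEAU_ALPHA_URL variable:
+        # the stream id ("structures" / "formations") selects the extract and read
+        # functions in import_sources.py. A minimal local setup, assuming a
+        # hypothetical URL value, only needs the env var consumed by docker-compose:
+        #   export RESEAU_ALPHA_URL=https://www.reseau-alpha.org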
}, { "id": "agefiph", "schedule_interval": "@daily", diff --git a/pipeline/dags/import_sources.py b/pipeline/dags/import_sources.py index 036773ac5..1d8e181b0 100644 --- a/pipeline/dags/import_sources.py +++ b/pipeline/dags/import_sources.py @@ -61,6 +61,7 @@ def _extract( grist, mediation_numerique, mes_aides, + reseau_alpha, soliguide, utils, ) @@ -83,10 +84,16 @@ def _extract( "un-jeune-une-solution": utils.extract_http_content, "soliguide": soliguide.extract, "monenfant": utils.extract_http_content, + "reseau-alpha": { + "structures": reseau_alpha.extract_structures, + "formations": reseau_alpha.extract_formations, + }, } if source_config["id"].startswith("mediation-numerique-"): extract_fn = mediation_numerique.extract + elif isinstance(EXTRACT_FN_BY_SOURCE_ID[source_config["id"]], dict): + extract_fn = EXTRACT_FN_BY_SOURCE_ID[source_config["id"]][stream_config["id"]] else: extract_fn = EXTRACT_FN_BY_SOURCE_ID[source_config["id"]] @@ -132,6 +139,7 @@ def _load( agefiph, annuaire_du_service_public, monenfant, + reseau_alpha, soliguide, utils, ) @@ -149,6 +157,10 @@ def _load( "un-jeune-une-solution": utils.read_json, "soliguide": soliguide.read, "monenfant": monenfant.read, + "reseau-alpha": { + "structures": reseau_alpha.read_structures, + "formations": reseau_alpha.read_formations, + }, "agefiph": { "services": agefiph.read, "structures": lambda path: utils.read_csv(path, sep=","), diff --git a/pipeline/dbt/models/_sources.yml b/pipeline/dbt/models/_sources.yml index ffae9f7ef..0a47d78bc 100644 --- a/pipeline/dbt/models/_sources.yml +++ b/pipeline/dbt/models/_sources.yml @@ -474,4 +474,10 @@ sources: schema: agefiph tables: - name: services - - name: structures \ No newline at end of file + - name: structures + + - name: reseau_alpha + schema: reseau_alpha + tables: + - name: structures + - name: formations \ No newline at end of file diff --git a/pipeline/dbt/models/intermediate/int__union_adresses.sql b/pipeline/dbt/models/intermediate/int__union_adresses.sql index 2faf5f999..13d499d00 100644 --- a/pipeline/dbt/models/intermediate/int__union_adresses.sql +++ b/pipeline/dbt/models/intermediate/int__union_adresses.sql @@ -14,6 +14,7 @@ WITH adresses AS ( ref('int_mes_aides__adresses'), ref('int_monenfant__adresses'), ref('int_odspep__adresses'), + ref('int_reseau_alpha__adresses'), ref('int_siao__adresses'), ref('int_soliguide__adresses'), ], diff --git a/pipeline/dbt/models/intermediate/int__union_services.sql b/pipeline/dbt/models/intermediate/int__union_services.sql index 537324960..b7eab9bf9 100644 --- a/pipeline/dbt/models/intermediate/int__union_services.sql +++ b/pipeline/dbt/models/intermediate/int__union_services.sql @@ -9,6 +9,7 @@ WITH services AS ( ref('int_mediation_numerique__services'), ref('int_monenfant__services'), ref('int_odspep__services'), + ref('int_reseau_alpha__services'), ref('int_soliguide__services'), ], column_override={ diff --git a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql index 7ad393e4d..9db5ac58c 100644 --- a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql +++ b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql @@ -10,16 +10,22 @@ adresses AS ( SELECT * FROM {{ ref('int__union_adresses__enhanced') }} ), +departements AS ( + SELECT * FROM {{ source('insee', 'departements') }} +), + -- TODO: Refactoring needed to be able to do geocoding per source and then use the result in the mapping services_with_zone_diffusion AS ( 
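+    -- Sketch of the departement derivation introduced below for reseau-alpha,
+    -- assuming a geocoded citycode such as '94028' (Créteil):
+    --   zone_diffusion_code: LEFT('94028', 2)                          -> '94'
+    --   zone_diffusion_nom:  departements."LIBELLE" WHERE "DEP" = '94' -> 'Val-de-Marne'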
SELECT {{ dbt_utils.star(from=ref('int__union_services'), relation_alias='services', except=["zone_diffusion_code", "zone_diffusion_nom"]) }}, - CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' - WHEN TRUE THEN adresses.result_citycode + CASE + WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' THEN adresses.result_citycode + WHEN services.source = 'reseau-alpha' THEN LEFT(adresses.result_citycode, 2) ELSE services.zone_diffusion_code END AS "zone_diffusion_code", - CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' - WHEN TRUE THEN adresses.commune + CASE + WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' THEN adresses.commune + WHEN services.source = 'reseau-alpha' THEN (SELECT departements."LIBELLE" FROM departements WHERE departements."DEP" = LEFT(adresses.result_citycode, 2)) ELSE services.zone_diffusion_nom END AS "zone_diffusion_nom" FROM diff --git a/pipeline/dbt/models/intermediate/int__union_structures.sql b/pipeline/dbt/models/intermediate/int__union_structures.sql index 4d870dc6b..fe6e264b1 100644 --- a/pipeline/dbt/models/intermediate/int__union_structures.sql +++ b/pipeline/dbt/models/intermediate/int__union_structures.sql @@ -14,6 +14,7 @@ WITH structures AS ( ref('int_mes_aides__structures'), ref('int_monenfant__structures'), ref('int_odspep__structures'), + ref('int_reseau_alpha__structures'), ref('int_siao__structures'), ref('int_soliguide__structures'), ], diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml b/pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml new file mode 100644 index 000000000..3e11c6fe4 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml @@ -0,0 +1,56 @@ +version: 2 + +models: + - name: int_reseau_alpha__adresses + tests: + - check_adresse: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + + - name: int_reseau_alpha__services + tests: + - check_service: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_reseau_alpha__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_reseau_alpha__adresses') + field: id + + - name: int_reseau_alpha__structures + tests: + - check_structure: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_reseau_alpha__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql new file mode 100644 index 000000000..694c1f268 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql @@ -0,0 +1,43 @@ +WITH structures AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__structures') }} +), + +formations AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__formations') }} +), + +structure_adresses AS ( + SELECT + _di_source_id AS "source", + adresses__longitude AS "longitude", + adresses__latitude AS "latitude", + NULL AS "complement_adresse", + adresses__ville AS "commune", + 
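+        -- content__adresse is scraped from the detail page; the staging model keeps
+        -- only the street part, e.g. '18 rue de la Paix 75002 - Paris' -> '18 rue de la Paix'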
content__adresse AS "adresse", + adresses__code_postal AS "code_postal", + NULL AS "code_insee", + 'structure--' || id AS "id" + FROM structures +), + +formation_adresses AS ( + SELECT + _di_source_id AS "source", + adresses__longitude AS "longitude", + adresses__latitude AS "latitude", + NULL AS "complement_adresse", + adresses__ville AS "commune", + content__adresse AS "adresse", + adresses__code_postal AS "code_postal", + NULL AS "code_insee", + 'service--' || id AS "id" + FROM formations +), + +final AS ( + SELECT * FROM structure_adresses + UNION ALL + SELECT * FROM formation_adresses +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql new file mode 100644 index 000000000..1e508b888 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql @@ -0,0 +1,64 @@ +WITH formations AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__formations') }} +), + +structures AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__structures') }} +), + +final AS ( + SELECT + TRUE AS "contact_public", + formations.content__contact_prenom_nom AS "contact_nom_prenom", + formations.content__courriel AS "courriel", + NULL AS "formulaire_en_ligne", + NULL AS "frais_autres", + formations.nom AS "nom", + NULL AS "presentation_resume", + NULL AS "prise_rdv", + formations.content__horaires AS "recurrence", + formations._di_source_id AS "source", + formations.structure_id AS "structure_id", + formations.content__telephone AS "telephone", + NULL AS "zone_diffusion_code", -- FIXME + NULL AS "zone_diffusion_nom", + 'departement' AS "zone_diffusion_type", + TRUE AS "cumulable", + formations.url AS "lien_source", + formations.id AS "id", + formations.content__date_maj AS "date_maj", + NULL AS "modes_orientation_accompagnateur_autres", + NULL AS "modes_orientation_beneficiaire_autres", + ARRAY_TO_STRING( + ARRAY[ + formations.content__contenu_et_objectifs, + formations.content__public_attendu, + formations.content__inscription, + formations.content__informations_pratiques + ], + E'\n\n' + ) AS "presentation_detail", + 'service--' || formations.id AS "adresse_id", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", + CAST(NULL AS DATE) AS "date_suspension", + CAST(NULL AS DATE) AS "date_creation", + ARRAY_REMOVE( + ARRAY[ + 'apprendre-francais--suivre-formation', + CASE WHEN formations.activite = 'Français à visée professionnelle' THEN 'apprendre-francais--accompagnement-insertion-pro' END, + CASE WHEN formations.activite = 'Français à visée sociale et communicative' THEN 'apprendre-francais--communiquer-vie-tous-les-jours' END + ], + NULL + ) AS "thematiques", + ARRAY['en-presentiel'] AS "modes_accueil", + CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", + CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", + CAST(NULL AS TEXT []) AS "profils", + ARRAY['formation'] AS "types", + CAST(NULL AS TEXT []) AS "frais" + FROM formations + LEFT JOIN structures ON formations.structure_id = structures.id +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql new file mode 100644 index 000000000..48faac143 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql @@ -0,0 +1,30 @@ +WITH structures AS ( + SELECT * FROM {{ 
ref('stg_reseau_alpha__structures') }}
+),
+
+final AS (
+    SELECT
+        NULL AS "accessibilite",
+        content__courriel AS "courriel",
+        NULL AS "horaires_ouverture",
+        id AS "id",
+        url AS "lien_source",
+        nom AS "nom",
+        description AS "presentation_detail",
+        NULL AS "presentation_resume",
+        NULL AS "rna",
+        NULL AS "siret",
+        content__site_web AS "site_web",
+        _di_source_id AS "source",
+        content__telephone AS "telephone",
+        NULL AS "typologie",
+        content__date_maj AS "date_maj",
+        'structure--' || id AS "adresse_id",
+        CAST(NULL AS BOOLEAN) AS "antenne",
+        CAST(NULL AS TEXT []) AS "labels_autres",
+        CAST(NULL AS TEXT []) AS "labels_nationaux",
+        CAST(NULL AS TEXT []) AS "thematiques"
+    FROM structures
+)
+
+SELECT * FROM final
diff --git a/pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml b/pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml
new file mode 100644
index 000000000..3aea7ab94
--- /dev/null
+++ b/pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml
@@ -0,0 +1,24 @@
+version: 2
+
+models:
+  - name: stg_reseau_alpha__formations
+    config:
+      tags: reseau_alpha
+    columns:
+      - name: structure_id
+        tests:
+          - not_null
+          - dbt_utils.not_empty_string
+      - name: id
+        tests:
+          - not_null
+          - dbt_utils.not_empty_string
+
+  - name: stg_reseau_alpha__structures
+    config:
+      tags: reseau_alpha
+    columns:
+      - name: id
+        tests:
+          - not_null
+          - dbt_utils.not_empty_string
diff --git a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql
new file mode 100644
index 000000000..760473e37
--- /dev/null
+++ b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql
@@ -0,0 +1,71 @@
+WITH source AS (
+    SELECT * FROM {{ source('reseau_alpha', 'formations') }}
+),
+
+adresses AS (
+    SELECT
+        -- extracted from cartographie.json
+        source.data ->> 'id' AS "formation_id",
+        adresses.data ->> 'ville' AS "adresses__ville",
+        CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude",
+        CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude",
+        adresses.data ->> 'codePostal' AS "adresses__code_postal",
+        TRIM(SUBSTRING(source.data ->> 'content__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__adresse",
+        TRIM(source.data ->> 'content__horaires') AS "content__horaires"
+    FROM
+        source,
+        LATERAL(SELECT * FROM JSONB_PATH_QUERY(source.data, '$.adresses[*]')) AS adresses (data)
+    WHERE
+        -- a minority of formations have more than one address, which is not managed by
+        -- the data·inclusion schema. Skip these addresses.
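+        -- (hypothetical manual check) the skipped rows can be counted with:
+        --   SELECT COUNT(*) FROM reseau_alpha.formations
+        --   WHERE JSONB_ARRAY_LENGTH(data -> 'adresses') > 1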
+ JSONB_ARRAY_LENGTH(source.data -> 'adresses') = 1 +), + +final AS ( + SELECT + source._di_source_id AS "_di_source_id", + adresses.adresses__ville AS "adresses__ville", + adresses.adresses__latitude AS "adresses__latitude", + adresses.adresses__longitude AS "adresses__longitude", + adresses.adresses__code_postal AS "adresses__code_postal", + adresses.content__adresse AS "content__adresse", + adresses.content__horaires AS "content__horaires", + source.data ->> 'id' AS "id", + source.data ->> 'structure_id' AS "structure_id", + source.data ->> 'nom' AS "nom", + source.data ->> 'url' AS "url", + source.data ->> 'activite' AS "activite", + TO_DATE( + SUBSTRING( + ( + CASE + -- TODO: remove this after making fr_FR locale available + WHEN (source.data ->> 'content__date_maj') ~ 'janvier' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'janvier', '01') + WHEN (source.data ->> 'content__date_maj') ~ 'février' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'février', '02') + WHEN (source.data ->> 'content__date_maj') ~ 'mars' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mars', '03') + WHEN (source.data ->> 'content__date_maj') ~ 'avril' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'avril', '04') + WHEN (source.data ->> 'content__date_maj') ~ 'mai' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mai', '05') + WHEN (source.data ->> 'content__date_maj') ~ 'juin' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juin', '06') + WHEN (source.data ->> 'content__date_maj') ~ 'juillet' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juillet', '07') + WHEN (source.data ->> 'content__date_maj') ~ 'août' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'août', '08') + WHEN (source.data ->> 'content__date_maj') ~ 'septembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'septembre', '09') + WHEN (source.data ->> 'content__date_maj') ~ 'octobre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'octobre', '10') + WHEN (source.data ->> 'content__date_maj') ~ 'novembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'novembre', '11') + WHEN (source.data ->> 'content__date_maj') ~ 'décembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'décembre', '12') + END + ) FROM 'Date de la dernière modification : (.*)' + ), + 'DD MM YYYY' + ) AS "content__date_maj", + TRIM(source.data ->> 'content__contenu_et_objectifs') AS "content__contenu_et_objectifs", + TRIM(source.data ->> 'content__public_attendu') AS "content__public_attendu", + TRIM(source.data ->> 'content__inscription') AS "content__inscription", + TRIM(source.data ->> 'content__contact_prenom_nom') AS "content__contact_prenom_nom", + TRIM(source.data ->> 'content__telephone') AS "content__telephone", + TRIM(source.data ->> 'content__courriel') AS "content__courriel", + TRIM(source.data ->> 'content__informations_pratiques') AS "content__informations_pratiques" + FROM source + LEFT JOIN adresses ON source.data ->> 'id' = adresses.formation_id +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql new file mode 100644 index 000000000..f325033b8 --- /dev/null +++ b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql @@ -0,0 +1,67 @@ +WITH source AS ( + SELECT * FROM {{ source('reseau_alpha', 'structures') }} +), + +adresses AS ( + SELECT + -- extracted from cartographie.json + source.data ->> 
'id' AS "structure_id", + adresses.data ->> 'ville' AS "adresses__ville", + CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", + CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", + adresses.data ->> 'codePostal' AS "adresses__code_postal", + TRIM(SUBSTRING(source.data ->> 'content__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__adresse" + FROM + source, + LATERAL(SELECT * FROM JSONB_PATH_QUERY(source.data, '$.adresses[*]')) AS adresses (data) + WHERE + -- a minority of structures have more than one addresses, which is not managed by + -- the data·inclusion schema. Skip these addresses. + JSONB_ARRAY_LENGTH(source.data -> 'adresses') = 1 +), + +final AS ( + SELECT + source._di_source_id AS "_di_source_id", + adresses.adresses__ville AS "adresses__ville", + adresses.adresses__latitude AS "adresses__latitude", + adresses.adresses__longitude AS "adresses__longitude", + adresses.adresses__code_postal AS "adresses__code_postal", + adresses.content__adresse AS "content__adresse", + CAST(ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(source.data -> 'activitesFormation')) AS TEXT []) AS "activites_formation", + source.data ->> 'id' AS "id", + source.data ->> 'nom' AS "nom", + source.data ->> 'url' AS "url", + source.data ->> 'logo' AS "logo", + source.data ->> 'type' AS "type", + source.data ->> 'description' AS "description", + TO_DATE( + SUBSTRING( + ( + CASE + -- TODO: remove this after making fr_FR locale available + WHEN (source.data ->> 'content__date_maj') ~ 'janvier' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'janvier', '01') + WHEN (source.data ->> 'content__date_maj') ~ 'février' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'février', '02') + WHEN (source.data ->> 'content__date_maj') ~ 'mars' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mars', '03') + WHEN (source.data ->> 'content__date_maj') ~ 'avril' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'avril', '04') + WHEN (source.data ->> 'content__date_maj') ~ 'mai' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mai', '05') + WHEN (source.data ->> 'content__date_maj') ~ 'juin' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juin', '06') + WHEN (source.data ->> 'content__date_maj') ~ 'juillet' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juillet', '07') + WHEN (source.data ->> 'content__date_maj') ~ 'août' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'août', '08') + WHEN (source.data ->> 'content__date_maj') ~ 'septembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'septembre', '09') + WHEN (source.data ->> 'content__date_maj') ~ 'octobre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'octobre', '10') + WHEN (source.data ->> 'content__date_maj') ~ 'novembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'novembre', '11') + WHEN (source.data ->> 'content__date_maj') ~ 'décembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'décembre', '12') + END + ) FROM 'Date de la dernière modification : (.*)' + ), + 'DD MM YYYY' + ) AS "content__date_maj", + TRIM(source.data ->> 'content__telephone') AS "content__telephone", + TRIM(source.data ->> 'content__courriel') AS "content__courriel", + TRIM(source.data ->> 'content__site_web') AS "content__site_web" + FROM source + LEFT JOIN adresses ON source.data ->> 'id' = adresses.structure_id +) + +SELECT * FROM final diff --git a/pipeline/requirements/dev/requirements.txt b/pipeline/requirements/dev/requirements.txt index 
bc03dd358..17780236b 100644 --- a/pipeline/requirements/dev/requirements.txt +++ b/pipeline/requirements/dev/requirements.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=requirements/dev/requirements.txt --resolver=backtracking requirements/dev/requirements.in +# pip-compile requirements/dev/requirements.in # aiohttp==3.8.5 # via apache-airflow-providers-http aiosignal==1.3.1 # via aiohttp -alembic==1.11.3 +alembic==1.12.0 # via apache-airflow anyio==4.0.0 # via httpcore @@ -76,17 +76,19 @@ backoff==2.2.1 backports-datetime-fromisoformat==2.0.0 # via htmldate beautifulsoup4==4.12.2 - # via redshift-connector + # via + # -r requirements/dev/../tasks/python/requirements.in + # redshift-connector black==23.7.0 # via -r requirements/dev/requirements.in blinker==1.6.2 # via apache-airflow -boto3==1.28.38 +boto3==1.28.40 # via # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.31.38 +botocore==1.31.40 # via # apache-airflow-providers-amazon # boto3 @@ -298,7 +300,7 @@ langcodes==3.3.0 # via courlan lazy-object-proxy==1.9.0 # via apache-airflow -limits==3.5.0 +limits==3.6.0 # via flask-limiter linkify-it-py==2.0.2 # via apache-airflow @@ -425,7 +427,7 @@ pluggy==1.3.0 # tox ply==3.11 # via jsonpath-ng -pre-commit==3.3.3 +pre-commit==3.4.0 # via -r requirements/dev/requirements.in prison==0.2.1 # via flask-appbuilder @@ -462,7 +464,7 @@ pyproj==3.6.0 # via geopandas pyproject-api==1.6.1 # via tox -pytest==7.4.0 +pytest==7.4.1 # via -r requirements/dev/requirements.in python-daemon==3.0.1 # via apache-airflow @@ -482,7 +484,7 @@ python-slugify==8.0.1 # via # apache-airflow # python-nvd3 -pytz==2023.3 +pytz==2023.3.post1 # via # dateparser # flask-babel @@ -524,7 +526,7 @@ rich==13.5.2 # rich-argparse rich-argparse==1.3.0 # via apache-airflow -rpds-py==0.10.0 +rpds-py==0.10.2 # via # jsonschema # referencing @@ -548,7 +550,7 @@ sniffio==1.3.0 # anyio # httpcore # httpx -soupsieve==2.4.1 +soupsieve==2.5 # via beautifulsoup4 sqlalchemy==1.4.49 # via @@ -588,7 +590,7 @@ tomli==2.0.1 # pyproject-api # pytest # tox -tox==4.11.0 +tox==4.11.1 # via -r requirements/dev/requirements.in tqdm==4.66.1 # via -r requirements/dev/../tasks/python/requirements.in diff --git a/pipeline/requirements/tasks/python/requirements.in b/pipeline/requirements/tasks/python/requirements.in index 61c068304..9a40fe714 100644 --- a/pipeline/requirements/tasks/python/requirements.in +++ b/pipeline/requirements/tasks/python/requirements.in @@ -1,6 +1,7 @@ apache-airflow-providers-postgres apache-airflow-providers-amazon +beautifulsoup4~=4.12.2 GeoAlchemy2 geopandas~=0.13.2 openpyxl~=3.1.2 diff --git a/pipeline/requirements/tasks/python/requirements.txt b/pipeline/requirements/tasks/python/requirements.txt index 32550f884..56da60a45 100644 --- a/pipeline/requirements/tasks/python/requirements.txt +++ b/pipeline/requirements/tasks/python/requirements.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=requirements/tasks/python/requirements.txt --resolver=backtracking requirements/tasks/python/requirements.in +# pip-compile requirements/tasks/python/requirements.in # aiohttp==3.8.5 # via apache-airflow-providers-http aiosignal==1.3.1 # via aiohttp -alembic==1.11.3 +alembic==1.12.0 # via apache-airflow anyio==4.0.0 # via httpcore @@ -71,15 +71,17 @@ backoff==2.2.1 backports-datetime-fromisoformat==2.0.0 # via htmldate 
beautifulsoup4==4.12.2
-    # via redshift-connector
+    # via
+    #   -r requirements/tasks/python/requirements.in
+    #   redshift-connector
 blinker==1.6.2
     # via apache-airflow
-boto3==1.28.38
+boto3==1.28.40
     # via
     #   apache-airflow-providers-amazon
     #   redshift-connector
     #   watchtower
-botocore==1.31.38
+botocore==1.31.40
     # via
     #   apache-airflow-providers-amazon
     #   boto3
@@ -271,7 +273,7 @@ langcodes==3.3.0
     # via courlan
 lazy-object-proxy==1.9.0
     # via apache-airflow
-limits==3.5.0
+limits==3.6.0
     # via flask-limiter
 linkify-it-py==2.0.2
     # via apache-airflow
@@ -429,7 +431,7 @@ python-slugify==8.0.1
     # via
     #   apache-airflow
     #   python-nvd3
-pytz==2023.3
+pytz==2023.3.post1
     # via
     #   dateparser
     #   flask-babel
@@ -470,7 +472,7 @@ rich==13.5.2
     #   rich-argparse
 rich-argparse==1.3.0
     # via apache-airflow
-rpds-py==0.10.0
+rpds-py==0.10.2
     # via
     #   jsonschema
     #   referencing
@@ -494,7 +496,7 @@ sniffio==1.3.0
     #   anyio
     #   httpcore
     #   httpx
-soupsieve==2.4.1
+soupsieve==2.5
     # via beautifulsoup4
 sqlalchemy==1.4.49
     # via
diff --git a/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py
new file mode 100644
index 000000000..effbdeb87
--- /dev/null
+++ b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py
@@ -0,0 +1,205 @@
+import io
+import json
+import logging
+import tarfile
+import time
+from pathlib import Path
+from typing import Optional
+
+import bs4
+import numpy as np
+import pandas as pd
+import requests
+import trafilatura
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+def log_and_raise(resp: requests.Response, *args, **kwargs):
+    try:
+        resp.raise_for_status()
+    except requests.HTTPError as err:
+        logger.error(resp.json())
+        raise err
+
+
+def extract_structures(url: str, **kwargs) -> bytes:
+    # strip a potential trailing slash before appending paths below
+    url = url.rstrip("/")
+
+    session = requests.Session()
+    session.hooks["response"] = [log_and_raise]
+
+    response = session.get(url + "/cartographie.json")
+    data = response.json()
+
+    structures_df = pd.DataFrame.from_records(data["structures"])
+
+    with io.BytesIO() as out_buf:
+        with tarfile.open(fileobj=out_buf, mode="w:gz") as tar:
+            with io.BytesIO(response.content) as buf:
+                tar_info = tarfile.TarInfo("metadata.json")
+                tar_info.size = len(response.content)
+                tar.addfile(tar_info, buf)
+
+            for _, row in tqdm(structures_df.iterrows()):
+                response = session.get(row.url)
+
+                with io.BytesIO(response.content) as buf:
+                    tar_info = tarfile.TarInfo(f"{row.id}.html")
+                    tar_info.size = len(response.content)
+                    tar.addfile(tar_info, buf)
+
+                time.sleep(0.1)
+        return out_buf.getvalue()
+
+
+def extract_formations(url: str, **kwargs) -> bytes:
+    # strip a potential trailing slash before appending paths below
+    url = url.rstrip("/")
+
+    session = requests.Session()
+    session.hooks["response"] = [log_and_raise]
+
+    response = session.get(url + "/cartographie.json")
+    data = response.json()
+
+    formations_df = pd.json_normalize(
+        data["structures"],
+        record_path="formations",
+        meta="id",
+        meta_prefix="structure_",
+        max_level=0,
+    )
+
+    with io.BytesIO() as out_buf:
+        with tarfile.open(fileobj=out_buf, mode="w:gz") as tar:
+            with io.BytesIO(response.content) as buf:
+                tar_info = tarfile.TarInfo("metadata.json")
+                tar_info.size = len(response.content)
+                tar.addfile(tar_info, buf)
+
+            for _, row in tqdm(formations_df.iterrows()):
+                response = session.get(row.url)
+
+                with io.BytesIO(response.content) as buf:
+                    tar_info = tarfile.TarInfo(f"{row.id}.html")
+                    tar_info.size = len(response.content)
+                    tar.addfile(tar_info, buf)
+
+                time.sleep(0.1)
+        return out_buf.getvalue()
+
+
+def scrap_structure_html(html_path: 
Path) -> dict:
+    with html_path.open() as f:
+        soup = bs4.BeautifulSoup(f, features="lxml")
+    data = {}
+
+    NODE_BY_CONTENT_NAME = {
+        "adresse": soup.select_one(".adresse"),
+        "date_maj": soup.find(class_="structures-dates").find(
+            string=lambda text: "Date de la dernière modification :" in text
+        ),
+        "telephone": soup.select_one(".telephone > a"),
+        # note: only http:// prefixed links are matched here
+        "site_web": soup.select_one(".contact-content").find(
+            string=lambda t: t.startswith("http://")
+        ),
+        "courriel": soup.select_one(".email > a:nth-child(1)"),
+    }
+
+    for content_name, node in NODE_BY_CONTENT_NAME.items():
+        data[f"content__{content_name}"] = html_to_markdown(node)
+
+    return data
+
+
+def scrap_formation_html(html_path: Path) -> dict:
+    with html_path.open() as f:
+        soup = bs4.BeautifulSoup(f, features="lxml")
+    data = {}
+
+    NODE_BY_CONTENT_NAME = {
+        "contenu_et_objectifs": soup.select_one(
+            "div.container:nth-child(2) > div:nth-child(2)"
+            " > div:nth-child(1) > div:nth-child(1)"
+        ),
+        "date_maj": soup.select_one(".entete").find(
+            string=lambda text: "Date de la dernière modification :" in text
+        ),
+        "public_attendu": soup.select_one(
+            "div.container:nth-child(2) > div:nth-child(2)"
+            " > div:nth-child(1) > div:nth-child(2)"
+        ),
+        "inscription": soup.select_one(
+            "div.col-lg-6:nth-child(2) > div:nth-child(1)"
+        ),
+        "contact_prenom_nom": soup.select_one(
+            "#formation-inscription > div:nth-child(2) > div:nth-child(2)"
+        ),
+        "telephone": soup.select_one("#formation-inscription > div:nth-child(3)"),
+        "courriel": soup.select_one(".email > a:nth-child(1)"),
+        "informations_pratiques": soup.select_one(
+            "div.col-lg-6:nth-child(2) > div:nth-child(3)"
+        ),
+        "adresse": soup.select_one(".col-sm-9 > div:nth-child(2)"),
+        "horaires": "".join(
+            soup.select_one(".col-sm-9").find_all(
+                string=lambda text: "de" in text and "à" in text
+            )
+        ),
+    }
+
+    for content_name, node in NODE_BY_CONTENT_NAME.items():
+        data[f"content__{content_name}"] = html_to_markdown(node)
+
+    return data
+
+
+def html_to_markdown(s) -> Optional[str]:
+    if s is None or s == "":
+        return s
+    # wrap the fragment in a minimal document so trafilatura can parse it
+    return trafilatura.extract(trafilatura.load_html("<html>" + str(s) + "</html>"))
+
+
+def read_structures(path: Path) -> pd.DataFrame:
+    with tarfile.open(path, "r:gz") as tar:
+        tar.extractall(path=path.parent)
+
+    with (path.parent / "metadata.json").open() as f:
+        df = pd.DataFrame.from_records(json.load(f)["structures"])
+
+    df = df.join(
+        df.apply(
+            lambda row: scrap_structure_html(html_path=path.parent / f"{row.id}.html"),
+            axis=1,
+            result_type="expand",
+        )
+    )
+    df = df.replace({np.nan: None})
+
+    return df
+
+
+def read_formations(path: Path) -> pd.DataFrame:
+    with tarfile.open(path, "r:gz") as tar:
+        tar.extractall(path=path.parent)
+
+    with (path.parent / "metadata.json").open() as f:
+        df = pd.json_normalize(
+            json.load(f)["structures"],
+            record_path="formations",
+            meta="id",
+            meta_prefix="structure_",
+            max_level=0,
+        )
+
+    df = df.join(
+        df.apply(
+            lambda row: scrap_formation_html(html_path=path.parent / f"{row.id}.html"),
+            axis=1,
+            result_type="expand",
+        )
+    )
+    df = df.replace({np.nan: None})
+
+    return df

From 5f0faff699955c002ff566cc1546c3981f7684f4 Mon Sep 17 00:00:00 2001
From: Valentin Matton
Date: Wed, 27 Sep 2023 17:27:12 +0200
Subject: [PATCH 34/34] feat(reseau-alpha): update

---
 .../int_reseau_alpha__adresses.sql            |  18 +--
 .../int_reseau_alpha__services.sql            | 106 +++++++++-----
 .../stg_reseau_alpha__formations.sql          |  66 +++++----
 .../scripts/tasks/reseau_alpha.py             | 131 +++++++++++++++---
 4 files changed, 229 insertions(+), 92 
deletions(-) diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql index 694c1f268..4bae773a6 100644 --- a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql @@ -22,15 +22,15 @@ structure_adresses AS ( formation_adresses AS ( SELECT - _di_source_id AS "source", - adresses__longitude AS "longitude", - adresses__latitude AS "latitude", - NULL AS "complement_adresse", - adresses__ville AS "commune", - content__adresse AS "adresse", - adresses__code_postal AS "code_postal", - NULL AS "code_insee", - 'service--' || id AS "id" + _di_source_id AS "source", + adresses__longitude AS "longitude", + adresses__latitude AS "latitude", + NULL AS "complement_adresse", + adresses__ville AS "commune", + content__lieux_et_horaires_formation__adresse AS "adresse", + adresses__code_postal AS "code_postal", + NULL AS "code_insee", + 'service--' || id AS "id" FROM formations ), diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql index 1e508b888..e94a2b76a 100644 --- a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql @@ -8,41 +8,59 @@ structures AS ( final AS ( SELECT - TRUE AS "contact_public", - formations.content__contact_prenom_nom AS "contact_nom_prenom", - formations.content__courriel AS "courriel", - NULL AS "formulaire_en_ligne", - NULL AS "frais_autres", - formations.nom AS "nom", - NULL AS "presentation_resume", - NULL AS "prise_rdv", - formations.content__horaires AS "recurrence", - formations._di_source_id AS "source", - formations.structure_id AS "structure_id", - formations.content__telephone AS "telephone", - NULL AS "zone_diffusion_code", -- FIXME - NULL AS "zone_diffusion_nom", - 'departement' AS "zone_diffusion_type", - TRUE AS "cumulable", - formations.url AS "lien_source", - formations.id AS "id", - formations.content__date_maj AS "date_maj", - NULL AS "modes_orientation_accompagnateur_autres", - NULL AS "modes_orientation_beneficiaire_autres", + TRUE AS "contact_public", + formations.content__contact_inscription__contact AS "contact_nom_prenom", + formations.content__contact_inscription__courriel AS "courriel", + formations.content__inscription__informations_en_ligne AS "formulaire_en_ligne", + NULL AS "frais_autres", + formations.nom AS "nom", + NULL AS "prise_rdv", + formations.content__lieux_et_horaires_formation__horaires AS "recurrence", + formations._di_source_id AS "source", + formations.structure_id AS "structure_id", + formations.content__contact_inscription__telephone AS "telephone", + NULL AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", -- FIXME + 'departement' AS "zone_diffusion_type", + TRUE AS "cumulable", + formations.url AS "lien_source", + formations.id AS "id", + formations.content__date_maj AS "date_maj", + NULL AS "modes_orientation_accompagnateur_autres", + NULL AS "modes_orientation_beneficiaire_autres", + CASE + WHEN LENGTH(formations.content__contenu_et_objectifs__titre) <= 280 + THEN formations.content__contenu_et_objectifs__titre + ELSE LEFT(formations.content__contenu_et_objectifs__titre, 279) || '…' + END AS "presentation_resume", ARRAY_TO_STRING( ARRAY[ - formations.content__contenu_et_objectifs, - 
formations.content__public_attendu, - formations.content__inscription, - formations.content__informations_pratiques + '# Contenu et objectifs de la formation', + formations.content__contenu_et_objectifs__titre, + formations.content__contenu_et_objectifs__objectifs, + formations.content__contenu_et_objectifs__niveau, + '# Public attendu', + formations.content__public_attendu__niveau, + formations.content__public_attendu__competences, + formations.content__public_attendu__type_de_public, + '# Inscription', + formations.content__inscription__places, + formations.content__inscription__entree_sortie, + '# Informations pratiques', + formations.content__informations_pratiques__etendue, + formations.content__informations_pratiques__volume, + formations.content__informations_pratiques__cout, + formations.content__informations_pratiques__prise_en_charge, + formations.content__informations_pratiques__remuneration, + formations.content__informations_pratiques__garde ], E'\n\n' - ) AS "presentation_detail", - 'service--' || formations.id AS "adresse_id", - CAST(NULL AS TEXT []) AS "justificatifs", - CAST(NULL AS TEXT []) AS "pre_requis", - CAST(NULL AS DATE) AS "date_suspension", - CAST(NULL AS DATE) AS "date_creation", + ) AS "presentation_detail", + 'service--' || formations.id AS "adresse_id", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", + CAST(NULL AS DATE) AS "date_suspension", + CAST(NULL AS DATE) AS "date_creation", ARRAY_REMOVE( ARRAY[ 'apprendre-francais--suivre-formation', @@ -50,13 +68,25 @@ final AS ( CASE WHEN formations.activite = 'Français à visée sociale et communicative' THEN 'apprendre-francais--communiquer-vie-tous-les-jours' END ], NULL - ) AS "thematiques", - ARRAY['en-presentiel'] AS "modes_accueil", - CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", - CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", - CAST(NULL AS TEXT []) AS "profils", - ARRAY['formation'] AS "types", - CAST(NULL AS TEXT []) AS "frais" + ) AS "thematiques", + ARRAY['en-presentiel'] AS "modes_accueil", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN formations.content__contact_inscription__courriel IS NOT NULL THEN 'envoyer-un-mail' END, + CASE WHEN formations.content__contact_inscription__telephone IS NOT NULL THEN 'telephoner' END + ], + NULL + ) AS "modes_orientation_accompagnateur", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN formations.content__contact_inscription__courriel IS NOT NULL THEN 'envoyer-un-mail' END, + CASE WHEN formations.content__contact_inscription__telephone IS NOT NULL THEN 'telephoner' END + ], + NULL + ) AS "modes_orientation_beneficiaire", + ARRAY['public-langues-etrangeres'] AS "profils", + ARRAY['formation'] AS "types", + CAST(NULL AS TEXT []) AS "frais" FROM formations LEFT JOIN structures ON formations.structure_id = structures.id ) diff --git a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql index 760473e37..de753cc82 100644 --- a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql +++ b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql @@ -5,13 +5,13 @@ WITH source AS ( adresses AS ( SELECT -- extracted from cartographie.json - source.data ->> 'id' AS "formation_id", - adresses.data ->> 'ville' AS "adresses__ville", - CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", - CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", - adresses.data ->> 'codePostal' AS 
"adresses__code_postal", - TRIM(SUBSTRING(source.data ->> 'content__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__adresse", - TRIM(source.data ->> 'content__horaires') AS "content__horaires" + source.data ->> 'id' AS "formation_id", + adresses.data ->> 'ville' AS "adresses__ville", + CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", + CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", + adresses.data ->> 'codePostal' AS "adresses__code_postal", + TRIM(SUBSTRING(source.data ->> 'content__lieux_et_horaires_formation__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__lieux_et_horaires_formation__adresse", + TRIM(source.data ->> 'content__lieux_et_horaires_formation__horaires') AS "content__lieux_et_horaires_formation__horaires" FROM source, LATERAL(SELECT * FROM JSONB_PATH_QUERY(source.data, '$.adresses[*]')) AS adresses (data) @@ -23,18 +23,18 @@ adresses AS ( final AS ( SELECT - source._di_source_id AS "_di_source_id", - adresses.adresses__ville AS "adresses__ville", - adresses.adresses__latitude AS "adresses__latitude", - adresses.adresses__longitude AS "adresses__longitude", - adresses.adresses__code_postal AS "adresses__code_postal", - adresses.content__adresse AS "content__adresse", - adresses.content__horaires AS "content__horaires", - source.data ->> 'id' AS "id", - source.data ->> 'structure_id' AS "structure_id", - source.data ->> 'nom' AS "nom", - source.data ->> 'url' AS "url", - source.data ->> 'activite' AS "activite", + source._di_source_id AS "_di_source_id", + adresses.adresses__ville AS "adresses__ville", + adresses.adresses__latitude AS "adresses__latitude", + adresses.adresses__longitude AS "adresses__longitude", + adresses.adresses__code_postal AS "adresses__code_postal", + adresses.content__lieux_et_horaires_formation__adresse AS "content__lieux_et_horaires_formation__adresse", + adresses.content__lieux_et_horaires_formation__horaires AS "content__lieux_et_horaires_formation__horaires", + source.data ->> 'id' AS "id", + source.data ->> 'structure_id' AS "structure_id", + source.data ->> 'nom' AS "nom", + source.data ->> 'url' AS "url", + source.data ->> 'activite' AS "activite", TO_DATE( SUBSTRING( ( @@ -56,14 +56,26 @@ final AS ( ) FROM 'Date de la dernière modification : (.*)' ), 'DD MM YYYY' - ) AS "content__date_maj", - TRIM(source.data ->> 'content__contenu_et_objectifs') AS "content__contenu_et_objectifs", - TRIM(source.data ->> 'content__public_attendu') AS "content__public_attendu", - TRIM(source.data ->> 'content__inscription') AS "content__inscription", - TRIM(source.data ->> 'content__contact_prenom_nom') AS "content__contact_prenom_nom", - TRIM(source.data ->> 'content__telephone') AS "content__telephone", - TRIM(source.data ->> 'content__courriel') AS "content__courriel", - TRIM(source.data ->> 'content__informations_pratiques') AS "content__informations_pratiques" + ) AS "content__date_maj", + TRIM(source.data ->> 'content__contenu_et_objectifs__titre') AS "content__contenu_et_objectifs__titre", + TRIM(source.data ->> 'content__contenu_et_objectifs__objectifs') AS "content__contenu_et_objectifs__objectifs", + TRIM(source.data ->> 'content__contenu_et_objectifs__niveau') AS "content__contenu_et_objectifs__niveau", + TRIM(source.data ->> 'content__public_attendu__niveau') AS "content__public_attendu__niveau", + TRIM(source.data ->> 'content__public_attendu__competences') AS "content__public_attendu__competences", + TRIM(source.data ->> 'content__public_attendu__type_de_public') AS 
"content__public_attendu__type_de_public", + TRIM(source.data ->> 'content__inscription__informations_en_ligne') AS "content__inscription__informations_en_ligne", + TRIM(source.data ->> 'content__inscription__places') AS "content__inscription__places", + TRIM(source.data ->> 'content__inscription__entree_sortie') AS "content__inscription__entree_sortie", + TRIM(source.data ->> 'content__contact_inscription__adresse') AS "content__contact_inscription__adresse", + TRIM(source.data ->> 'content__contact_inscription__contact') AS "content__contact_inscription__contact", + TRIM(source.data ->> 'content__contact_inscription__telephone') AS "content__contact_inscription__telephone", + TRIM(source.data ->> 'content__contact_inscription__courriel') AS "content__contact_inscription__courriel", + TRIM(source.data ->> 'content__informations_pratiques__etendue') AS "content__informations_pratiques__etendue", + TRIM(source.data ->> 'content__informations_pratiques__volume') AS "content__informations_pratiques__volume", + TRIM(source.data ->> 'content__informations_pratiques__cout') AS "content__informations_pratiques__cout", + TRIM(source.data ->> 'content__informations_pratiques__prise_en_charge') AS "content__informations_pratiques__prise_en_charge", + TRIM(source.data ->> 'content__informations_pratiques__remuneration') AS "content__informations_pratiques__remuneration", + TRIM(source.data ->> 'content__informations_pratiques__garde') AS "content__informations_pratiques__garde" FROM source LEFT JOIN adresses ON source.data ->> 'id' = adresses.formation_id ) diff --git a/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py index effbdeb87..672c702bf 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py +++ b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py @@ -114,36 +114,131 @@ def scrap_structure_html(html_path: Path) -> dict: def scrap_formation_html(html_path: Path) -> dict: + def get_parent(node): + return node.parent if node is not None else None + with html_path.open() as f: soup = bs4.BeautifulSoup(f, features="lxml") data = {} + contenu_et_objectifs_selector = ( + ".container > .row:nth-child(2) > div:nth-child(1) > .row:nth-child(1)" + ) + public_attendu_selector = ( + ".container > .row:nth-child(2) > div:nth-child(1) > .row:nth-child(2)" + ) + inscription_selector = ( + ".container > .row:nth-child(2) > div:nth-child(2) > .row:nth-child(1)" + ) + informations_pratiques_selector = ( + ".container > .row:nth-child(2) > div:nth-child(2) > .row:nth-child(3)" + ) + NODE_BY_CONTENT_NAME = { - "contenu_et_objectifs": soup.select_one( - "div.container:nth-child(2) > div:nth-child(2)" - " > div:nth-child(1) > div:nth-child(1)" - ), "date_maj": soup.select_one(".entete").find( string=lambda text: "Date de la dernière modification :" in text ), - "public_attendu": soup.select_one( - "div.container:nth-child(2) > div:nth-child(2)" - " > div:nth-child(1) > div:nth-child(2)" + "contenu_et_objectifs__titre": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(2)" ), - "inscription": soup.select_one( - "div.col-lg-6:nth-child(2) > div:nth-child(1)" + "contenu_et_objectifs__objectifs": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(3)" ), - "contact_prenom_nom": soup.select_one( - "#formation-inscription > div:nth-child(2) > div:nth-child(2)" + "contenu_et_objectifs__niveau": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(4)" ), - "telephone": 
soup.select_one("#formation-inscription > div:nth-child(3)"), - "courriel": soup.select_one(".email > a:nth-child(1)"), - "informations_pratiques": soup.select_one( - "div.col-lg-6:nth-child(2) > div:nth-child(3)" + "public_attendu__niveau": soup.select_one( + f"{public_attendu_selector} > div:nth-child(2)" + ), + "public_attendu__competences": soup.select_one( + f"{public_attendu_selector} > div:nth-child(3)" + ), + "public_attendu__type_de_public": soup.select_one( + f"{public_attendu_selector} > div:nth-child(4)" + ), + "inscription__informations_en_ligne": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Informations en ligne" in text + ) + ) + ), + "inscription__places": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Places disponibles" in text + ) + ) + ), + "inscription__entree_sortie": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Entrée / sortie permanente" in text + ) + ) + ), + "contact_inscription__adresse": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-home")) + ), + "contact_inscription__contact": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-user")) + ), + "contact_inscription__telephone": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-phone")) + ), + "contact_inscription__courriel": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-inbox")) + ), + "informations_pratiques__etendue": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Étendue de la formation" in text + ) + ) + ), + "informations_pratiques__volume": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Volume horaire" in text + ) + ) + ), + "informations_pratiques__cout": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: ( + "Adhésion annuelle à la structure obligatoire" + ) + in text + ) + ) + ), + "informations_pratiques__prise_en_charge": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Coût d'inscription à la formation" in text + ) + ) + ), + "informations_pratiques__remuneration": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Rémunération" in text + ) + ) + ), + "informations_pratiques__garde": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Garde d'enfant" in text + ) + ) + ), + "lieux_et_horaires_formation__adresse": soup.select_one( + "#lieux-formation .lieu-formation .adresse" ), - "adresse": soup.select_one(".col-sm-9 > div:nth-child(2)"), - "horaires": "".join( - soup.select_one(".col-sm-9").find_all( + "lieux_et_horaires_formation__horaires": "\n".join( + soup.select_one("#lieux-formation").find_all( string=lambda text: "de" in text and "à" in text ) ),