diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2318067c4..a92b914cb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: # api - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.7.0 hooks: - id: black name: api|black @@ -41,7 +41,7 @@ repos: # pipeline - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 23.7.0 hooks: - id: black name: pipeline|black @@ -105,7 +105,7 @@ repos: files: ^siretisation exclude: ^siretisation/django(/.*)*/static/vendor - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 23.7.0 hooks: - id: black name: siretisation|black diff --git a/.template.env b/.template.env index f4fdc1c39..9bd28547d 100644 --- a/.template.env +++ b/.template.env @@ -35,12 +35,15 @@ ANNUAIRE_ENTREPRISES_API_URL=https://recherche-entreprises.api.gouv.fr ### sources ### +# airflow connections string *must* be urlencoded (using `urllib.parse.urlencode` for instance) + AGEFIPH_SERVICES_API_URL=https://www.agefiph.fr/jsonapi/node/aide_service AGEFIPH_STRUCTURES_FILE_URL= AIRFLOW_CONN_S3_SOURCES= BAN_API_URL=https://api-adresse.data.gouv.fr CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/8b781e9d-e11d-486c-98cf-0f63abfae8ed/download/annuaire_sociale_fixe.csv -CD72_FILE_URL= +CD72_STRUCTURES_FILE_URL=https://grist.incubateur.net/o/datainclusion/api/docs/dFpXXzs2fug9Kb7zZhyWyn/download/csv?tableId=Structures +CD72_SERVICES_FILE_URL=https://grist.incubateur.net/o/datainclusion/api/docs/dFpXXzs2fug9Kb7zZhyWyn/download/csv?tableId=Services DI_EXTRA_SERVICES_FILE_URL=https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/data-inclusion/2023-08-16/services.json DI_EXTRA_STRUCTURES_FILE_URL=https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/data-inclusion/2023-08-16/structures.json DORA_API_TOKEN= @@ -49,6 +52,7 @@ EMPLOIS_API_TOKEN= EMPLOIS_API_URL=https://emplois.inclusion.beta.gouv.fr/api/v1/structures/ 
ETAB_PUB_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/73302880-e4df-4d4c-8676-1a61bb997f3d FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c894fcfdfd45 +GRIST_API_TOKEN= IGN_ADMIN_EXPRESS_FILE_URL=http://files.opendatarchives.fr/professionnels.ign.fr/adminexpress/ADMIN-EXPRESS-COG_3-0__SHP__FRA_WM_2021-05-19.7z IMMERSION_FACILITEE_S3_KEY_PREFIX=sources/immersion-facilitee/2023-03-06/after-siretisation-auto/ INSEE_FIRSTNAME_FILE_URL=https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip @@ -85,6 +89,7 @@ MES_AIDES_AIRTABLE_KEY= MES_AIDES_GARAGES_URL=https://airtable.com/appEvva5gyqqoQRnr/tblnGf4Y5EUEeVHtJ/viw9ZZAUkexq6uDaI MONENFANT_CRECHES_FILE_URL= ODSPEP_S3_KEY_PREFIX=sources/odspep/2023-01-23/denormalized/Exports/ +RESEAU_ALPHA_URL=https://www.reseau-alpha.org SIAO_FILE_URL= SIRENE_STOCK_ETAB_GEOCODE_FILE_URL=https://data.cquest.org/geo_sirene/v2019/last/StockEtablissementActif_utf8_geo.csv.gz SIRENE_STOCK_ETAB_HIST_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/88fbb6b4-0320-443e-b739-b4376a012c32 diff --git a/.vscode/data-inclusion.code-workspace b/.vscode/data-inclusion.code-workspace index a4bb88cfa..5c16ac9ee 100644 --- a/.vscode/data-inclusion.code-workspace +++ b/.vscode/data-inclusion.code-workspace @@ -17,6 +17,9 @@ }, { "path": ".." 
+ }, + { + "path": "../../dora-back" } ] } \ No newline at end of file diff --git a/README.md b/README.md index 63b295050..86d75a6e5 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Les données collectées sont: * enrichies via les outils développés par data·inclusion: * l'outil de correspondance, qui permet de faire correspondre 2 jeux de données brutes, * l'outil de sirétisation, qui permet d'attribuer un siret aux structures, afin de croiser, -* alignées sur le [schéma de données de data·inclusion](https://schema.data.gouv.fr/betagouv/data-inclusion-schema/) +* alignées sur le [schéma de données de data·inclusion](https://schema.data.gouv.fr/gip-inclusion/data-inclusion-schema/) * publiées régulièrement en [open data sur data.gouv](https://www.data.gouv.fr/fr/datasets/referentiel-de-loffre-dinsertion-liste-des-structures-et-services-dinsertion/), la plateforme de données publiques, * consultables via une api. diff --git a/analyse/.template.env b/analyse/.template.env index bb5187f29..bdd2044a1 100644 --- a/analyse/.template.env +++ b/analyse/.template.env @@ -8,4 +8,5 @@ FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c CD72_FILE_URL= CD93_FILE_URL= CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/665776ae-fa25-46ab-9bfd-c4241866f03f/download/annuaire_sociale_fixe.csv -CD62_FILE_URL= \ No newline at end of file +CD62_FILE_URL= +RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 \ No newline at end of file diff --git a/analyse/notebooks/grist/template.ipynb b/analyse/notebooks/grist/template.ipynb new file mode 100644 index 000000000..71d160ede --- /dev/null +++ b/analyse/notebooks/grist/template.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -e ../../../pipeline\n", + "%pip install -e ../../../../data-inclusion-schema\n" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import dotenv\n", + "import pandas as pd\n", + "\n", + "from data_inclusion.scripts.tasks import grist\n", + "from data_inclusion import schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dotenv.load_dotenv(dotenv.find_dotenv())\n", + "\n", + "GRIST_API_TOKEN = os.environ[\"GRIST_API_TOKEN\"]\n", + "GRIST_API_URL = \"https://grist.incubateur.net/api\"\n", + "WORKSPACE_ID = \"27\"\n", + "DOCUMENT_NAME = \"template\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grist_client = grist.GristClient(base_url=GRIST_API_URL, token=GRIST_API_TOKEN)\n", + "\n", + "document_id = grist_client.create_document(\n", + " workspace_id=WORKSPACE_ID, document_name=DOCUMENT_NAME\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for referentiel in [\n", + " \"frais\",\n", + " \"labels_nationaux\",\n", + " \"modes_accueil\",\n", + " \"modes_orientation_accompagnateur\",\n", + " \"modes_orientation_beneficiaire\",\n", + " \"profils\",\n", + " \"thematiques\",\n", + " \"typologies_de_services\",\n", + " \"typologies_de_structures\",\n", + " \"zones_de_diffusion_types\",\n", + "]:\n", + " table_id = grist_client.create_table(\n", + " document_id=document_id,\n", + " table_name=referentiel.capitalize(),\n", + " columns=[\n", + " {\"id\": \"value\", \"fields\": {\"label\": \"valeur\", \"type\": \"Text\"}},\n", + " {\"id\": \"label\", \"fields\": {\"label\": \"label\", \"type\": \"Text\"}},\n", + " ],\n", + " )\n", + "\n", + " referentiel_df = pd.read_csv(\n", + " f\"../../../pipeline/dbt/seeds/schema/{referentiel}.csv\",\n", + " dtype=str,\n", + " )\n", + "\n", + " # attention: pas idempotent\n", + "\n", + " grist_client.add_records(\n", + " document_id=document_id,\n", + " table_id=table_id,\n", + 
" records=[\n", + " {\"fields\": value_dict}\n", + " for value_dict in referentiel_df[[\"value\", \"label\"]].to_dict(\n", + " orient=\"records\"\n", + " )\n", + " ],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "\n", + "def get_column_type(field) -> str:\n", + " match_referentiel = re.search(\n", + " r\"data_inclusion.schema.(?P\\w+)\", str(field.annotation)\n", + " )\n", + "\n", + " if match_referentiel is not None:\n", + " return \"Ref:\" + match_referentiel.group(\"referentiel\").capitalize()\n", + " elif \"float\" in str(field.annotation):\n", + " return \"Numeric\"\n", + " elif \"bool\" in str(field.annotation):\n", + " return \"Bool\"\n", + " elif \"date\" in str(field.annotation):\n", + " return \"DateTime:Europe/Paris\"\n", + "\n", + " return \"Text\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grist_columns = [\n", + " {\n", + " \"id\": field_name,\n", + " \"fields\": {\n", + " \"label\": field_name,\n", + " \"type\": get_column_type(field_info),\n", + " # \"visibleCol\": TODO\n", + " },\n", + " }\n", + " for field_name, field_info in schema.Structure.model_fields.items()\n", + "]\n", + "\n", + "grist_client.create_table(\n", + " document_id=document_id,\n", + " table_name=\"Structures\",\n", + " columns=grist_columns,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grist_columns = [\n", + " {\n", + " \"id\": field_name,\n", + " \"fields\": {\n", + " \"label\": field_name,\n", + " \"type\": get_column_type(field_info),\n", + " # \"visibleCol\": TODO\n", + " },\n", + " }\n", + " for field_name, field_info in schema.Service.model_fields.items()\n", + "]\n", + "\n", + "grist_client.create_table(\n", + " document_id=document_id,\n", + " table_name=\"Services\",\n", + " columns=grist_columns,\n", + ")\n" + ] + 
} + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analyse/notebooks/reseau-alpha/.gitignore b/analyse/notebooks/reseau-alpha/.gitignore new file mode 100644 index 000000000..d1452f15d --- /dev/null +++ b/analyse/notebooks/reseau-alpha/.gitignore @@ -0,0 +1,3 @@ +structures +services +*.html \ No newline at end of file diff --git a/analyse/notebooks/reseau-alpha/extract.ipynb b/analyse/notebooks/reseau-alpha/extract.ipynb new file mode 100644 index 000000000..07722a2c9 --- /dev/null +++ b/analyse/notebooks/reseau-alpha/extract.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scraping des structures et services publiés sur le site Web de Réseau alpha\n", + "\n", + "Le scraping commence sur cette page pour l'Essonne : https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D=\n", + "\n", + "Cette page est générée dynamiquement et Scrapy ne peut donc pas en extraire le contenu. Le HTML doit donc être extrait à la main et sauvegardé dans le même dossier que ce notebook sous le nom `structure_list.html`.\n", + "\n", + "Le script permet de scraper une copie locale du HTML pour les formations et les structures. C'est utile pour tester le script sans envoyer de requêtes au site Web original. Pour ce faire :\n", + "\n", + "1. 
Faire tourner au moins une fois le scrap avec RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 pour télécharger le HTML depuis le site live sur l'ordinateur dans les dossiers `./structures` et `./services`\n", + "2. Set RESEAU_ALPHA_TEST_W_LOCAL_FILES=1\n", + "\n", + "### Structure du script\n", + "\n", + "1. `start_requests` démarre le scraping à partir de la page de résultats de recherche\n", + "2. `parse` parse cette page pour extraire la liste des formations (pas encore les permanences)\n", + "3. `parse_formation` scrape le contenu de la page de chaque formation et passe le dictionnaire item à la fonction suivante\n", + "4. `parse_structure` scrape la page de la structure liée à la formation en enrichissant le dictionnaire item. Cette fonction est appelée autant de fois qu'il y a de lieux pour la formation\n", + "5. à la fin de `parse_structure`, le dictionnaire item est \"yield\" pour former une ligne du CSV (ou un objet dans l'array JSON)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import scrapy\n", + "from scrapy.crawler import CrawlerProcess\n", + "from pathlib import Path\n", + "from urllib.parse import urlparse \n", + "import re\n", + "import dateparser\n", + "import os\n", + "import dotenv\n", + "import trafilatura\n", + "\n", + "dotenv.load_dotenv(dotenv.find_dotenv())\n", + "TESTING_WITH_LOCAL_FILES = os.getenv(\"RESEAU_ALPHA_TEST_W_LOCAL_FILES\", 'False').lower() in ('true', '1', 't')\n", + "\n", + "# Local HTML\n", + "base_path = 'file://' + os.path.abspath('')\n", + "structure_base_path = base_path + '/structures'\n", + "formation_base_path = base_path + '/services'\n", + "\n", + "\n", + "\n", + "URL = f\"{base_path}/structure_list.html\"\n", + "if TESTING_WITH_LOCAL_FILES is False:\n", + " os.makedirs(structure_base_path, exist_ok=True)\n", + " os.makedirs(formation_base_path, exist_ok=True)\n", + "\n", + "# Live HTML (don't use too much to avoid being banned!)\n", + "# structure_base_url = 
'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'\n", + "\n", + "\n", + "# Structure avec antennes et formations : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries\n", + "# Structure sans antenne et sans formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/acafi\n", + "# Formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries/formation/francais-a-visee-professionnelle/b8a73-francais-a-visee-sociale-et-ou-professionnelle\n", + "\n", + "def html_to_markdown(s: str):\n", + " if s is None or s == \"\" :\n", + " return s\n", + " if type(s) == list:\n", + " s = \"
\".join(s)\n", + " return trafilatura.extract(trafilatura.load_html(\"\" + s + \"\"), no_fallback=True, max_tree_size=1000)\n", + "\n", + "def clean_adresse(adresses: list or scrapy.Selector) -> {} or []:\n", + " lieux = []\n", + " for adresse in adresses:\n", + " adresse_text_chunks = adresse.xpath('text()').getall()\n", + " clean_lieu = {\n", + " \"structure_service_adresse_entiere\": \"\",\n", + " \"structure_service_adresse\": \"\",\n", + " \"structure_service_code_postal\": \"\",\n", + " \"structure_service_commune\": \"\"\n", + " }\n", + " for part in adresse_text_chunks:\n", + " part = part.strip()\n", + " if re.match(r'^\\d', part):\n", + " if re.match(r'^\\d{5}', part):\n", + " split_address = part.split(\" - \")\n", + " clean_lieu[\"structure_service_code_postal\"] = split_address[0]\n", + " clean_lieu[\"structure_service_commune\"] = split_address[1]\n", + " else:\n", + " clean_lieu[\"structure_service_adresse\"] = part\n", + " clean_lieu[\"structure_service_adresse_entiere\"] += part + \", \"\n", + " lieux.append(clean_lieu)\n", + " return lieux\n", + "\n", + "def strip(maybe_string):\n", + " if type(maybe_string) == str:\n", + " return maybe_string.strip()\n", + " if maybe_string == None:\n", + " return \"\"\n", + " else:\n", + " return maybe_string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class AlphaSpider(scrapy.Spider):\n", + " name = \"alpha\"\n", + " custom_settings = {\n", + " \"DOWNLOAD_DELAY\": 0 if TESTING_WITH_LOCAL_FILES else 0.5\n", + " }\n", + "\n", + " def start_requests(self):\n", + " urls = [\n", + " URL\n", + " ]\n", + " for url in urls:\n", + " yield scrapy.Request(url=url, callback=self.parse)\n", + "\n", + " def parse(self, response):\n", + " \n", + " formations_links = response.css('div#div-accordion-formation > div.contact-content a.readon')\n", + " \n", + " if TESTING_WITH_LOCAL_FILES:\n", + " for slug in formations_links.xpath('@href').getall():\n", + " 
next_page = f\"{formation_base_path}/{slug.split('/')[-1]}\"\n", + " yield scrapy.Request(next_page, callback=self.parse_formation)\n", + " else:\n", + " for a in formations_links:\n", + " yield response.follow(a, callback=self.parse_formation)\n", + "\n", + "\n", + " def parse_formation(self, response):\n", + "\n", + " if TESTING_WITH_LOCAL_FILES is False:\n", + " # Downloading HTML content\n", + " page = response.url.split(\"/\")[-1]\n", + " # Path doesn't deal with file:// URIs\n", + " filepath = Path(formation_base_path[7:]) / page\n", + " filepath.write_bytes(response.body)\n", + "\n", + " formation_entete = response.css('div.entete')\n", + " formation_contenu = response.css('div.entete + div')\n", + " formation_contenu_col1 = response.css('div.entete + div > div:nth-child(1)')\n", + " formation_contenu_col2 = response.css('div.entete + div > div:nth-child(2)')\n", + " formation_inscription_info = formation_contenu_col2.css('div:nth-of-type(1)')\n", + " formation_inscription_contact = formation_contenu_col2.css('div:nth-of-type(2)')\n", + " formation_informations_pratiques = formation_contenu_col2.css('div:nth-of-type(3)')\n", + " formation_lieux_horaires = response.css('div#lieux-formation')\n", + "\n", + "\n", + " # SERVICE\n", + " item = {}\n", + "\n", + " # Nom\n", + " service_nom_1 = strip(response.css(\"div.titre-element > strong::text\").get())\n", + " service_nom_2 = strip(response.css(\"a.underline.red-alpha + div::text\").get())\n", + " item[\"nom\"] = f\"{service_nom_1} ({service_nom_2})\"\n", + "\n", + " # Date de màj\n", + " date_maj_fr = strip(response.css(\"a.underline.red-alpha + div + div::text\").get().split(\":\")[-1])\n", + " item[\"date_maj\"] = dateparser.parse(date_maj_fr).isoformat()\n", + " \n", + " # Description\n", + " contenu_objectif_public = formation_contenu_col1.css(\".row div\").getall()\n", + " contenu_objectif_public += formation_informations_pratiques.get()\n", + " # les descriptions sont très longues et rendent difficiles 
le test des autres champs\n", + " # item[\"presentation_detail\"] = html_to_markdown(contenu_objectif_public)\n", + "\n", + " # Lien vers la source\n", + " item[\"lien_source\"] = response.url\n", + "\n", + " # Courriel\n", + " item[\"courriel\"] = strip(formation_inscription_contact.css('div.email.red-alpha > a::attr(href)').get()).split(\":\")[-1]\n", + "\n", + " # Adresse\n", + " clean_lieux = clean_adresse(formation_lieux_horaires.css(\"div.adresse\"))\n", + "\n", + " # Téléphone\n", + " item[\"telephone\"] = \"\"\n", + " \n", + " # Contact nom prénom\n", + " item[\"contact_nom_prenom\"] = \"\"\n", + "\n", + " # Thématiques\n", + " item[\"thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n", + " if service_nom_2 == \"Français à visée professionnelle\":\n", + " item[\"thematiques\"].append(\"apprendre-francais--accompagnement-insertion-pro\")\n", + " if service_nom_2 == \"Français à visée sociale et communicative\":\n", + " item[\"thematiques\"].append(\"apprendre-francais--communiquer-vie-tous-les-jours\")\n", + "\n", + " # Hard coded fields\n", + " item[\"zone_diffusion_type\"] = \"departement\"\n", + " item[\"zone_diffusion_code\"] = \"91\"\n", + " item[\"zone_diffusion_nom\"] = \"Essonne\"\n", + " item[\"types\"] = [\"formation\"]\n", + " item[\"cumulable\"] = True\n", + " item[\"contact_public\"] = True\n", + " item[\"modes_accueil\"] = [\"en-presentiel\"]\n", + "\n", + " \n", + " # STRUCTURE\n", + " # ID de la structure\n", + " structure_link_element = formation_entete.css(\"div.titre-element ~ a.underline.red-alpha\")\n", + " item[\"structure_id\"] = structure_link_element.xpath(\"@href\").get().split(\"/\")[-1]\n", + " if TESTING_WITH_LOCAL_FILES:\n", + " structure_link = f\"{structure_base_path}/{item['structure_id']}\"\n", + " else:\n", + " structure_link = structure_link_element.xpath(\"@href\").get()\n", + " \n", + " \n", + "\n", + " # Une ligne/record de service et une structure par lieu\n", + " service_id_suffix = 1\n", + " for lieu in 
clean_lieux:\n", + " # Id\n", + " item[\"id\"] = f\"{response.url.split('/')[-1]}_{str(service_id_suffix)}\"\n", + " service_id_suffix += 1\n", + " print(lieu)\n", + " item = item | lieu\n", + " yield scrapy.Request(structure_link, callback=self.parse_structure, meta={\"item\": item}, dont_filter=True)\n", + " \n", + " def parse_structure(self, response):\n", + " if TESTING_WITH_LOCAL_FILES is False:\n", + " # Downloading HTML content\n", + " page = response.url.split(\"/\")[-1]\n", + " # Path doesn't deal with file:// URIs\n", + " filepath = Path(structure_base_path[7:]) / page\n", + " filepath.write_bytes(response.body)\n", + "\n", + " item = response.meta.get(\"item\")\n", + " \n", + "\n", + " # Nom\n", + " item[\"structure_nom\"] = strip(response.css('div#structure > strong::text').get())\n", + "\n", + " # Data màj\n", + " item[\"structure_date_maj\"] = strip(response.css('div.structures-dates > div:nth-child(2)').xpath('text()').get())\n", + " item[\"structure_date_maj\"] = item[\"structure_date_maj\"].split(\" : \")[-1]\n", + " item[\"structure_date_maj\"] = dateparser.parse(item[\"structure_date_maj\"]).isoformat()\n", + "\n", + " # Adresse\n", + " # Sur le site Web, une structure a autant d'adresses qu'elle a de lieux pour ses services\n", + " # Certains services sont proposés sur toutes les adresses de la structure, certains non.\n", + "\n", + " # Téléphone\n", + " telephone = response.css('div.lieu div.telephone > a::attr(href)').get()\n", + " if type(telephone) == str:\n", + " # Les numéro de téléphone sont préfixés par tel:\n", + " telephone = telephone.strip()[4:]\n", + " else:\n", + " telephone = \"\"\n", + " item[\"structure_telephone\"] = telephone\n", + " \n", + " # Site Web\n", + " item[\"structure_site_web\"] = strip(response.css('div.lieu div.facebook::text').get())\n", + "\n", + " # Lien source\n", + " item[\"structure_lien_source\"] = response.url\n", + "\n", + " # Labels\n", + " item[\"structure_labels_autres\"] = [\"reseau-alpha\"]\n", + 
"\n", + " # Thématiques\n", + " item[\"structure_thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n", + "\n", + "\n", + " yield item\n", + "\n", + " \n", + "process = CrawlerProcess(settings={\n", + " \"FEEDS\": {\n", + " # Seul le JSON est utilisable dans le pipeline car le CSV imprime les listes sans square brackets ([])\n", + " # Le CSV est pratique pour tester\n", + " \"alpha.json\": {\n", + " \"format\": \"json\",\n", + " \"overwrite\": True,\n", + " \"ensure_ascii\": False,\n", + " 'encoding': 'utf8',\n", + " 'store_empty': False,\n", + " },\n", + " \"alpha.csv\": {\n", + " \"format\": \"csv\",\n", + " \"overwrite\": True,\n", + " 'encoding': 'utf8',\n", + " },\n", + " },\n", + "})\n", + "process.crawl(AlphaSpider)\n", + "process.start()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('./alpha.csv', dtype = str, index_col=None)\n", + "df.info()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv-analyse", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analyse/requirements.in b/analyse/requirements.in index 2bf5e3f63..c40fdaa5f 100644 --- a/analyse/requirements.in +++ b/analyse/requirements.in @@ -12,3 +12,6 @@ seaborn pyairtable pyproj minio +scrapy +dateparser +trafilatura \ No newline at end of file diff --git a/analyse/requirements.txt b/analyse/requirements.txt index 676867f34..59dc4322d 100644 --- a/analyse/requirements.txt +++ b/analyse/requirements.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with python 3.10 -# To update, run: +# This file is autogenerated by 
pip-compile with Python 3.10 +# by the following command: # -# pip-compile +# pip-compile --output-file=analyse/requirements.txt analyse/requirements.in # anyio==3.6.2 # via jupyter-server @@ -18,13 +18,19 @@ arrow==1.2.3 asttokens==2.2.1 # via stack-data attrs==22.2.0 - # via jsonschema + # via + # automat + # jsonschema + # service-identity + # twisted +automat==22.10.0 + # via twisted backcall==0.2.0 # via ipython beautifulsoup4==4.11.2 # via nbconvert black[jupyter]==23.1.0 - # via -r requirements.in + # via -r analyse/requirements.in bleach==6.0.0 # via nbconvert certifi==2022.12.7 @@ -32,18 +38,41 @@ certifi==2022.12.7 # minio # pyproj # requests + # trafilatura cffi==1.15.1 - # via argon2-cffi-bindings + # via + # argon2-cffi-bindings + # cryptography charset-normalizer==3.0.1 - # via requests + # via + # htmldate + # requests + # trafilatura click==8.1.3 # via black comm==0.1.2 # via ipykernel +constantly==15.1.0 + # via twisted contourpy==1.0.7 # via matplotlib +courlan==0.9.3 + # via trafilatura +cryptography==41.0.3 + # via + # pyopenssl + # scrapy + # service-identity +cssselect==1.2.0 + # via + # parsel + # scrapy cycler==0.11.0 # via matplotlib +dateparser==1.1.8 + # via + # -r analyse/requirements.in + # htmldate debugpy==1.6.6 # via ipykernel decorator==5.1.1 @@ -56,17 +85,27 @@ executing==1.2.0 # via stack-data fastjsonschema==2.16.2 # via nbformat +filelock==3.12.2 + # via tldextract fonttools==4.38.0 # via matplotlib fqdn==1.5.1 # via jsonschema greenlet==2.0.2 # via sqlalchemy +htmldate==1.4.1 + # via trafilatura +hyperlink==21.0.0 + # via twisted idna==3.4 # via # anyio + # hyperlink # jsonschema # requests + # tldextract +incremental==22.10.0 + # via twisted ipykernel==6.21.2 # via # nbclassic @@ -81,15 +120,25 @@ ipython-genutils==0.2.0 # notebook isoduration==20.11.0 # via jsonschema +itemadapter==0.8.0 + # via + # itemloaders + # scrapy +itemloaders==1.1.0 + # via scrapy jedi==0.18.2 # via ipython jinja2==3.1.2 # via - # -r requirements.in 
+ # -r analyse/requirements.in # jupyter-server # nbclassic # nbconvert # notebook +jmespath==1.0.1 + # via + # itemloaders + # parsel jsonpointer==2.3 # via jsonschema jsonschema[format-nongpl]==4.17.3 @@ -123,22 +172,33 @@ jupyter-server-terminals==0.4.4 # via jupyter-server jupyterlab-pygments==0.2.2 # via nbconvert +justext==3.0.0 + # via trafilatura kiwisolver==1.4.4 # via matplotlib +langcodes==3.3.0 + # via courlan +lxml==4.9.3 + # via + # htmldate + # justext + # parsel + # scrapy + # trafilatura markupsafe==2.1.2 # via # jinja2 # nbconvert matplotlib==3.7.0 # via - # -r requirements.in + # -r analyse/requirements.in # seaborn matplotlib-inline==0.1.6 # via # ipykernel # ipython minio==7.1.13 - # via -r requirements.in + # via -r analyse/requirements.in mistune==2.0.5 # via nbconvert mypy-extensions==1.0.0 @@ -165,7 +225,7 @@ nest-asyncio==1.5.6 # nbclassic # notebook notebook==6.5.2 - # via -r requirements.in + # via -r analyse/requirements.in notebook-shim==0.2.2 # via nbclassic numpy==1.24.2 @@ -175,7 +235,7 @@ numpy==1.24.2 # pandas # seaborn openpyxl==3.1.1 - # via -r requirements.in + # via -r analyse/requirements.in packaging==23.0 # via # black @@ -183,12 +243,18 @@ packaging==23.0 # jupyter-server # matplotlib # nbconvert + # parsel + # scrapy pandas==1.5.3 # via - # -r requirements.in + # -r analyse/requirements.in # seaborn pandocfilters==1.5.0 # via nbconvert +parsel==1.8.1 + # via + # itemloaders + # scrapy parso==0.8.3 # via jedi pathspec==0.11.0 @@ -210,10 +276,12 @@ prometheus-client==0.16.0 # notebook prompt-toolkit==3.0.37 # via ipython +protego==0.2.1 + # via scrapy psutil==5.9.4 # via ipykernel psycopg2==2.9.5 - # via -r requirements.in + # via -r analyse/requirements.in ptyprocess==0.7.0 # via # pexpect @@ -221,30 +289,45 @@ ptyprocess==0.7.0 pure-eval==0.2.2 # via stack-data pyairtable==1.4.0 - # via -r requirements.in + # via -r analyse/requirements.in +pyasn1==0.5.0 + # via + # pyasn1-modules + # service-identity 
+pyasn1-modules==0.3.0 + # via service-identity pycparser==2.21 # via cffi +pydispatcher==2.0.7 + # via scrapy pygments==2.14.0 # via # ipython # nbconvert +pyopenssl==23.2.0 + # via scrapy pyparsing==3.0.9 # via matplotlib pyproj==3.4.1 - # via -r requirements.in + # via -r analyse/requirements.in pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 # via + # arrow + # dateparser + # htmldate # jupyter-client # matplotlib # pandas python-dotenv==0.21.1 - # via -r requirements.in + # via -r analyse/requirements.in python-json-logger==2.0.7 # via jupyter-events pytz==2022.7.1 - # via pandas + # via + # dateparser + # pandas pyyaml==6.0 # via jupyter-events pyzmq==25.0.0 @@ -254,10 +337,18 @@ pyzmq==25.0.0 # jupyter-server # nbclassic # notebook +queuelib==1.6.2 + # via scrapy +regex==2023.8.8 + # via dateparser requests==2.28.2 # via - # -r requirements.in + # -r analyse/requirements.in # pyairtable + # requests-file + # tldextract +requests-file==1.5.1 + # via tldextract rfc3339-validator==0.1.4 # via # jsonschema @@ -266,25 +357,32 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events +scrapy==2.10.0 + # via -r analyse/requirements.in seaborn==0.12.2 - # via -r requirements.in + # via -r analyse/requirements.in send2trash==1.8.0 # via # jupyter-server # nbclassic # notebook +service-identity==23.1.0 + # via scrapy six==1.16.0 # via # asttokens + # automat # bleach + # protego # python-dateutil + # requests-file # rfc3339-validator sniffio==1.3.0 # via anyio soupsieve==2.4 # via beautifulsoup4 sqlalchemy==2.0.4 - # via -r requirements.in + # via -r analyse/requirements.in stack-data==0.6.2 # via ipython terminado==0.17.1 @@ -295,6 +393,10 @@ terminado==0.17.1 # notebook tinycss2==1.2.1 # via nbconvert +tld==0.13 + # via courlan +tldextract==3.4.4 + # via scrapy tokenize-rt==5.0.0 # via black tomli==2.0.1 @@ -307,6 +409,8 @@ tornado==6.2 # nbclassic # notebook # terminado +trafilatura==1.4.1 + # via -r analyse/requirements.in traitlets==5.9.0 # via # 
comm @@ -322,14 +426,28 @@ traitlets==5.9.0 # nbconvert # nbformat # notebook +twisted==22.10.0 + # via scrapy typing-extensions==4.5.0 - # via sqlalchemy + # via + # sqlalchemy + # twisted +tzlocal==5.0.1 + # via dateparser uri-template==1.2.0 # via jsonschema urllib3==1.26.14 # via + # courlan + # htmldate # minio # requests + # trafilatura +w3lib==2.1.2 + # via + # itemloaders + # parsel + # scrapy wcwidth==0.2.6 # via prompt-toolkit webcolors==1.12 @@ -340,3 +458,10 @@ webencodings==0.5.1 # tinycss2 websocket-client==1.5.1 # via jupyter-server +zope-interface==6.0 + # via + # scrapy + # twisted + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/api/CONTRIBUTING.md b/api/CONTRIBUTING.md index 248c738d4..169c5596a 100644 --- a/api/CONTRIBUTING.md +++ b/api/CONTRIBUTING.md @@ -45,16 +45,16 @@ tox # 1. add/remove packages from the requirements in setup.py # 2. compile dependencies -pip-compile --resolver=backtracking --output-file=requirements/requirements.txt -pip-compile --resolver=backtracking --extra=dev --output-file=requirements/dev-requirements.txt -pip-compile --resolver=backtracking --extra=test --output-file=requirements/test-requirements.txt +pip-compile --output-file=requirements/requirements.txt +pip-compile --extra=dev --output-file=requirements/dev-requirements.txt +pip-compile --extra=test --output-file=requirements/test-requirements.txt ``` ### 2. Upgrading packages ```bash # 1. 
compile dependencies with the upgrade flag set -pip-compile --resolver=backtracking --upgrade --output-file=requirements/requirements.txt && \ - pip-compile --resolver=backtracking --upgrade --extra=dev --output-file=requirements/dev-requirements.txt && \ - pip-compile --resolver=backtracking --upgrade --extra=test --output-file=requirements/test-requirements.txt +pip-compile --upgrade --output-file=requirements/requirements.txt && \ + pip-compile --upgrade --extra=dev --output-file=requirements/dev-requirements.txt && \ + pip-compile --upgrade --extra=test --output-file=requirements/test-requirements.txt ``` \ No newline at end of file diff --git a/api/requirements/dev-requirements.txt b/api/requirements/dev-requirements.txt index d71fbd407..182748962 100644 --- a/api/requirements/dev-requirements.txt +++ b/api/requirements/dev-requirements.txt @@ -43,7 +43,7 @@ cryptography==41.0.2 # via # data-inclusion-api (setup.py) # python-jose -data-inclusion-schema==0.9.1 +data-inclusion-schema==0.10.0 # via data-inclusion-api (setup.py) distlib==0.3.7 # via virtualenv diff --git a/api/requirements/requirements.txt b/api/requirements/requirements.txt index 906689481..a4224777b 100644 --- a/api/requirements/requirements.txt +++ b/api/requirements/requirements.txt @@ -32,7 +32,7 @@ cryptography==41.0.2 # via # data-inclusion-api (setup.py) # python-jose -data-inclusion-schema==0.9.1 +data-inclusion-schema==0.10.0 # via data-inclusion-api (setup.py) dnspython==2.4.1 # via email-validator diff --git a/api/requirements/test-requirements.txt b/api/requirements/test-requirements.txt index 9210fc276..897e641a3 100644 --- a/api/requirements/test-requirements.txt +++ b/api/requirements/test-requirements.txt @@ -43,7 +43,7 @@ cryptography==41.0.2 # via # data-inclusion-api (setup.py) # python-jose -data-inclusion-schema==0.9.1 +data-inclusion-schema==0.10.0 # via data-inclusion-api (setup.py) dnspython==2.4.1 # via email-validator diff --git a/api/setup.py b/api/setup.py index 
1e1b413c1..47525acd0 100644 --- a/api/setup.py +++ b/api/setup.py @@ -30,7 +30,7 @@ "sentry-sdk[fastapi]", "sqlalchemy", "uvicorn[standard]", - "data-inclusion-schema==0.9.1", + "data-inclusion-schema==0.10.0", ], extras_require={ "test": [ diff --git a/api/src/alembic/versions/06e3e22e0541_v0_10_0.py b/api/src/alembic/versions/06e3e22e0541_v0_10_0.py new file mode 100644 index 000000000..c6c51f5ca --- /dev/null +++ b/api/src/alembic/versions/06e3e22e0541_v0_10_0.py @@ -0,0 +1,57 @@ +"""v0.10.0 + +Revision ID: 06e3e22e0541 +Revises: 7f177bfb0108 +Create Date: 2023-09-11 15:34:37.042108 + +""" +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "06e3e22e0541" +down_revision = "7f177bfb0108" +branch_labels = None +depends_on = None + + +def column_exists(table_name, column_name): + bind = op.get_context().bind + insp = sa.inspect(bind) + columns = insp.get_columns(table_name) + return any(c["name"] == column_name for c in columns) + + +def upgrade() -> None: + op.drop_column("service", "pre_requis") + op.drop_column("service", "justificatifs") + op.add_column( + "service", + sa.Column("pre_requis", postgresql.ARRAY(sa.Text()), nullable=True), + ) + op.add_column( + "service", + sa.Column("justificatifs", postgresql.ARRAY(sa.Text()), nullable=True), + ) + + # these columns might have already been created by dbt + if not column_exists("service", "modes_orientation_accompagnateur_autres"): + op.add_column( + "service", + sa.Column( + "modes_orientation_accompagnateur_autres", sa.Text(), nullable=True + ), + ) + if not column_exists("service", "modes_orientation_beneficiaire_autres"): + op.add_column( + "service", + sa.Column( + "modes_orientation_beneficiaire_autres", sa.Text(), nullable=True + ), + ) + + +def downgrade() -> None: + pass diff --git a/api/src/data_inclusion/api/entrypoints/fastapi.py b/api/src/data_inclusion/api/entrypoints/fastapi.py index 
a9c39b195..05d757d2a 100644 --- a/api/src/data_inclusion/api/entrypoints/fastapi.py +++ b/api/src/data_inclusion/api/entrypoints/fastapi.py @@ -279,7 +279,7 @@ def list_services( thematique: Optional[schema.Thematique] = None, departement: Optional[schema.DepartementCOG] = None, departement_slug: Optional[schema.DepartementSlug] = None, - code_insee: Optional[schema.CodeInsee] = None, + code_insee: Optional[schema.CodeCommune] = None, ): query = ( sqla.select(models.Service) @@ -365,7 +365,7 @@ def list_services_endpoint( schema.DepartementSlug | SkipJsonSchema[None], fastapi.Query() ] = None, code_insee: Annotated[ - schema.CodeInsee | SkipJsonSchema[None], fastapi.Query() + schema.CodeCommune | SkipJsonSchema[None], fastapi.Query() ] = None, ): return list_services( @@ -429,24 +429,28 @@ def search_services( query = query.filter( sqla.or_( models.Service.zone_diffusion_type.is_(None), - models.Service.zone_diffusion_type == schema.TypeCOG.PAYS.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.PAYS.value, sqla.and_( - models.Service.zone_diffusion_type == schema.TypeCOG.COMMUNE.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.COMMUNE.value, models.Service.zone_diffusion_code == commune_instance.code, ), sqla.and_( - models.Service.zone_diffusion_type == schema.TypeCOG.EPCI.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.EPCI.value, sqla.literal(commune_instance.siren_epci).contains( models.Service.zone_diffusion_code ), ), sqla.and_( models.Service.zone_diffusion_type - == schema.TypeCOG.DEPARTEMENT.value, + == schema.ZoneDiffusionType.DEPARTEMENT.value, models.Service.zone_diffusion_code == commune_instance.departement, ), sqla.and_( - models.Service.zone_diffusion_type == schema.TypeCOG.REGION.value, + models.Service.zone_diffusion_type + == schema.ZoneDiffusionType.REGION.value, models.Service.zone_diffusion_code == commune_instance.region, ), ) @@ -599,7 +603,7 @@ def 
search_services_endpoint( ), ] = None, code_insee: Annotated[ - schema.CodeInsee | SkipJsonSchema[None], + schema.CodeCommune | SkipJsonSchema[None], fastapi.Query( description="""Code insee de la commune considérée. Si fourni, les résultats inclus également les services proches de cette commune. diff --git a/api/src/data_inclusion/api/models.py b/api/src/data_inclusion/api/models.py index c2a7949a1..40fe283b3 100644 --- a/api/src/data_inclusion/api/models.py +++ b/api/src/data_inclusion/api/models.py @@ -112,9 +112,9 @@ class Service(Base): frais = sqla.Column(ARRAY(sqla.Text), default=list) frais_autres = sqla.Column(sqla.Text, nullable=True) profils = sqla.Column(ARRAY(sqla.Text), default=list) - pre_requis = sqla.Column(sqla.Text, nullable=True) + pre_requis = sqla.Column(ARRAY(sqla.Text), default=list) cumulable = sqla.Column(sqla.Boolean, default=False) - justificatifs = sqla.Column(sqla.Text, nullable=True) + justificatifs = sqla.Column(ARRAY(sqla.Text), default=list) formulaire_en_ligne = sqla.Column(sqla.Text, nullable=True) commune = sqla.Column(sqla.Text, nullable=True) code_postal = sqla.Column(sqla.Text, nullable=True) @@ -136,7 +136,9 @@ class Service(Base): date_maj = sqla.Column(sqla.Date(), nullable=True) modes_accueil = sqla.Column(ARRAY(sqla.Text), default=list) modes_orientation_accompagnateur = sqla.Column(ARRAY(sqla.Text), default=list) + modes_orientation_accompagnateur_autres = sqla.Column(sqla.Text, nullable=True) modes_orientation_beneficiaire = sqla.Column(ARRAY(sqla.Text), default=list) + modes_orientation_beneficiaire_autres = sqla.Column(sqla.Text, nullable=True) zone_diffusion_type = sqla.Column(sqla.Text, nullable=True) zone_diffusion_code = sqla.Column(sqla.Text, nullable=True) zone_diffusion_nom = sqla.Column(sqla.Text, nullable=True) diff --git a/api/src/data_inclusion/api/schema.py b/api/src/data_inclusion/api/schema.py index f7f763368..40ace967d 100644 --- a/api/src/data_inclusion/api/schema.py +++ 
b/api/src/data_inclusion/api/schema.py @@ -1,12 +1,19 @@ from dataclasses import dataclass from datetime import date, datetime from enum import Enum -from typing import Optional, TypeAlias +from typing import Optional from pydantic import BaseModel, ConfigDict, EmailStr, Field, StringConstraints from typing_extensions import Annotated -from data_inclusion.schema.models import ( +from data_inclusion.schema import ( + CodeCommune, + CodeDepartement, + CodeEPCI, + CodePostal, + CodeRegion, + CodeRna, + CodeSiret, Frais, LabelNational, ModeAccueil, @@ -14,9 +21,9 @@ ModeOrientationBeneficiaire, Profil, Thematique, - TypeCOG, Typologie, TypologieService, + ZoneDiffusionType, ) @@ -146,19 +153,14 @@ class _Departement: {k: departement.cog for k, departement in _departements_dict.items()}, ) -CodePostal: TypeAlias = Annotated[ - str, StringConstraints(min_length=5, max_length=5, pattern=r"^\d{5}$") -] -CodeInsee: TypeAlias = Annotated[str, StringConstraints(min_length=5, max_length=5)] - class Service(BaseModel): model_config = ConfigDict(from_attributes=True, populate_by_name=True) # internal metadata - di_geocodage_code_insee: Optional[ - Annotated[str, StringConstraints(min_length=5, max_length=5)] - ] = Field(alias="_di_geocodage_code_insee") + di_geocodage_code_insee: Optional[CodeCommune] = Field( + alias="_di_geocodage_code_insee" + ) di_geocodage_score: Optional[Annotated[float, Field(ge=0, le=1)]] = Field( alias="_di_geocodage_score" ) @@ -178,13 +180,13 @@ class Service(BaseModel): frais: Optional[list[Frais]] = None frais_autres: Optional[str] = None profils: Optional[list[Profil]] = None - pre_requis: Optional[str] = None + pre_requis: Optional[list[str]] = None cumulable: Optional[bool] = None - justificatifs: Optional[str] = None + justificatifs: Optional[list[str]] = None formulaire_en_ligne: Optional[str] = None commune: Optional[str] = None code_postal: Optional[CodePostal] = None - code_insee: Optional[CodeInsee] = None + code_insee: 
Optional[CodeCommune] = None adresse: Optional[str] = None complement_adresse: Optional[str] = None longitude: Optional[float] = None @@ -202,13 +204,12 @@ class Service(BaseModel): modes_orientation_accompagnateur: Optional[ list[ModeOrientationAccompagnateur] ] = None + modes_orientation_accompagnateur_autres: Optional[str] = None modes_orientation_beneficiaire: Optional[list[ModeOrientationBeneficiaire]] = None - zone_diffusion_type: Optional[TypeCOG] = None + modes_orientation_beneficiaire_autres: Optional[str] = None + zone_diffusion_type: Optional[ZoneDiffusionType] = None zone_diffusion_code: Optional[ - Annotated[str, StringConstraints(pattern=r"^\w{5}$")] # code commune - | Annotated[str, StringConstraints(pattern=r"^\d{9}$")] # code epci - | Annotated[str, StringConstraints(pattern=r"^\w{2,3}$")] # code departement - | Annotated[str, StringConstraints(pattern=r"^\d{2}$")] # code region + CodeCommune | CodeEPCI | CodeDepartement | CodeRegion ] = None zone_diffusion_nom: Optional[str] = None @@ -217,29 +218,21 @@ class Structure(BaseModel): model_config = ConfigDict(from_attributes=True, populate_by_name=True) # internal metadata - di_geocodage_code_insee: Optional[ - Annotated[str, StringConstraints(min_length=5, max_length=5)] - ] = Field(alias="_di_geocodage_code_insee") + di_geocodage_code_insee: Optional[CodeCommune] = Field( + alias="_di_geocodage_code_insee" + ) di_geocodage_score: Optional[Annotated[float, Field(ge=0, le=1)]] = Field( alias="_di_geocodage_score" ) # structure data id: str - siret: Optional[ - Annotated[ - str, StringConstraints(min_length=14, max_length=14, pattern=r"^\d{14}$") - ] - ] = None - rna: Optional[ - Annotated[ - str, StringConstraints(min_length=10, max_length=10, pattern=r"^W\d{9}$") - ] - ] = None + siret: Optional[CodeSiret] = None + rna: Optional[CodeRna] = None nom: str commune: Optional[str] = None code_postal: Optional[CodePostal] = None - code_insee: Optional[CodeInsee] = None + code_insee: Optional[CodeCommune] 
= None adresse: Optional[str] = None complement_adresse: Optional[str] = None longitude: Optional[float] = None diff --git a/api/src/data_inclusion/api/utils/code_officiel_geographique.py b/api/src/data_inclusion/api/utils/code_officiel_geographique.py index 3eaf03ed6..1564afa59 100644 --- a/api/src/data_inclusion/api/utils/code_officiel_geographique.py +++ b/api/src/data_inclusion/api/utils/code_officiel_geographique.py @@ -1,4 +1,5 @@ -# based on https://github.com/betagouv/dora-back/blob/main/dora/admin_express/utils.py +# based on +# https://github.com/gip-inclusion/dora-back/blob/main/dora/admin_express/utils.py CODES_ARRONDISSEMENTS_BY_CODE_COMMUNE = { # Paris diff --git a/api/tests/inclusion/factories.py b/api/tests/inclusion/factories.py index c25cb963c..7149d87bd 100644 --- a/api/tests/inclusion/factories.py +++ b/api/tests/inclusion/factories.py @@ -126,9 +126,9 @@ class Meta: ), getter=lambda l: list(map(lambda t: t.value, l)), ) - pre_requis = None + pre_requis = [] cumulable = False - justificatifs = None + justificatifs = [] formulaire_en_ligne = None commune = factory.Faker("city", locale="fr_FR") code_postal = factory.Faker("postcode") @@ -162,12 +162,14 @@ class Meta: [schema.ModeOrientationAccompagnateur.ENVOYER_UN_MAIL.value], ] ) + modes_orientation_accompagnateur_autres = None modes_orientation_beneficiaire = factory.Iterator( [ [schema.ModeOrientationBeneficiaire.TELEPHONER.value], [schema.ModeOrientationBeneficiaire.SE_PRESENTER.value], ] ) + modes_orientation_beneficiaire_autres = None zone_diffusion_type = None zone_diffusion_code = None zone_diffusion_nom = None diff --git a/api/tests/inclusion/test_api.py b/api/tests/inclusion/test_api.py index 3810bc13e..0c47a7357 100644 --- a/api/tests/inclusion/test_api.py +++ b/api/tests/inclusion/test_api.py @@ -2,7 +2,7 @@ import pytest -from data_inclusion.api import schema +from data_inclusion import schema def test_list_structures_unauthenticated(api_client): @@ -329,9 +329,9 @@ def 
test_list_services_all(api_client, service_factory): "frais": ["gratuit", "gratuit-sous-conditions"], "frais_autres": "Camarade il.", "profils": ["femmes", "jeunes-16-26"], - "pre_requis": None, + "pre_requis": [], "cumulable": False, - "justificatifs": None, + "justificatifs": [], "formulaire_en_ligne": None, "commune": "Sainte Jacquelineboeuf", "code_postal": "25454", @@ -351,7 +351,9 @@ def test_list_services_all(api_client, service_factory): "date_maj": "2023-01-01", "modes_accueil": ["a-distance"], "modes_orientation_accompagnateur": ["telephoner"], + "modes_orientation_accompagnateur_autres": None, "modes_orientation_beneficiaire": ["telephoner"], + "modes_orientation_beneficiaire_autres": None, "zone_diffusion_type": None, "zone_diffusion_code": None, "zone_diffusion_nom": None, @@ -813,7 +815,7 @@ def test_search_services_with_zone_diffusion_pays( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.A_DISTANCE.value], - zone_diffusion_type=schema.TypeCOG.PAYS.value, + zone_diffusion_type=schema.ZoneDiffusionType.PAYS.value, zone_diffusion_code=None, zone_diffusion_nom=None, ) @@ -844,7 +846,7 @@ def test_search_services_with_zone_diffusion_commune( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.COMMUNE.value, + zone_diffusion_type=schema.ZoneDiffusionType.COMMUNE.value, zone_diffusion_code="59183", zone_diffusion_nom="Dunkerque", ) @@ -854,7 +856,7 @@ def test_search_services_with_zone_diffusion_commune( latitude=50.633333, longitude=3.066667, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.COMMUNE.value, + zone_diffusion_type=schema.ZoneDiffusionType.COMMUNE.value, zone_diffusion_code="59350", zone_diffusion_nom="Lille", ) @@ -885,7 +887,7 @@ def test_search_services_with_zone_diffusion_epci( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - 
zone_diffusion_type=schema.TypeCOG.EPCI.value, + zone_diffusion_type=schema.ZoneDiffusionType.EPCI.value, zone_diffusion_code="245900428", zone_diffusion_nom="CU de Dunkerque", ) @@ -895,7 +897,7 @@ def test_search_services_with_zone_diffusion_epci( latitude=50.633333, longitude=3.066667, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.EPCI.value, + zone_diffusion_type=schema.ZoneDiffusionType.EPCI.value, zone_diffusion_code="200093201", zone_diffusion_nom="Métropole Européenne de Lille", ) @@ -926,7 +928,7 @@ def test_search_services_with_zone_diffusion_departement( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.DEPARTEMENT.value, + zone_diffusion_type=schema.ZoneDiffusionType.DEPARTEMENT.value, zone_diffusion_code="59", zone_diffusion_nom="Nord", ) @@ -936,7 +938,7 @@ def test_search_services_with_zone_diffusion_departement( latitude=50.633333, longitude=3.066667, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.DEPARTEMENT.value, + zone_diffusion_type=schema.ZoneDiffusionType.DEPARTEMENT.value, zone_diffusion_code="62", zone_diffusion_nom="Pas-de-Calais", ) @@ -967,7 +969,7 @@ def test_search_services_with_zone_diffusion_region( latitude=51.034368, longitude=2.376776, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.REGION.value, + zone_diffusion_type=schema.ZoneDiffusionType.REGION.value, zone_diffusion_code="32", zone_diffusion_nom="Nord", ) @@ -977,7 +979,7 @@ def test_search_services_with_zone_diffusion_region( latitude=50.277500, longitude=3.973400, modes_accueil=[schema.ModeAccueil.EN_PRESENTIEL.value], - zone_diffusion_type=schema.TypeCOG.REGION.value, + zone_diffusion_type=schema.ZoneDiffusionType.REGION.value, zone_diffusion_code="44", zone_diffusion_nom="Grand Est", ) diff --git a/docker-compose.yml b/docker-compose.yml index 
09d027d9f..8340683a7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,7 +17,7 @@ x-airflow-common: AIRFLOW__WEBSERVER__WORKERS: 1 # Connections - AIRFLOW_CONN_S3: aws://@/data-inclusion-lake?endpoint_url=http://minio:9000&aws_access_key_id=minioadmin&aws_secret_access_key=minioadmin + AIRFLOW_CONN_S3: aws://@/data-inclusion-lake?endpoint_url=http%3A%2F%2Fminio%3A9000&aws_access_key_id=minioadmin&aws_secret_access_key=minioadmin AIRFLOW_CONN_S3_SOURCES: ${AIRFLOW_CONN_S3_SOURCES} AIRFLOW_CONN_PG: postgresql://data-inclusion:data-inclusion@target-db:5432/data-inclusion @@ -27,7 +27,8 @@ x-airflow-common: AIRFLOW_VAR_DBT_PROJECT_DIR: /opt/airflow/dbt AIRFLOW_VAR_BAN_API_URL: ${BAN_API_URL} AIRFLOW_VAR_CD35_FILE_URL: ${CD35_FILE_URL} - AIRFLOW_VAR_CD72_FILE_URL: ${CD72_FILE_URL} + AIRFLOW_VAR_CD72_STRUCTURES_FILE_URL: ${CD72_STRUCTURES_FILE_URL} + AIRFLOW_VAR_CD72_SERVICES_FILE_URL: ${CD72_SERVICES_FILE_URL} AIRFLOW_VAR_DATAGOUV_API_KEY: ${DATAGOUV_API_KEY} AIRFLOW_VAR_DATAGOUV_API_URL: ${DATAGOUV_API_URL} AIRFLOW_VAR_DATAGOUV_DI_DATASET_ID: ${DATAGOUV_DI_DATASET_ID} @@ -40,6 +41,7 @@ x-airflow-common: AIRFLOW_VAR_EMPLOIS_API_URL: ${EMPLOIS_API_URL} AIRFLOW_VAR_ETAB_PUB_FILE_URL: ${ETAB_PUB_FILE_URL} AIRFLOW_VAR_FINESS_FILE_URL: ${FINESS_FILE_URL} + AIRFLOW_VAR_GRIST_API_TOKEN: ${GRIST_API_TOKEN} AIRFLOW_VAR_IGN_ADMIN_EXPRESS_FILE_URL: ${IGN_ADMIN_EXPRESS_FILE_URL} AIRFLOW_VAR_IMMERSION_FACILITEE_S3_KEY_PREFIX: ${IMMERSION_FACILITEE_S3_KEY_PREFIX} AIRFLOW_VAR_INSEE_FIRSTNAME_FILE_URL: ${INSEE_FIRSTNAME_FILE_URL} @@ -84,6 +86,7 @@ x-airflow-common: AIRFLOW_VAR_SOLIGUIDE_API_TOKEN: ${SOLIGUIDE_API_TOKEN} AIRFLOW_VAR_SOLIGUIDE_API_URL: ${SOLIGUIDE_API_URL} AIRFLOW_VAR_UN_JEUNE_UNE_SOLUTION_API_URL: ${UN_JEUNE_UNE_SOLUTION_API_URL} + AIRFLOW_VAR_RESEAU_ALPHA_URL: ${RESEAU_ALPHA_URL} # make the data_inclusion package available in editable mode PYTHONPATH: $${PYTHONPATH}:/opt/airflow/data-inclusion/src diff --git a/pipeline/CONTRIBUTING.md 
b/pipeline/CONTRIBUTING.md index 5d75d51dc..a4f0f5b22 100644 --- a/pipeline/CONTRIBUTING.md +++ b/pipeline/CONTRIBUTING.md @@ -30,7 +30,7 @@ You can run dbt commands from your terminal. ```bash # install dbt -pipx install --include-deps dbt-postgres==1.4.5 +pipx install --include-deps dbt-postgres==1.6.1 # install extra dbt packages (e.g. dbt_utils) dbt deps @@ -67,6 +67,8 @@ python scripts/update_schema_seeds.py ## Project requirements +* `pip-compile~=7.3` + ### airflow These requirements are mainly used for the deployment on scalingo. @@ -80,10 +82,10 @@ To update the constraints and upgrade the requirements: ```bash # optionally bump the airflow version -export AIRFLOW_VERSION= -export PYTHON_VERSION=3.10 +AIRFLOW_VERSION= +PYTHON_VERSION=3.10 curl https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt > requirements/airflow/constraints.txt -pip-compile --resolver=backtracking --upgrade requirements/airflow/requirements.in --output-file requirements/airflow/requirements.txt +pip-compile --upgrade requirements/airflow/requirements.in ``` ### tasks @@ -99,15 +101,15 @@ To add or delete a dependency to these requirements: ```bash # 1. edit the target requirements/tasks/...../requirements.in # 2. 
compile the dependencies -pip-compile --resolver=backtracking requirements/tasks/dbt/requirements.in --output-file requirements/tasks/dbt/requirements.txt -pip-compile --resolver=backtracking requirements/tasks/python/requirements.in --output-file requirements/tasks/python/requirements.txt +pip-compile requirements/tasks/dbt/requirements.in +pip-compile requirements/tasks/python/requirements.in ``` To upgrade these requirements: ```bash -pip-compile --resolver=backtracking --upgrade requirements/tasks/dbt/requirements.in --output-file requirements/tasks/dbt/requirements.txt -pip-compile --resolver=backtracking --upgrade requirements/tasks/python/requirements.in --output-file requirements/tasks/python/requirements.txt +pip-compile --upgrade requirements/tasks/dbt/requirements.in +pip-compile --upgrade requirements/tasks/python/requirements.in ``` Then you should update the dev requirements. @@ -122,11 +124,11 @@ To add or delete a dependency to these dev requirements: ```bash # 1. edit the target requirements/dev/requirements.in # 2. 
compile the dependencies -pip-compile --resolver=backtracking requirements/dev/requirements.in --output-file requirements/dev/requirements.txt +pip-compile requirements/dev/requirements.in ``` To upgrade these requirements: ```bash -pip-compile --resolver=backtracking --upgrade requirements/dev/requirements.in --output-file requirements/dev/requirements.txt +pip-compile --upgrade requirements/dev/requirements.in ``` \ No newline at end of file diff --git a/pipeline/Dockerfile b/pipeline/Dockerfile index cc9c2649a..4206ba85e 100644 --- a/pipeline/Dockerfile +++ b/pipeline/Dockerfile @@ -41,7 +41,7 @@ RUN "${VIRTUAL_ENV}/bin/python" -m pip install --no-cache-dir -r requirements/ta ######## # This image is the runtime ######## -FROM apache/airflow:2.6.1-python3.10 +FROM apache/airflow:2.7.0-python3.10 ENV PYTHONUNBUFFERED 1 ENV PYTHONDONTWRITEBYTECODE 1 diff --git a/pipeline/dags/dags/settings.py b/pipeline/dags/dags/settings.py index 865aea880..dbabc1218 100644 --- a/pipeline/dags/dags/settings.py +++ b/pipeline/dags/dags/settings.py @@ -82,18 +82,6 @@ }, ], }, - { - "id": "cd72", - "schedule_interval": "@once", - "snapshot": False, - "streams": [ - { - "id": "rows", - "filename": "rows.xlsx", - "url": Variable.get("CD72_FILE_URL", None), - }, - ], - }, { "id": "emplois-de-linclusion", "schedule_interval": "@daily", @@ -223,6 +211,23 @@ }, ], }, + { + "id": "reseau-alpha", + "schedule_interval": "@once", + "snapshot": False, + "streams": [ + { + "id": "structures", + "filename": "structures.tar.gz", + "url": Variable.get("RESEAU_ALPHA_URL", None), + }, + { + "id": "formations", + "filename": "formations.tar.gz", + "url": Variable.get("RESEAU_ALPHA_URL", None), + }, + ], + }, { "id": "agefiph", "schedule_interval": "@daily", @@ -257,4 +262,23 @@ }, ], }, + { + "id": "cd72", + "schedule_interval": "@once", + "snapshot": False, + "streams": [ + { + "id": "structures", + "filename": "structures.csv", + "url": Variable.get("CD72_STRUCTURES_FILE_URL", None), + "token": 
Variable.get("GRIST_API_TOKEN", None), + }, + { + "id": "services", + "filename": "services.csv", + "url": Variable.get("CD72_SERVICES_FILE_URL", None), + "token": Variable.get("GRIST_API_TOKEN", None), + }, + ], + }, ] diff --git a/pipeline/dags/import_sources.py b/pipeline/dags/import_sources.py index 3a3a8c1f4..1d8e181b0 100644 --- a/pipeline/dags/import_sources.py +++ b/pipeline/dags/import_sources.py @@ -58,8 +58,10 @@ def _extract( from data_inclusion.scripts.tasks import ( dora, emplois_de_linclusion, + grist, mediation_numerique, mes_aides, + reseau_alpha, soliguide, utils, ) @@ -72,7 +74,7 @@ def _extract( "agefiph": utils.extract_http_content, "annuaire-du-service-public": utils.extract_http_content, "cd35": utils.extract_http_content, - "cd72": utils.extract_http_content, + "cd72": grist.extract, "data-inclusion": utils.extract_http_content, "dora": dora.extract, "emplois-de-linclusion": emplois_de_linclusion.extract, @@ -82,10 +84,16 @@ def _extract( "un-jeune-une-solution": utils.extract_http_content, "soliguide": soliguide.extract, "monenfant": utils.extract_http_content, + "reseau-alpha": { + "structures": reseau_alpha.extract_structures, + "formations": reseau_alpha.extract_formations, + }, } if source_config["id"].startswith("mediation-numerique-"): extract_fn = mediation_numerique.extract + elif isinstance(EXTRACT_FN_BY_SOURCE_ID[source_config["id"]], dict): + extract_fn = EXTRACT_FN_BY_SOURCE_ID[source_config["id"]][stream_config["id"]] else: extract_fn = EXTRACT_FN_BY_SOURCE_ID[source_config["id"]] @@ -131,6 +139,7 @@ def _load( agefiph, annuaire_du_service_public, monenfant, + reseau_alpha, soliguide, utils, ) @@ -138,7 +147,7 @@ def _load( READ_FN_BY_SOURCE_ID = { "annuaire-du-service-public": annuaire_du_service_public.read, "cd35": lambda path: utils.read_csv(path, sep=";"), - "cd72": lambda path: utils.read_excel(path, sheet_name="Structures"), + "cd72": lambda path: utils.read_csv(path, sep=","), "data-inclusion": utils.read_json, "dora": 
utils.read_json, "emplois-de-linclusion": utils.read_json, @@ -148,6 +157,10 @@ def _load( "un-jeune-une-solution": utils.read_json, "soliguide": soliguide.read, "monenfant": monenfant.read, + "reseau-alpha": { + "structures": reseau_alpha.read_structures, + "formations": reseau_alpha.read_formations, + }, "agefiph": { "services": agefiph.read, "structures": lambda path: utils.read_csv(path, sep=","), diff --git a/pipeline/dbt/macros/domain/checks/check_service.sql b/pipeline/dbt/macros/domain/checks/check_service.sql index 66431b9dc..d4b2d6978 100644 --- a/pipeline/dbt/macros/domain/checks/check_service.sql +++ b/pipeline/dbt/macros/domain/checks/check_service.sql @@ -11,11 +11,13 @@ CREATE OR REPLACE FUNCTION LIST_SERVICE_ERRORS( frais TEXT[], frais_autres TEXT, id TEXT, - justificatifs TEXT, + justificatifs TEXT[], lien_source TEXT, modes_accueil TEXT[], modes_orientation_accompagnateur TEXT[], + modes_orientation_accompagnateur_autres TEXT, modes_orientation_beneficiaire TEXT[], + modes_orientation_beneficiaire_autres TEXT, nom TEXT, presentation_detail TEXT, presentation_resume TEXT, @@ -30,7 +32,7 @@ CREATE OR REPLACE FUNCTION LIST_SERVICE_ERRORS( zone_diffusion_code TEXT, zone_diffusion_nom TEXT, zone_diffusion_type TEXT, - pre_requis TEXT + pre_requis TEXT[] ) RETURNS TABLE (field TEXT, value TEXT) AS $$ DECLARE @@ -50,7 +52,7 @@ BEGIN ("modes_orientation_accompagnateur", "modes_orientation_accompagnateur IS NULL OR modes_orientation_accompagnateur <@ ARRAY(SELECT m.value FROM " ~ ref('modes_orientation_accompagnateur') ~ "AS m)"), ("modes_orientation_beneficiaire", "modes_orientation_beneficiaire IS NULL OR modes_orientation_beneficiaire <@ ARRAY(SELECT m.value FROM " ~ ref('modes_orientation_beneficiaire') ~ "AS m)"), ("zone_diffusion_code", "zone_diffusion_code IS NULL OR zone_diffusion_code ~ '^(\d{9}|\w{5}|\w{2,3}|\d{2})$'"), - ("zone_diffusion_type", "zone_diffusion_type IS NULL OR zone_diffusion_type IN (SELECT t.value FROM " ~ ref('types_cog') ~ "AS 
t)"), + ("zone_diffusion_type", "zone_diffusion_type IS NULL OR zone_diffusion_type IN (SELECT t.value FROM " ~ ref('zones_de_diffusion_types') ~ "AS t)"), ] %} @@ -99,7 +101,9 @@ WITH final AS ( lien_source, modes_accueil, modes_orientation_accompagnateur, + modes_orientation_accompagnateur_autres, modes_orientation_beneficiaire, + modes_orientation_beneficiaire_autres, nom, presentation_detail, presentation_resume, diff --git a/pipeline/dbt/models/_sources.yml b/pipeline/dbt/models/_sources.yml index 8d1214a43..0a47d78bc 100644 --- a/pipeline/dbt/models/_sources.yml +++ b/pipeline/dbt/models/_sources.yml @@ -180,7 +180,8 @@ sources: - name: cd72 schema: cd72 tables: - - name: rows + - name: structures + - name: services - name: emplois_de_linclusion schema: emplois_de_linclusion @@ -473,4 +474,10 @@ sources: schema: agefiph tables: - name: services - - name: structures \ No newline at end of file + - name: structures + + - name: reseau_alpha + schema: reseau_alpha + tables: + - name: structures + - name: formations \ No newline at end of file diff --git a/pipeline/dbt/models/datalake.sql b/pipeline/dbt/models/datalake.sql index c77d4d451..628e09ab2 100644 --- a/pipeline/dbt/models/datalake.sql +++ b/pipeline/dbt/models/datalake.sql @@ -12,7 +12,8 @@ WITH source AS ( relations=[ source('annuaire_du_service_public', 'etablissements'), source('cd35', 'organisations'), - source('cd72', 'rows'), + source('cd72', 'structures'), + source('cd72', 'services'), source('dora', 'structures'), source('dora', 'services'), source('emplois_de_linclusion', 'siaes'), diff --git a/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml b/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml index 87c0ce873..8bb6471ef 100644 --- a/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml +++ b/pipeline/dbt/models/intermediate/agefiph/_agefiph__models.yml @@ -53,4 +53,4 @@ models: - not_null - relationships: to: ref('int_agefiph__adresses') - field: id \ No newline 
at end of file + field: id diff --git a/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql b/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql index daadc3aa2..6e35397a0 100644 --- a/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql +++ b/pipeline/dbt/models/intermediate/agefiph/int_agefiph__services.sql @@ -54,11 +54,8 @@ final AS ( TRUE AS "contact_public", NULL AS "contact_nom_prenom", structures.courriel AS "courriel", - NULL AS "cumulable", - NULL AS "date_suspension", NULL AS "formulaire_en_ligne", NULL AS "frais_autres", - NULL AS "justificatifs", services.attributes__title AS "nom", services.attributes__field_titre_card_employeur AS "presentation_resume", NULL AS "prise_rdv", @@ -69,7 +66,12 @@ final AS ( regions."REG" AS "zone_diffusion_code", regions."LIBELLE" AS "zone_diffusion_nom", 'region' AS "zone_diffusion_type", - NULL AS "pre_requis", + NULL AS "modes_orientation_accompagnateur_autres", + NULL AS "modes_orientation_beneficiaire_autres", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", + CAST(NULL AS BOOLEAN) AS "cumulable", + CAST(NULL AS DATE) AS "date_suspension", 'https://www.agefiph.fr' || services.attributes__path__alias AS "lien_source", CAST(services.attributes__created AS DATE) AS "date_creation", CAST(services.attributes__changed AS DATE) AS "date_maj", diff --git a/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml b/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml index c50dda219..f80244d46 100644 --- a/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml +++ b/pipeline/dbt/models/intermediate/cd35/_cd35__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_cd35__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - 
not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_cd35__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml b/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml index 9ba61cf56..8b1df2eb2 100644 --- a/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml +++ b/pipeline/dbt/models/intermediate/cd72/_cd72__models.yml @@ -6,9 +6,51 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + + - name: int_cd72__services + tests: + - check_service: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_cd72__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_cd72__adresses') + field: id - name: int_cd72__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_cd72__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql index ffaa90b03..9be94f983 100644 --- a/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql +++ b/pipeline/dbt/models/intermediate/cd72/int_cd72__adresses.sql @@ -1,25 +1,43 @@ -WITH raw_rows AS ( - SELECT * FROM {{ ref('stg_cd72__rows') }} +WITH structures AS ( + SELECT * FROM {{ ref('stg_cd72__structures') }} ), -rows_with_id AS ( - SELECT * - FROM raw_rows - WHERE id IS NOT NULL +services AS ( + SELECT * FROM {{ ref('stg_cd72__services') }} ), -final AS ( +structure_adresses AS ( + SELECT + id AS "id", + commune AS "commune", + code_postal AS 
"code_postal", + NULL AS "code_insee", + adresse AS "adresse", + NULL AS "complement_adresse", + _di_source_id AS "source", + CAST(NULL AS FLOAT) AS "longitude", + CAST(NULL AS FLOAT) AS "latitude" + FROM structures +), + +service_adresses AS ( SELECT - id AS "id", - ville AS "commune", - code_postal AS "code_postal", - NULL AS "code_insee", - adresse AS "adresse", - NULL AS "complement_adresse", - NULL::FLOAT AS "longitude", - NULL::FLOAT AS "latitude", - _di_source_id AS "source" - FROM rows_with_id + id AS "id", + commune AS "commune", + code_postal AS "code_postal", + NULL AS "code_insee", + adresse AS "adresse", + NULL AS "complement_adresse", + _di_source_id AS "source", + CAST(NULL AS FLOAT) AS "longitude", + CAST(NULL AS FLOAT) AS "latitude" + FROM services +), + +final AS ( + SELECT * FROM structure_adresses + UNION ALL + SELECT * FROM service_adresses ) SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql new file mode 100644 index 000000000..f59032148 --- /dev/null +++ b/pipeline/dbt/models/intermediate/cd72/int_cd72__services.sql @@ -0,0 +1,44 @@ +WITH services AS ( + SELECT * FROM {{ ref('stg_cd72__services') }} +), + +final AS ( + SELECT + id AS "adresse_id", + TRUE AS "contact_public", + contact_nom_prenom AS "contact_nom_prenom", -- ignored for now + courriel AS "courriel", -- ignored for now + date_creation AS "date_creation", + date_maj AS "date_maj", + date_suspension AS "date_suspension", + NULL AS "formulaire_en_ligne", + frais_autres AS "frais_autres", + id AS "id", + NULL AS "lien_source", + NULL AS "modes_orientation_accompagnateur_autres", + modes_orientation_beneficiaire_autres AS "modes_orientation_beneficiaire_autres", + nom AS "nom", + presentation_resume AS "presentation_resume", + presentation_detail AS "presentation_detail", + NULL AS "prise_rdv", + profils AS "profils", + recurrence AS "recurrence", + _di_source_id AS "source", + 
structure_id AS "structure_id", + telephone AS "telephone", + thematiques AS "thematiques", + zone_diffusion_code AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", + zone_diffusion_type AS "zone_diffusion_type", + CAST(NULL AS BOOLEAN) AS "cumulable", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "modes_accueil", + CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", + CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", + CAST(NULL AS TEXT []) AS "types", + CASE WHEN pre_requis IS NOT NULL THEN ARRAY[pre_requis] END AS "pre_requis", + CAST(NULL AS TEXT []) AS "frais" + FROM services +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql b/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql index f790627a2..1e0eb123f 100644 --- a/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql +++ b/pipeline/dbt/models/intermediate/cd72/int_cd72__structures.sql @@ -1,45 +1,33 @@ -WITH raw_rows AS ( - SELECT * FROM {{ ref('stg_cd72__rows') }} -), - -rows_with_id AS ( - SELECT * - FROM raw_rows - WHERE id IS NOT NULL +WITH structures AS ( + SELECT * FROM {{ ref('stg_cd72__structures') }} ), final AS ( SELECT - id AS "id", - id AS "adresse_id", - siret AS "siret", - NULL::BOOLEAN AS "antenne", - NULL AS "rna", - nom_structure AS "nom", - email_accueil AS "courriel", - site_internet AS "site_web", - _di_source_id AS "source", - NULL AS "lien_source", - horaires AS "horaires_ouverture", - NULL AS "accessibilite", - NULL::TEXT [] AS "labels_autres", - NULL::TEXT [] AS "thematiques", - NULL AS "typologie", - mise_a_jour_le::DATE AS "date_maj", - COALESCE(telephone_accueil, telephone_principal) AS "telephone", + NULL AS "accessibilite", + id AS "adresse_id", + courriel AS "courriel", + date_maj AS "date_maj", + horaires_ouverture AS "horaires_ouverture", + id AS "id", + NULL AS "lien_source", + nom AS "nom", + presentation_detail AS "presentation_detail", + NULL AS 
"presentation_resume", + NULL AS "rna", + siret AS "siret", + site_web AS "site_web", + _di_source_id AS "source", + telephone AS "telephone", + typologie AS "typologie", + CAST(NULL AS BOOLEAN) AS "antenne", + CAST(NULL AS TEXT []) AS "labels_autres", + CAST(NULL AS TEXT []) AS "thematiques", CASE - WHEN typologie_structure ~ 'AFPA' THEN ARRAY['afpa'] - WHEN typologie_structure ~ 'Mission Locale' THEN ARRAY['mission-locale'] - END AS "labels_nationaux", - CASE LENGTH(description) <= 280 - WHEN TRUE THEN description - WHEN FALSE THEN LEFT(description, 279) || '…' - END AS "presentation_resume", - CASE LENGTH(description) <= 280 - WHEN TRUE THEN NULL - WHEN FALSE THEN description - END AS "presentation_detail" - FROM rows_with_id + WHEN typologie = 'AFPA' THEN ARRAY['afpa'] + WHEN typologie = 'ML' THEN ARRAY['mission-locale'] + END AS "labels_nationaux" + FROM structures ) SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml b/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml index 6ebd9ba16..e25fb6a0d 100644 --- a/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml +++ b/pipeline/dbt/models/intermediate/data_inclusion/_data_inclusion__models.yml @@ -55,4 +55,4 @@ models: - not_null - relationships: to: ref('int_data_inclusion__adresses') - field: id \ No newline at end of file + field: id diff --git a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql index 75e77a173..0785071a0 100644 --- a/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql +++ b/pipeline/dbt/models/intermediate/data_inclusion/int_data_inclusion__services.sql @@ -17,9 +17,9 @@ di_profil_by_dora_profil AS ( final AS ( SELECT id AS "adresse_id", - contact_public AS "contact_public", - NULL AS "contact_nom_prenom", - NULL AS "courriel", + TRUE AS 
"contact_public", + contact_nom AS "contact_nom_prenom", + courriel AS "courriel", cumulable AS "cumulable", date_creation::DATE AS "date_creation", date_maj::DATE AS "date_maj", @@ -31,7 +31,9 @@ final AS ( NULL AS "lien_source", -- ignored modes_accueil AS "modes_accueil", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", nom AS "nom", presentation_resume AS "presentation_resume", presentation_detail AS "presentation_detail", diff --git a/pipeline/dbt/models/intermediate/dora/_dora__models.yml b/pipeline/dbt/models/intermediate/dora/_dora__models.yml index d1c4eaa50..622db6004 100644 --- a/pipeline/dbt/models/intermediate/dora/_dora__models.yml +++ b/pipeline/dbt/models/intermediate/dora/_dora__models.yml @@ -6,15 +6,51 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_dora__services tests: - check_service: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_dora__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_dora__adresses') + field: id - name: int_dora__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_dora__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql index 2a08c81cb..81aaa8d37 100644 --- a/pipeline/dbt/models/intermediate/dora/int_dora__services.sql +++ 
b/pipeline/dbt/models/intermediate/dora/int_dora__services.sql @@ -9,49 +9,51 @@ di_profil_by_dora_profil AS ( VALUES ('Adultes', 'adultes'), ('Femmes', 'femmes'), - ('Public bénéficiaire du Revenu de Solidarité Active (RSA)', 'beneficiaire-rsa'), - ('Demandeur d''emploi', 'demandeur-demploi') + ('Public bénéficiaire du Revenu de Solidarité Active (RSA)', 'beneficiaires-rsa'), + ('Demandeur d''emploi', 'demandeurs-demploi') ) AS x (dora_profil, di_profil) ), final AS ( SELECT - id AS "adresse_id", - contact_public AS "contact_public", - NULL AS "contact_nom_prenom", -- ignored for now - NULL AS "courriel", -- ignored for now - cumulable AS "cumulable", - date_creation::DATE AS "date_creation", - date_maj::DATE AS "date_maj", - date_suspension::DATE AS "date_suspension", - formulaire_en_ligne AS "formulaire_en_ligne", - frais_autres AS "frais_autres", - id AS "id", - justificatifs AS "justificatifs", - lien_source AS "lien_source", - modes_accueil AS "modes_accueil", - NULL::TEXT [] AS "modes_orientation_accompagnateur", - NULL::TEXT [] AS "modes_orientation_beneficiaire", - nom AS "nom", - presentation_resume AS "presentation_resume", - presentation_detail AS "presentation_detail", - prise_rdv AS "prise_rdv", + id AS "adresse_id", + contact_public AS "contact_public", + NULL AS "contact_nom_prenom", -- ignored for now + NULL AS "courriel", -- ignored for now + cumulable AS "cumulable", + date_creation::DATE AS "date_creation", + date_maj::DATE AS "date_maj", + date_suspension::DATE AS "date_suspension", + formulaire_en_ligne AS "formulaire_en_ligne", + frais_autres AS "frais_autres", + id AS "id", + justificatifs AS "justificatifs", + lien_source AS "lien_source", + modes_accueil AS "modes_accueil", + modes_orientation_accompagnateur AS "modes_orientation_accompagnateur", + modes_orientation_accompagnateur_autres AS "modes_orientation_accompagnateur_autres", + modes_orientation_beneficiaire AS "modes_orientation_beneficiaire", + 
modes_orientation_beneficiaire_autres AS "modes_orientation_beneficiaire_autres", + nom AS "nom", + presentation_resume AS "presentation_resume", + presentation_detail AS "presentation_detail", + prise_rdv AS "prise_rdv", ARRAY( SELECT di_profil_by_dora_profil.di_profil FROM di_profil_by_dora_profil WHERE di_profil_by_dora_profil.dora_profil = ANY(services.profils) - )::TEXT [] AS "profils", - recurrence AS "recurrence", - _di_source_id AS "source", - structure_id AS "structure_id", - NULL AS "telephone", -- ignored for now - thematiques AS "thematiques", - types AS "types", - zone_diffusion_code AS "zone_diffusion_code", - zone_diffusion_nom AS "zone_diffusion_nom", - zone_diffusion_type AS "zone_diffusion_type", - pre_requis AS "pre_requis", - ARRAY[frais] AS "frais" + )::TEXT [] AS "profils", + recurrence AS "recurrence", + _di_source_id AS "source", + structure_id AS "structure_id", + NULL AS "telephone", -- ignored for now + thematiques AS "thematiques", + types AS "types", + zone_diffusion_code AS "zone_diffusion_code", + zone_diffusion_nom AS "zone_diffusion_nom", + zone_diffusion_type AS "zone_diffusion_type", + pre_requis AS "pre_requis", + ARRAY[frais] AS "frais" FROM services ) diff --git a/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml b/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml index 4a4ae0c77..ee5636969 100644 --- a/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml +++ b/pipeline/dbt/models/intermediate/emplois_de_linclusion/_emplois_de_linclusion__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_emplois_de_linclusion__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - 
dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_emplois_de_linclusion__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/finess/_finess__models.yml b/pipeline/dbt/models/intermediate/finess/_finess__models.yml index 2e0357fe6..db981f1ec 100644 --- a/pipeline/dbt/models/intermediate/finess/_finess__models.yml +++ b/pipeline/dbt/models/intermediate/finess/_finess__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_finess__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_finess__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml b/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml index 180cbb11d..13bcf3bed 100644 --- a/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml +++ b/pipeline/dbt/models/intermediate/immersion_facilitee/_immersion_facilitee__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_immersion_facilitee__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_immersion_facilitee__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql 
b/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql index 978ed7a99..61f96f5e4 100644 --- a/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql +++ b/pipeline/dbt/models/intermediate/immersion_facilitee/int_immersion_facilitee__adresses.sql @@ -5,14 +5,14 @@ WITH structures AS ( final AS ( SELECT id AS "id", - NULL AS "longitude", - NULL AS "latitude", 'immersion-facilitee' AS "source", NULL AS "complement_adresse", city AS "commune", street_number_and_address AS "adresse", post_code AS "code_postal", - NULL AS "code_insee" + NULL AS "code_insee", + CAST(NULL AS FLOAT) AS "longitude", + CAST(NULL AS FLOAT) AS "latitude" FROM structures ) diff --git a/pipeline/dbt/models/intermediate/int__union_adresses.sql b/pipeline/dbt/models/intermediate/int__union_adresses.sql index 2faf5f999..13d499d00 100644 --- a/pipeline/dbt/models/intermediate/int__union_adresses.sql +++ b/pipeline/dbt/models/intermediate/int__union_adresses.sql @@ -14,6 +14,7 @@ WITH adresses AS ( ref('int_mes_aides__adresses'), ref('int_monenfant__adresses'), ref('int_odspep__adresses'), + ref('int_reseau_alpha__adresses'), ref('int_siao__adresses'), ref('int_soliguide__adresses'), ], diff --git a/pipeline/dbt/models/intermediate/int__union_services.sql b/pipeline/dbt/models/intermediate/int__union_services.sql index c9a5f4bee..b7eab9bf9 100644 --- a/pipeline/dbt/models/intermediate/int__union_services.sql +++ b/pipeline/dbt/models/intermediate/int__union_services.sql @@ -3,11 +3,13 @@ WITH services AS ( dbt_utils.union_relations( relations=[ ref('int_agefiph__services'), + ref('int_cd72__services'), ref('int_data_inclusion__services'), ref('int_dora__services'), ref('int_mediation_numerique__services'), ref('int_monenfant__services'), ref('int_odspep__services'), + ref('int_reseau_alpha__services'), ref('int_soliguide__services'), ], column_override={ @@ -17,9 +19,11 @@ WITH services AS ( "date_maj": "DATE", 
"date_suspension": "DATE", "frais": "TEXT[]", + "justificatifs": "TEXT[]", "modes_accueil": "TEXT[]", "modes_orientation_accompagnateur": "TEXT[]", "modes_orientation_beneficiaire": "TEXT[]", + "pre_requis": "TEXT[]", "profils": "TEXT[]", "thematiques": "TEXT[]", "types": "TEXT[]", diff --git a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql index 2dfed33a2..9db5ac58c 100644 --- a/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql +++ b/pipeline/dbt/models/intermediate/int__union_services__enhanced.sql @@ -10,16 +10,22 @@ adresses AS ( SELECT * FROM {{ ref('int__union_adresses__enhanced') }} ), +departements AS ( + SELECT * FROM {{ source('insee', 'departements') }} +), + -- TODO: Refactoring needed to be able to do geocoding per source and then use the result in the mapping services_with_zone_diffusion AS ( SELECT {{ dbt_utils.star(from=ref('int__union_services'), relation_alias='services', except=["zone_diffusion_code", "zone_diffusion_nom"]) }}, - CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) - WHEN TRUE THEN adresses.result_citycode + CASE + WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' THEN adresses.result_citycode + WHEN services.source = 'reseau-alpha' THEN LEFT(adresses.result_citycode, 2) ELSE services.zone_diffusion_code END AS "zone_diffusion_code", - CASE services.source = ANY(ARRAY['monenfant', 'soliguide']) - WHEN TRUE THEN adresses.commune + CASE + WHEN services.source = ANY(ARRAY['monenfant', 'soliguide']) OR services.source ~ 'mediation-numerique' THEN adresses.commune + WHEN services.source = 'reseau-alpha' THEN (SELECT departements."LIBELLE" FROM departements WHERE departements."DEP" = LEFT(adresses.result_citycode, 2)) ELSE services.zone_diffusion_nom END AS "zone_diffusion_nom" FROM @@ -52,7 +58,9 @@ valid_services AS ( lien_source, modes_accueil, 
modes_orientation_accompagnateur, + modes_orientation_accompagnateur_autres, modes_orientation_beneficiaire, + modes_orientation_beneficiaire_autres, nom, presentation_detail, presentation_resume, diff --git a/pipeline/dbt/models/intermediate/int__union_structures.sql b/pipeline/dbt/models/intermediate/int__union_structures.sql index 4d870dc6b..fe6e264b1 100644 --- a/pipeline/dbt/models/intermediate/int__union_structures.sql +++ b/pipeline/dbt/models/intermediate/int__union_structures.sql @@ -14,6 +14,7 @@ WITH structures AS ( ref('int_mes_aides__structures'), ref('int_monenfant__structures'), ref('int_odspep__structures'), + ref('int_reseau_alpha__structures'), ref('int_siao__structures'), ref('int_soliguide__structures'), ], diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml b/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml index 57cf1db5e..0a84c83b7 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml +++ b/pipeline/dbt/models/intermediate/mediation_numerique/_mediation_numerique_models.yml @@ -3,18 +3,63 @@ version: 2 models: - name: int_mediation_numerique__adresses tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - source + - id - check_adresse: config: severity: warn + columns: + - name: id + tests: + - not_null + - dbt_utils.not_empty_string - name: int_mediation_numerique__services tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - source + - id - check_service: config: severity: warn + columns: + - name: id + tests: + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_mediation_numerique__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_mediation_numerique__adresses') + field: id - name: int_mediation_numerique__structures tests: + - 
dbt_utils.unique_combination_of_columns: + combination_of_columns: + - source + - id - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_mediation_numerique__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql index dd8b7c615..a6e828a02 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql +++ b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__services.sql @@ -111,24 +111,32 @@ final AS ( services.structure_id AS "structure_id", services.thematiques AS "thematiques", services._di_source_id AS "source", - NULL AS "pre_requis", - NULL AS "cumulable", - NULL AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", + CAST(NULL AS BOOLEAN) AS "cumulable", + CAST(NULL AS TEXT []) AS "justificatifs", NULL AS "formulaire_en_ligne", NULL AS "recurrence", - NULL AS "date_creation", - NULL AS "date_suspension", + CAST(NULL AS DATE) AS "date_creation", + CAST(NULL AS DATE) AS "date_suspension", NULL AS "lien_source", structures.telephone AS "telephone", structures.courriel AS "courriel", TRUE AS "contact_public", NULL AS "contact_nom_prenom", - structures.date_maj AS "date_maj", - NULL AS "zone_diffusion_type", + CAST(structures.date_maj AS DATE) AS "date_maj", + 'departement' AS "zone_diffusion_type", NULL AS "zone_diffusion_code", NULL AS "zone_diffusion_nom", - CAST(NULL AS TEXT []) AS "modes_orientation_accompagnateur", - CAST(NULL AS TEXT []) AS "modes_orientation_beneficiaire", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN structures.telephone IS NOT NULL THEN 'telephoner' END, + CASE WHEN structures.courriel IS NOT NULL THEN 'envoyer-un-mail' 
END + ], + NULL + ) AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", + ARRAY_REMOVE(ARRAY[CASE WHEN structures.telephone IS NOT NULL THEN 'telephoner' END], NULL) AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", CAST(NULL AS TEXT) AS "frais_autres", CASE WHEN CARDINALITY(services.types) > 0 THEN services.types ELSE ARRAY['accompagnement'] END AS "types", ARRAY['en-presentiel'] AS "modes_accueil", diff --git a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql index e21b877a7..1534e6bc3 100644 --- a/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql +++ b/pipeline/dbt/models/intermediate/mediation_numerique/int_mediation_numerique__structures.sql @@ -58,7 +58,7 @@ final AS ( typologie AS "typologie", presentation_resume AS "presentation_resume", {{ truncate_text("presentation_detail") }} AS "presentation_detail", - date_maj AS "date_maj", + CAST(date_maj AS DATE) AS "date_maj", _di_source_id AS "source", labels_autres AS "labels_autres", CAST(NULL AS BOOLEAN) AS "antenne" diff --git a/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml b/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml index d968bcc25..25e577d77 100644 --- a/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml +++ b/pipeline/dbt/models/intermediate/mes_aides/_mes_aides__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_mes_aides__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - 
relationships: + to: ref('int_mes_aides__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml b/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml index b16800599..03c007d3e 100644 --- a/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml +++ b/pipeline/dbt/models/intermediate/monenfant/_monenfant__models.yml @@ -6,15 +6,51 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_monenfant__structures tests: - - check_service: + - check_structure: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_monenfant__adresses') + field: id - name: int_monenfant__services tests: - - check_structure: + - check_service: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_monenfant__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_monenfant__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql b/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql index 64e7a17de..f2ce461ed 100644 --- a/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql +++ b/pipeline/dbt/models/intermediate/monenfant/int_monenfant__services.sql @@ -26,9 +26,7 @@ final AS ( NULL::TEXT [] AS "profils", id AS "structure_id", _di_source_id AS "source", - NULL AS "pre_requis", TRUE AS "cumulable", - NULL AS "justificatifs", NULL AS "formulaire_en_ligne", details_infos_pratiques_jour_horaire AS "recurrence", NULL::DATE AS "date_creation", @@ -39,10 +37,14 @@ final AS ( NULL AS 
"contact_nom_prenom", derniere_modif_date AS "date_maj", 'commune' AS "zone_diffusion_type", - NULL AS "zone_diffusion_code", -- will be overridden after geocoding - NULL AS "zone_diffusion_nom", -- will be overridden after geocoding - NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", + NULL::TEXT [] AS "modes_orientation_accompagnateur", -- will be overridden after geocoding + NULL AS "modes_orientation_accompagnateur_autres", -- will be overridden after geocoding NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", + NULL::TEXT [] AS "pre_requis", + NULL::TEXT [] AS "justificatifs", CASE WHEN avip THEN 'Crèche À Vocation d''Insertion Professionnelle' ELSE nom END AS "nom", ARRAY['payant'] AS "frais", ARRAY['famille--garde-denfants'] AS "thematiques", diff --git a/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml b/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml index 64d10c05a..10c487fad 100644 --- a/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml +++ b/pipeline/dbt/models/intermediate/odspep/_odspep__models.yml @@ -6,6 +6,12 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_odspep__services tests: @@ -15,14 +21,39 @@ models: columns: - name: id tests: - - not_null - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_odspep__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_odspep__adresses') + field: id - name: int_odspep__structures tests: - check_structure: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_odspep__adresses') + field: id - name: 
int_odspep__zones_diffusion description: This model aggregates all the *_RESSOURCE ODSPEP tables in the same table, aligned on a common set of columns @@ -43,4 +74,4 @@ models: - name: group_number description: an identifier for the group to quickly visualize groups tests: - - not_null \ No newline at end of file + - not_null diff --git a/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql b/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql index fd07fbc83..c992235d9 100644 --- a/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql +++ b/pipeline/dbt/models/intermediate/odspep/int_odspep__services.sql @@ -24,22 +24,24 @@ final AS ( NULL AS "prise_rdv", NULL::TEXT [] AS "frais", NULL AS "frais_autres", - NULL AS "pre_requis", - NULL AS "cumulable", - NULL AS "justificatifs", + NULL::TEXT [] AS "pre_requis", + NULL::BOOLEAN AS "cumulable", + NULL::TEXT [] AS "justificatifs", NULL AS "formulaire_en_ligne", NULL AS "recurrence", - NULL AS "date_creation", + NULL::DATE AS "date_creation", date_fin_valid AS "date_suspension", NULL AS "lien_source", NULL AS "telephone", NULL AS "courriel", - NULL AS "contact_public", + NULL::BOOLEAN AS "contact_public", NULL AS "contact_nom_prenom", date_derniere_modif AS "date_maj", NULL::TEXT [] AS "modes_accueil", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", zone_diffusion_code AS "zone_diffusion_code", zone_diffusion_type AS "zone_diffusion_type", zone_diffusion_libelle AS "zone_diffusion_nom", diff --git a/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql b/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql index 5c2f0dd8c..21410e3b5 100644 --- a/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql +++ b/pipeline/dbt/models/intermediate/odspep/int_odspep__structures.sql @@ -8,7 +8,7 @@ 
final AS ( SELECT DISTINCT ON (1) id_res AS "id", id_res AS "adresse_id", - NULL AS "antenne", + NULL::BOOLEAN AS "antenne", NULL AS "rna", 'odspep' AS "source", NULL AS "horaires_ouverture", diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml b/pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml new file mode 100644 index 000000000..3e11c6fe4 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/_reseau_alpha__models.yml @@ -0,0 +1,56 @@ +version: 2 + +models: + - name: int_reseau_alpha__adresses + tests: + - check_adresse: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + + - name: int_reseau_alpha__services + tests: + - check_service: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_reseau_alpha__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_reseau_alpha__adresses') + field: id + + - name: int_reseau_alpha__structures + tests: + - check_structure: + config: + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_reseau_alpha__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql new file mode 100644 index 000000000..4bae773a6 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__adresses.sql @@ -0,0 +1,43 @@ +WITH structures AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__structures') }} +), + +formations AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__formations') }} +), + +structure_adresses AS ( + SELECT + _di_source_id AS "source", + 
adresses__longitude AS "longitude", + adresses__latitude AS "latitude", + NULL AS "complement_adresse", + adresses__ville AS "commune", + content__adresse AS "adresse", + adresses__code_postal AS "code_postal", + NULL AS "code_insee", + 'structure--' || id AS "id" + FROM structures +), + +formation_adresses AS ( + SELECT + _di_source_id AS "source", + adresses__longitude AS "longitude", + adresses__latitude AS "latitude", + NULL AS "complement_adresse", + adresses__ville AS "commune", + content__lieux_et_horaires_formation__adresse AS "adresse", + adresses__code_postal AS "code_postal", + NULL AS "code_insee", + 'service--' || id AS "id" + FROM formations +), + +final AS ( + SELECT * FROM structure_adresses + UNION ALL + SELECT * FROM formation_adresses +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql new file mode 100644 index 000000000..e94a2b76a --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__services.sql @@ -0,0 +1,94 @@ +WITH formations AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__formations') }} +), + +structures AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__structures') }} +), + +final AS ( + SELECT + TRUE AS "contact_public", + formations.content__contact_inscription__contact AS "contact_nom_prenom", + formations.content__contact_inscription__courriel AS "courriel", + formations.content__inscription__informations_en_ligne AS "formulaire_en_ligne", + NULL AS "frais_autres", + formations.nom AS "nom", + NULL AS "prise_rdv", + formations.content__lieux_et_horaires_formation__horaires AS "recurrence", + formations._di_source_id AS "source", + formations.structure_id AS "structure_id", + formations.content__contact_inscription__telephone AS "telephone", + NULL AS "zone_diffusion_code", + NULL AS "zone_diffusion_nom", -- FIXME + 'departement' AS "zone_diffusion_type", + TRUE AS 
"cumulable", + formations.url AS "lien_source", + formations.id AS "id", + formations.content__date_maj AS "date_maj", + NULL AS "modes_orientation_accompagnateur_autres", + NULL AS "modes_orientation_beneficiaire_autres", + CASE + WHEN LENGTH(formations.content__contenu_et_objectifs__titre) <= 280 + THEN formations.content__contenu_et_objectifs__titre + ELSE LEFT(formations.content__contenu_et_objectifs__titre, 279) || '…' + END AS "presentation_resume", + ARRAY_TO_STRING( + ARRAY[ + '# Contenu et objectifs de la formation', + formations.content__contenu_et_objectifs__titre, + formations.content__contenu_et_objectifs__objectifs, + formations.content__contenu_et_objectifs__niveau, + '# Public attendu', + formations.content__public_attendu__niveau, + formations.content__public_attendu__competences, + formations.content__public_attendu__type_de_public, + '# Inscription', + formations.content__inscription__places, + formations.content__inscription__entree_sortie, + '# Informations pratiques', + formations.content__informations_pratiques__etendue, + formations.content__informations_pratiques__volume, + formations.content__informations_pratiques__cout, + formations.content__informations_pratiques__prise_en_charge, + formations.content__informations_pratiques__remuneration, + formations.content__informations_pratiques__garde + ], + E'\n\n' + ) AS "presentation_detail", + 'service--' || formations.id AS "adresse_id", + CAST(NULL AS TEXT []) AS "justificatifs", + CAST(NULL AS TEXT []) AS "pre_requis", + CAST(NULL AS DATE) AS "date_suspension", + CAST(NULL AS DATE) AS "date_creation", + ARRAY_REMOVE( + ARRAY[ + 'apprendre-francais--suivre-formation', + CASE WHEN formations.activite = 'Français à visée professionnelle' THEN 'apprendre-francais--accompagnement-insertion-pro' END, + CASE WHEN formations.activite = 'Français à visée sociale et communicative' THEN 'apprendre-francais--communiquer-vie-tous-les-jours' END + ], + NULL + ) AS "thematiques", + ARRAY['en-presentiel'] 
AS "modes_accueil", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN formations.content__contact_inscription__courriel IS NOT NULL THEN 'envoyer-un-mail' END, + CASE WHEN formations.content__contact_inscription__telephone IS NOT NULL THEN 'telephoner' END + ], + NULL + ) AS "modes_orientation_accompagnateur", + ARRAY_REMOVE( + ARRAY[ + CASE WHEN formations.content__contact_inscription__courriel IS NOT NULL THEN 'envoyer-un-mail' END, + CASE WHEN formations.content__contact_inscription__telephone IS NOT NULL THEN 'telephoner' END + ], + NULL + ) AS "modes_orientation_beneficiaire", + ARRAY['public-langues-etrangeres'] AS "profils", + ARRAY['formation'] AS "types", + CAST(NULL AS TEXT []) AS "frais" + FROM formations + LEFT JOIN structures ON formations.structure_id = structures.id +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql new file mode 100644 index 000000000..48faac143 --- /dev/null +++ b/pipeline/dbt/models/intermediate/reseau_alpha/int_reseau_alpha__structures.sql @@ -0,0 +1,30 @@ +WITH structures AS ( + SELECT * FROM {{ ref('stg_reseau_alpha__structures') }} +), + +final AS ( + SELECT + NULL AS "accessibilite", + content__courriel AS "courriel", + NULL AS "horaires_ouverture", + id AS "id", + url AS "lien_source", + nom AS "nom", + description AS "presentation_detail", + NULL AS "presentation_resume", + NULL AS "rna", + NULL AS "siret", + content__site_web AS "site_web", + _di_source_id AS "source", + content__telephone AS "telephone", + NULL AS "typologie", + content__date_maj AS "date_maj", + 'structure--' || id AS "adresse_id", + CAST(NULL AS BOOLEAN) AS "antenne", + CAST(NULL AS TEXT []) AS "labels_autres", + CAST(NULL AS TEXT []) AS "labels_nationaux", + CAST(NULL AS TEXT []) AS "thematiques" + FROM structures +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/intermediate/siao/_siao__models.yml 
b/pipeline/dbt/models/intermediate/siao/_siao__models.yml index 0c79e4e50..1b340c95d 100644 --- a/pipeline/dbt/models/intermediate/siao/_siao__models.yml +++ b/pipeline/dbt/models/intermediate/siao/_siao__models.yml @@ -6,9 +6,27 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_siao__structures tests: - check_structure: config: - severity: warn \ No newline at end of file + severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_siao__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml b/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml index b91e06336..e3e1c1790 100644 --- a/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml +++ b/pipeline/dbt/models/intermediate/soliguide/_soliguide__models.yml @@ -6,12 +6,30 @@ models: - check_adresse: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string - name: int_soliguide__structures tests: - check_structure: config: severity: warn + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_soliguide__adresses') + field: id - name: int_soliguide__services tests: @@ -23,3 +41,16 @@ models: tests: - unique - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('int_soliguide__structures') + field: id + - name: adresse_id + tests: + - not_null + - relationships: + to: ref('int_soliguide__adresses') + field: id diff --git a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql index 8c70324c4..5f0d965b0 100644 --- 
a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql +++ b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__services.sql @@ -91,6 +91,8 @@ open_services AS ( SELECT * FROM relevant_services WHERE + NOT close__actif + OR (close__date_debut IS NOT NULL OR close__date_fin IS NOT NULL) AND ( @@ -115,14 +117,14 @@ final AS ( NULL::TEXT [] AS "frais", NULL AS "frais_autres", NULL::TEXT [] AS "profils", - NULL AS "pre_requis", + NULL::TEXT [] AS "pre_requis", TRUE AS "cumulable", - NULL AS "justificatifs", - NULL AS "date_creation", - NULL AS "date_suspension", + NULL::TEXT [] AS "justificatifs", + NULL::DATE AS "date_creation", + NULL::DATE AS "date_suspension", filtered_phones.phone_number AS "telephone", lieux.entity_mail AS "courriel", - NULL AS "contact_public", + NULL::BOOLEAN AS "contact_public", NULL AS "contact_nom_prenom", open_services.updated_at AS "date_maj", 'commune' AS "zone_diffusion_type", @@ -131,7 +133,9 @@ final AS ( NULL AS "formulaire_en_ligne", open_services.lieu_id AS "structure_id", NULL::TEXT [] AS "modes_orientation_accompagnateur", + NULL AS "modes_orientation_accompagnateur_autres", NULL::TEXT [] AS "modes_orientation_beneficiaire", + NULL AS "modes_orientation_beneficiaire_autres", ( SELECT di_thematique_by_soliguide_categorie_code.thematique FROM di_thematique_by_soliguide_categorie_code diff --git a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql index 5249ff9db..f48e6123c 100644 --- a/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql +++ b/pipeline/dbt/models/intermediate/soliguide/int_soliguide__structures.sql @@ -12,7 +12,7 @@ final AS ( SELECT lieux.lieu_id AS "id", lieux.lieu_id AS "adresse_id", - NULL AS "antenne", + NULL::BOOLEAN AS "antenne", NULL AS "rna", 'soliguide' AS "source", NULL AS "accessibilite", diff --git a/pipeline/dbt/models/marts/api/_api_models.yml 
b/pipeline/dbt/models/marts/api/_api_models.yml index f0f532ded..2c9052dab 100644 --- a/pipeline/dbt/models/marts/api/_api_models.yml +++ b/pipeline/dbt/models/marts/api/_api_models.yml @@ -143,11 +143,11 @@ models: - name: profils data_type: text[] - name: pre_requis - data_type: text + data_type: text[] - name: cumulable data_type: boolean - name: justificatifs - data_type: text + data_type: text[] - name: formulaire_en_ligne data_type: text - name: commune @@ -192,8 +192,12 @@ models: data_type: text[] - name: modes_orientation_accompagnateur data_type: text[] + - name: modes_orientation_accompagnateur_autres + data_type: text - name: modes_orientation_beneficiaire data_type: text[] + - name: modes_orientation_beneficiaire_autres + data_type: text - name: zone_diffusion_type data_type: text - name: zone_diffusion_code diff --git a/pipeline/dbt/models/marts/api/api_structure.sql b/pipeline/dbt/models/marts/api/api_structure.sql index ff8bf8da6..d5acb82ae 100644 --- a/pipeline/dbt/models/marts/api/api_structure.sql +++ b/pipeline/dbt/models/marts/api/api_structure.sql @@ -9,6 +9,8 @@ final AS ( relation_alias='structures', from=ref('int__union_structures__enhanced'), except=[ + '_di_sirene_date_fermeture', + '_di_sirene_etab_successeur', '_di_adresse_surrogate_id', '_di_annotated_antenne', '_di_annotated_siret', diff --git a/pipeline/dbt/models/marts/opendata/opendata_services.sql b/pipeline/dbt/models/marts/opendata/opendata_services.sql index 425f4db1a..f6b7b577b 100644 --- a/pipeline/dbt/models/marts/opendata/opendata_services.sql +++ b/pipeline/dbt/models/marts/opendata/opendata_services.sql @@ -1,5 +1,5 @@ WITH services AS ( - SELECT * FROM {{ ref('int__union_services') }} + SELECT * FROM {{ ref('int__union_services__enhanced') }} ), final AS ( @@ -7,7 +7,7 @@ final AS ( {{ dbt_utils.star( relation_alias='services', - from=ref('int__union_services'), + from=ref('int__union_services__enhanced'), except=['courriel', 'telephone']) }}, {{ obfuscate('courriel') }} 
AS "courriel", diff --git a/pipeline/dbt/models/marts/opendata/opendata_structures.sql b/pipeline/dbt/models/marts/opendata/opendata_structures.sql index 296c810a1..194b62662 100644 --- a/pipeline/dbt/models/marts/opendata/opendata_structures.sql +++ b/pipeline/dbt/models/marts/opendata/opendata_structures.sql @@ -1,5 +1,5 @@ WITH structures AS ( - SELECT * FROM {{ ref('int__union_structures') }} + SELECT * FROM {{ ref('int__union_structures__enhanced') }} ), final AS ( @@ -8,7 +8,7 @@ final AS ( {{ dbt_utils.star( relation_alias='structures', - from=ref('int__union_structures'), + from=ref('int__union_structures__enhanced'), except=['courriel', 'telephone']) }}, CASE diff --git a/pipeline/dbt/models/staging/cd72/_cd72__models.yml b/pipeline/dbt/models/staging/cd72/_cd72__models.yml index f14fcb3d8..001070c7d 100644 --- a/pipeline/dbt/models/staging/cd72/_cd72__models.yml +++ b/pipeline/dbt/models/staging/cd72/_cd72__models.yml @@ -1,6 +1,28 @@ version: 2 models: - - name: stg_cd72__rows + - name: stg_cd72__structures config: - tags: cd72 \ No newline at end of file + tags: cd72 + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + + - name: stg_cd72__services + config: + tags: cd72 + columns: + - name: id + tests: + - unique + - not_null + - dbt_utils.not_empty_string + - name: structure_id + tests: + - not_null + - relationships: + to: ref('stg_cd72__structures') + field: id diff --git a/pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql b/pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql deleted file mode 100644 index c3a6dbf6a..000000000 --- a/pipeline/dbt/models/staging/cd72/stg_cd72__rows.sql +++ /dev/null @@ -1,26 +0,0 @@ -WITH source AS ( - SELECT * FROM {{ source('cd72', 'rows') }} -), - -final AS ( - SELECT - _di_source_id AS "_di_source_id", - data ->> 'ID Structure' AS "id", - data ->> 'ID Structure' AS "id_structure", - data ->> 'SIRET' AS "siret", - data ->> 'Nom Structure' AS "nom_structure", - data ->> 'Ville' 
AS "ville", - data ->> 'Code postal' AS "code_postal", - data ->> 'Adresse' AS "adresse", - data ->> 'Typologie Structure' AS "typologie_structure", - data ->> 'Téléphone accueil' AS "telephone_accueil", - data ->> 'Téléphone principal' AS "telephone_principal", - data ->> 'E-mail accueil' AS "email_accueil", - data ->> 'Site Internet' AS "site_internet", - data ->> 'Description' AS "description", - data ->> 'Mis à jour le :' AS "mise_a_jour_le", - data ->> 'Horaires' AS "horaires" - FROM source -) - -SELECT * FROM final diff --git a/pipeline/dbt/models/staging/cd72/stg_cd72__services.sql b/pipeline/dbt/models/staging/cd72/stg_cd72__services.sql new file mode 100644 index 000000000..4b68cf302 --- /dev/null +++ b/pipeline/dbt/models/staging/cd72/stg_cd72__services.sql @@ -0,0 +1,44 @@ +WITH source AS ( + SELECT * FROM {{ source('cd72', 'services') }} +), + +structures AS ( + SELECT * FROM {{ ref('stg_cd72__structures') }} +), + +final AS ( + SELECT + _di_source_id AS "_di_source_id", + data ->> 'id' AS "id", + data ->> 'nom' AS "nom", + data ->> 'lieu' AS "lieu", + data ->> 'siret' AS "siret", + -- TODO: frais, change column type from bool to ref list on grist + data ->> 'adresse' AS "adresse", + data ->> 'commune' AS "commune", + (SELECT ARRAY_AGG(TRIM(p)) FROM UNNEST(STRING_TO_ARRAY(data ->> 'profils', ',')) AS "p") AS "profils", + data ->> 'courriel' AS "courriel", + TO_DATE(data ->> 'date_maj', 'YYYY-MM-DD') AS "date_maj", + data ->> 'telephone' AS "telephone", + data ->> 'pre_requis' AS "pre_requis", + data ->> 'recurrence' AS "recurrence", + data ->> 'code_postal' AS "code_postal", + data ->> 'contact_nom_prenom' AS "contact_nom_prenom", + data ->> 'frais_autres' AS "frais_autres", + (SELECT ARRAY_AGG(TRIM(t)) FROM UNNEST(STRING_TO_ARRAY(data ->> 'thematiques', ',')) AS "t") AS "thematiques", + data ->> 'structure_id' AS "structure_id", + TO_DATE(data ->> 'date_creation', 'YYYY-MM-DD') AS "date_creation", + TO_DATE(data ->> 'date_suspension', 'YYYY-MM-DD') AS 
"date_suspension", + data ->> 'zone_diffusion_nom' AS "zone_diffusion_nom", + data ->> 'presentation_detail' AS "presentation_detail", + data ->> 'presentation_resume' AS "presentation_resume", + data ->> 'zone_diffusion_code' AS "zone_diffusion_code", + data ->> 'zone_diffusion_type' AS "zone_diffusion_type", + data ->> 'modes_orientation_beneficiaire_autres' AS "modes_orientation_beneficiaire_autres" + FROM source + WHERE + data ->> 'structure_id' IS NOT NULL + AND data ->> 'structure_id' IN (SELECT id FROM structures) +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql b/pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql new file mode 100644 index 000000000..b786b8427 --- /dev/null +++ b/pipeline/dbt/models/staging/cd72/stg_cd72__structures.sql @@ -0,0 +1,24 @@ +WITH source AS ( + SELECT * FROM {{ source('cd72', 'structures') }} +), + +final AS ( + SELECT + _di_source_id AS "_di_source_id", + data ->> 'id' AS "id", + data ->> 'nom' AS "nom", + data ->> 'siret' AS "siret", + data ->> 'adresse' AS "adresse", + data ->> 'commune' AS "commune", + data ->> 'courriel' AS "courriel", + CAST(data ->> 'date_maj' AS DATE) AS "date_maj", + data ->> 'site_web' AS "site_web", + data ->> 'telephone' AS "telephone", + data ->> 'typologie' AS "typologie", + data ->> 'code_postal' AS "code_postal", + data ->> 'horaires_ouverture' AS "horaires_ouverture", + data ->> 'presentation_detail' AS "presentation_detail" + FROM source +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql b/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql index e3f6ef3fa..4f6fee093 100644 --- a/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql +++ b/pipeline/dbt/models/staging/data_inclusion/stg_data_inclusion__services.sql @@ -16,8 +16,8 @@ final AS ( ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", 
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", - NULLIF(TRIM(data ->> 'justificatifs'), '') AS "justificatifs", - NULLIF(TRIM(data ->> 'pre_requis'), '') AS "pre_requis", + STRING_TO_ARRAY(NULLIF(TRIM(data ->> 'justificatifs'), ''), ',') AS "justificatifs", + STRING_TO_ARRAY(NULLIF(TRIM(data ->> 'pre_requis'), ''), ',') AS "pre_requis", data ->> 'adresse' AS "adresse", data ->> 'code_insee' AS "code_insee", data ->> 'code_postal' AS "code_postal", diff --git a/pipeline/dbt/models/staging/dora/_dora__models.yml b/pipeline/dbt/models/staging/dora/_dora__models.yml index f0a5d4450..eec3408b4 100644 --- a/pipeline/dbt/models/staging/dora/_dora__models.yml +++ b/pipeline/dbt/models/staging/dora/_dora__models.yml @@ -87,7 +87,6 @@ models: - name: justificatifs tests: - dbt_utils.at_least_one - - dbt_utils.not_empty_string - name: latitude tests: - dbt_utils.at_least_one @@ -109,7 +108,6 @@ models: - name: pre_requis tests: - dbt_utils.at_least_one - - dbt_utils.not_empty_string - name: presentation_resume tests: - not_null diff --git a/pipeline/dbt/models/staging/dora/stg_dora__services.sql b/pipeline/dbt/models/staging/dora/stg_dora__services.sql index 18da24908..fa196b6dc 100644 --- a/pipeline/dbt/models/staging/dora/stg_dora__services.sql +++ b/pipeline/dbt/models/staging/dora/stg_dora__services.sql @@ -2,47 +2,62 @@ WITH source AS ( SELECT * FROM {{ source('dora', 'services') }} ), -final AS ( +structures AS ( + SELECT * FROM {{ ref('stg_dora__structures') }} +), + +services AS ( SELECT - _di_source_id AS "_di_source_id", - (data ->> 'contact_public')::BOOLEAN AS "contact_public", - (data ->> 'cumulable')::BOOLEAN AS "cumulable", - (data ->> 'date_creation')::TIMESTAMP WITH TIME ZONE AS "date_creation", - (data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj", - (data ->> 'date_suspension')::TIMESTAMP WITH TIME ZONE AS 
"date_suspension", - (data ->> 'latitude')::FLOAT AS "latitude", - (data ->> 'longitude')::FLOAT AS "longitude", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil'))::TEXT [] AS "modes_accueil", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", - ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", - NULLIF(TRIM(data ->> 'justificatifs'), '') AS "justificatifs", - NULLIF(TRIM(data ->> 'pre_requis'), '') AS "pre_requis", - data ->> 'adresse' AS "adresse", - data ->> 'code_insee' AS "code_insee", - data ->> 'code_postal' AS "code_postal", - data ->> 'commune' AS "commune", - data ->> 'complement_adresse' AS "complement_adresse", - NULLIF(TRIM(data ->> 'contact_nom'), '') AS "contact_nom", - NULLIF(TRIM(data ->> 'contact_prenom'), '') AS "contact_prenom", - NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", - data ->> 'formulaire_en_ligne' AS "formulaire_en_ligne", - data ->> 'frais_autres' AS "frais_autres", - data ->> 'frais' AS "frais", - data ->> 'id' AS "id", - data ->> 'lien_source' AS "lien_source", - data ->> 'nom' AS "nom", - data ->> 'presentation_resume' AS "presentation_resume", - data ->> 'presentation_detail' AS "presentation_detail", - data ->> 'prise_rdv' AS "prise_rdv", - data ->> 'recurrence' AS "recurrence", - data ->> 'source' AS "source", - data ->> 'structure_id' AS "structure_id", - NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", - NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code", - NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom", - data ->> 'zone_diffusion_type' AS "zone_diffusion_type" + _di_source_id AS "_di_source_id", + (data ->> 'contact_public')::BOOLEAN AS "contact_public", + (data ->> 'cumulable')::BOOLEAN AS "cumulable", + (data ->> 'date_creation')::TIMESTAMP WITH TIME ZONE AS 
"date_creation", + (data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj", + (data ->> 'date_suspension')::TIMESTAMP WITH TIME ZONE AS "date_suspension", + (data ->> 'latitude')::FLOAT AS "latitude", + (data ->> 'longitude')::FLOAT AS "longitude", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil'))::TEXT [] AS "modes_accueil", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs'))::TEXT [] AS "justificatifs", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis'))::TEXT [] AS "pre_requis", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_accompagnateur'))::TEXT [] AS "modes_orientation_accompagnateur", + ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_beneficiaire'))::TEXT [] AS "modes_orientation_beneficiaire", + data ->> 'modes_orientation_accompagnateur_autres' AS "modes_orientation_accompagnateur_autres", + data ->> 'modes_orientation_beneficiaire_autres' AS "modes_orientation_beneficiaire_autres", + data ->> 'adresse' AS "adresse", + data ->> 'code_insee' AS "code_insee", + data ->> 'code_postal' AS "code_postal", + data ->> 'commune' AS "commune", + data ->> 'complement_adresse' AS "complement_adresse", + NULLIF(TRIM(data ->> 'contact_nom'), '') AS "contact_nom", + NULLIF(TRIM(data ->> 'contact_prenom'), '') AS "contact_prenom", + NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel", + data ->> 'formulaire_en_ligne' AS "formulaire_en_ligne", + data ->> 'frais_autres' AS "frais_autres", + data ->> 'frais' AS "frais", + data ->> 'id' AS "id", + data ->> 'lien_source' AS "lien_source", + data ->> 'nom' AS "nom", + data ->> 'presentation_resume' AS "presentation_resume", + data 
->> 'presentation_detail' AS "presentation_detail", + data ->> 'prise_rdv' AS "prise_rdv", + data ->> 'recurrence' AS "recurrence", + data ->> 'source' AS "source", + data ->> 'structure_id' AS "structure_id", + NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone", + NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code", + NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom", + data ->> 'zone_diffusion_type' AS "zone_diffusion_type" FROM source +), + +-- dora removes suggested structures from its api, but does not remove the associated services +-- therefore filter these orphan services +final AS ( + SELECT services.* + FROM services INNER JOIN structures ON services.structure_id = structures.id ) SELECT * FROM final diff --git a/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql b/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql index 49c08c403..3dde09610 100644 --- a/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql +++ b/pipeline/dbt/models/staging/odspep/stg_odspep__res_partenariales.sql @@ -3,7 +3,7 @@ WITH source AS ( FROM {{ source('odspep', 'DD009_RES_PARTENARIALE') }} ), -final AS ( +ressources_partenariales AS ( SELECT "ID_RES" AS "id", "ID_RES" AS "id_res", @@ -30,6 +30,12 @@ final AS ( TO_DATE("DATE_FIN_VALID_RSP", 'YYYY-MM-DD') AS "date_fin_valid", TO_DATE("DATE_DERNIERE_MODIF_RSP", 'YYYY-MM-DD') AS "date_derniere_modif" FROM source +), + +final AS ( + SELECT * + FROM ressources_partenariales + WHERE date_derniere_modif IS NOT NULL AND EXTRACT(YEAR FROM date_derniere_modif) >= 2021 ) SELECT * FROM final diff --git a/pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml b/pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml new file mode 100644 index 000000000..3aea7ab94 --- /dev/null +++ b/pipeline/dbt/models/staging/reseau_alpha/_reseau_alpha__models.yml @@ -0,0 +1,24 @@ +version: 2 + +models: + - name: 
stg_reseau_alpha__formations + config: + tags: reseau_alpha + columns: + - name: structure_id + tests: + - not_null + - dbt_utils.not_empty_string + - name: id + tests: + - not_null + - dbt_utils.not_empty_string + + - name: stg_reseau_alpha__structures + config: + tags: reseau_alpha + columns: + - name: id + tests: + - not_null + - dbt_utils.not_empty_string diff --git a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql new file mode 100644 index 000000000..de753cc82 --- /dev/null +++ b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__formations.sql @@ -0,0 +1,83 @@ +WITH source AS ( + SELECT * FROM {{ source('reseau_alpha', 'formations') }} +), + +adresses AS ( + SELECT + -- extracted from cartographie.json + source.data ->> 'id' AS "formation_id", + adresses.data ->> 'ville' AS "adresses__ville", + CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", + CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", + adresses.data ->> 'codePostal' AS "adresses__code_postal", + TRIM(SUBSTRING(source.data ->> 'content__lieux_et_horaires_formation__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__lieux_et_horaires_formation__adresse", + TRIM(source.data ->> 'content__lieux_et_horaires_formation__horaires') AS "content__lieux_et_horaires_formation__horaires" + FROM + source, + LATERAL(SELECT * FROM JSONB_PATH_QUERY(source.data, '$.adresses[*]')) AS adresses (data) + WHERE + -- a minority of formations have more than one addresses, which is not managed by + -- the data·inclusion schema. Skip these addresses. 
+ JSONB_ARRAY_LENGTH(source.data -> 'adresses') = 1 +), + +final AS ( + SELECT + source._di_source_id AS "_di_source_id", + adresses.adresses__ville AS "adresses__ville", + adresses.adresses__latitude AS "adresses__latitude", + adresses.adresses__longitude AS "adresses__longitude", + adresses.adresses__code_postal AS "adresses__code_postal", + adresses.content__lieux_et_horaires_formation__adresse AS "content__lieux_et_horaires_formation__adresse", + adresses.content__lieux_et_horaires_formation__horaires AS "content__lieux_et_horaires_formation__horaires", + source.data ->> 'id' AS "id", + source.data ->> 'structure_id' AS "structure_id", + source.data ->> 'nom' AS "nom", + source.data ->> 'url' AS "url", + source.data ->> 'activite' AS "activite", + TO_DATE( + SUBSTRING( + ( + CASE + -- TODO: remove this after making fr_FR locale available + WHEN (source.data ->> 'content__date_maj') ~ 'janvier' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'janvier', '01') + WHEN (source.data ->> 'content__date_maj') ~ 'février' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'février', '02') + WHEN (source.data ->> 'content__date_maj') ~ 'mars' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mars', '03') + WHEN (source.data ->> 'content__date_maj') ~ 'avril' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'avril', '04') + WHEN (source.data ->> 'content__date_maj') ~ 'mai' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mai', '05') + WHEN (source.data ->> 'content__date_maj') ~ 'juin' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juin', '06') + WHEN (source.data ->> 'content__date_maj') ~ 'juillet' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juillet', '07') + WHEN (source.data ->> 'content__date_maj') ~ 'août' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'août', '08') + WHEN (source.data ->> 'content__date_maj') ~ 'septembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 
'septembre', '09') + WHEN (source.data ->> 'content__date_maj') ~ 'octobre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'octobre', '10') + WHEN (source.data ->> 'content__date_maj') ~ 'novembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'novembre', '11') + WHEN (source.data ->> 'content__date_maj') ~ 'décembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'décembre', '12') + END + ) FROM 'Date de la dernière modification : (.*)' + ), + 'DD MM YYYY' + ) AS "content__date_maj", + TRIM(source.data ->> 'content__contenu_et_objectifs__titre') AS "content__contenu_et_objectifs__titre", + TRIM(source.data ->> 'content__contenu_et_objectifs__objectifs') AS "content__contenu_et_objectifs__objectifs", + TRIM(source.data ->> 'content__contenu_et_objectifs__niveau') AS "content__contenu_et_objectifs__niveau", + TRIM(source.data ->> 'content__public_attendu__niveau') AS "content__public_attendu__niveau", + TRIM(source.data ->> 'content__public_attendu__competences') AS "content__public_attendu__competences", + TRIM(source.data ->> 'content__public_attendu__type_de_public') AS "content__public_attendu__type_de_public", + TRIM(source.data ->> 'content__inscription__informations_en_ligne') AS "content__inscription__informations_en_ligne", + TRIM(source.data ->> 'content__inscription__places') AS "content__inscription__places", + TRIM(source.data ->> 'content__inscription__entree_sortie') AS "content__inscription__entree_sortie", + TRIM(source.data ->> 'content__contact_inscription__adresse') AS "content__contact_inscription__adresse", + TRIM(source.data ->> 'content__contact_inscription__contact') AS "content__contact_inscription__contact", + TRIM(source.data ->> 'content__contact_inscription__telephone') AS "content__contact_inscription__telephone", + TRIM(source.data ->> 'content__contact_inscription__courriel') AS "content__contact_inscription__courriel", + TRIM(source.data ->> 'content__informations_pratiques__etendue') AS 
"content__informations_pratiques__etendue", + TRIM(source.data ->> 'content__informations_pratiques__volume') AS "content__informations_pratiques__volume", + TRIM(source.data ->> 'content__informations_pratiques__cout') AS "content__informations_pratiques__cout", + TRIM(source.data ->> 'content__informations_pratiques__prise_en_charge') AS "content__informations_pratiques__prise_en_charge", + TRIM(source.data ->> 'content__informations_pratiques__remuneration') AS "content__informations_pratiques__remuneration", + TRIM(source.data ->> 'content__informations_pratiques__garde') AS "content__informations_pratiques__garde" + FROM source + LEFT JOIN adresses ON source.data ->> 'id' = adresses.formation_id +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql new file mode 100644 index 000000000..f325033b8 --- /dev/null +++ b/pipeline/dbt/models/staging/reseau_alpha/stg_reseau_alpha__structures.sql @@ -0,0 +1,67 @@ +WITH source AS ( + SELECT * FROM {{ source('reseau_alpha', 'structures') }} +), + +adresses AS ( + SELECT + -- extracted from cartographie.json + source.data ->> 'id' AS "structure_id", + adresses.data ->> 'ville' AS "adresses__ville", + CAST(adresses.data ->> 'latitude' AS FLOAT) AS "adresses__latitude", + CAST(adresses.data ->> 'longitude' AS FLOAT) AS "adresses__longitude", + adresses.data ->> 'codePostal' AS "adresses__code_postal", + TRIM(SUBSTRING(source.data ->> 'content__adresse' FROM '^(.+)\s\d{5} - .+$')) AS "content__adresse" + FROM + source, + LATERAL(SELECT * FROM JSONB_PATH_QUERY(source.data, '$.adresses[*]')) AS adresses (data) + WHERE + -- a minority of structures have more than one addresses, which is not managed by + -- the data·inclusion schema. Skip these addresses. 
+ JSONB_ARRAY_LENGTH(source.data -> 'adresses') = 1 +), + +final AS ( + SELECT + source._di_source_id AS "_di_source_id", + adresses.adresses__ville AS "adresses__ville", + adresses.adresses__latitude AS "adresses__latitude", + adresses.adresses__longitude AS "adresses__longitude", + adresses.adresses__code_postal AS "adresses__code_postal", + adresses.content__adresse AS "content__adresse", + CAST(ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(source.data -> 'activitesFormation')) AS TEXT []) AS "activites_formation", + source.data ->> 'id' AS "id", + source.data ->> 'nom' AS "nom", + source.data ->> 'url' AS "url", + source.data ->> 'logo' AS "logo", + source.data ->> 'type' AS "type", + source.data ->> 'description' AS "description", + TO_DATE( + SUBSTRING( + ( + CASE + -- TODO: remove this after making fr_FR locale available + WHEN (source.data ->> 'content__date_maj') ~ 'janvier' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'janvier', '01') + WHEN (source.data ->> 'content__date_maj') ~ 'février' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'février', '02') + WHEN (source.data ->> 'content__date_maj') ~ 'mars' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mars', '03') + WHEN (source.data ->> 'content__date_maj') ~ 'avril' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'avril', '04') + WHEN (source.data ->> 'content__date_maj') ~ 'mai' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'mai', '05') + WHEN (source.data ->> 'content__date_maj') ~ 'juin' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juin', '06') + WHEN (source.data ->> 'content__date_maj') ~ 'juillet' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'juillet', '07') + WHEN (source.data ->> 'content__date_maj') ~ 'août' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'août', '08') + WHEN (source.data ->> 'content__date_maj') ~ 'septembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'septembre', '09') + WHEN 
(source.data ->> 'content__date_maj') ~ 'octobre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'octobre', '10') + WHEN (source.data ->> 'content__date_maj') ~ 'novembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'novembre', '11') + WHEN (source.data ->> 'content__date_maj') ~ 'décembre' THEN REGEXP_REPLACE(source.data ->> 'content__date_maj', 'décembre', '12') + END + ) FROM 'Date de la dernière modification : (.*)' + ), + 'DD MM YYYY' + ) AS "content__date_maj", + TRIM(source.data ->> 'content__telephone') AS "content__telephone", + TRIM(source.data ->> 'content__courriel') AS "content__courriel", + TRIM(source.data ->> 'content__site_web') AS "content__site_web" + FROM source + LEFT JOIN adresses ON source.data ->> 'id' = adresses.structure_id +) + +SELECT * FROM final diff --git a/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql b/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql index 86e6e5c18..366343d86 100644 --- a/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql +++ b/pipeline/dbt/models/staging/siao/stg_siao__etablissements.sql @@ -7,7 +7,7 @@ final AS ( _di_source_id AS "_di_source_id", -- there is no proper index in the data, this is very problematic. -- for analytical use, annotate with the row number if the default ordering. 
- ROW_NUMBER() OVER () AS "id", + CAST(ROW_NUMBER() OVER () AS TEXT) AS "id", NULLIF(NULLIF(REGEXP_REPLACE(data ->> 'Code SIRET', '\D', '', 'g'), REPEAT('0', 14)), '') AS "code_siret", data ->> 'Nom de la structure' AS "nom_de_la_structure", data ->> 'Ville' AS "ville", diff --git a/pipeline/dbt/seeds/schema/labels_nationaux.csv b/pipeline/dbt/seeds/schema/labels_nationaux.csv index 5ef4eb7f0..e432d1c86 100644 --- a/pipeline/dbt/seeds/schema/labels_nationaux.csv +++ b/pipeline/dbt/seeds/schema/labels_nationaux.csv @@ -34,6 +34,7 @@ emmaus,Emmaus, envie,Envie, epide,EPIDE, espace-emploi-agric-arrco,Espace Emploi Agirc Arrco, +etcld,Expérimentation territoriale contre le chômage de longue durée, fabrique-de-territoire,Fabrique de Territoire, face,Fondation FACE, fede-pro-fem,Federation Professionnelle Pour les Femmes, diff --git a/pipeline/dbt/seeds/schema/profils.csv b/pipeline/dbt/seeds/schema/profils.csv index 19c25b405..11de061b9 100644 --- a/pipeline/dbt/seeds/schema/profils.csv +++ b/pipeline/dbt/seeds/schema/profils.csv @@ -1,8 +1,8 @@ value,label,description adultes,Adultes, -beneficiaire-rsa,Bénéficiaire du Revenu de Solidarité Active (RSA), +beneficiaires-rsa,Bénéficiaires du Revenu de Solidarité Active (RSA), deficience-visuelle,Déficience visuelle, -demandeur-demploi,Demandeur ou demandeuse d’emploi, +demandeurs-demploi,Demandeurs ou demandeuses d’emploi, familles-enfants,Familles/enfants, femmes,Femmes,Le lieu propose des accompagnements réservés aux femmes. 
handicaps-mentaux,Handicaps mentaux : déficiences limitant les activités d’une personne, diff --git a/pipeline/dbt/seeds/schema/thematiques.csv b/pipeline/dbt/seeds/schema/thematiques.csv index 674f64cb0..015227a50 100644 --- a/pipeline/dbt/seeds/schema/thematiques.csv +++ b/pipeline/dbt/seeds/schema/thematiques.csv @@ -3,7 +3,7 @@ acces-aux-droits-et-citoyennete,Accès aux droits & citoyenneté, acces-aux-droits-et-citoyennete--accompagnement-dans-les-demarches-administratives,Accompagnement dans les démarches administratives, acces-aux-droits-et-citoyennete--accompagnement-juridique,Accompagnement juridique, acces-aux-droits-et-citoyennete--aide-aux-victimes,Aide aux victimes, -acces-aux-droits-et-citoyennete--connaitre-ses-droits,Connaitre ses droits, +acces-aux-droits-et-citoyennete--connaitre-ses-droits,Connaître ses droits, acces-aux-droits-et-citoyennete--demandeurs-dasile-et-naturalisation,Demandeurs d’asile et naturalisation, acces-aux-droits-et-citoyennete--developpement-durable,Développement durable, acces-aux-droits-et-citoyennete--faciliter-laction-citoyenne,Faciliter l’action citoyenne, @@ -26,11 +26,11 @@ creation-activite--developper-son-entreprise,Développer son entreprise, creation-activite--financer-son-projet,Financer son projet, creation-activite--reseautage-pour-createurs-dentreprise,Réseautage pour créateurs d’entreprise, creation-activite--structurer-son-projet-de-creation-dentreprise,Structurer son projet de création d’entreprise, -equipement-et-alimentation,Equipement et alimentation, +equipement-et-alimentation,Équipement et alimentation, equipement-et-alimentation--acces-a-du-materiel-informatique,Accès à du matériel informatique, equipement-et-alimentation--acces-a-un-telephone-et-un-abonnement,Accès à un téléphone et un abonnement, equipement-et-alimentation--alimentation,Alimentation, -equipement-et-alimentation--electromenager,Electroménager, +equipement-et-alimentation--electromenager,Électroménager, 
equipement-et-alimentation--habillement,Habillement, famille,Famille, famille--accompagnement-femme-enceinte-bebe-jeune-enfant,"Accompagnement femme enceinte, bébé, jeune enfant", @@ -53,7 +53,7 @@ handicap--accompagnement-par-une-structure-specialisee,Accompagnement par une st handicap--adaptation-au-poste-de-travail,Adaptation au poste de travail, handicap--adapter-son-logement,Adapter son logement, handicap--connaissance-des-droits-des-travailleurs,Connaissance des droits des travailleurs, -handicap--faire-reconnaitre-un-handicap,Faire reconnaitre un handicap, +handicap--faire-reconnaitre-un-handicap,Faire reconnaître un handicap, handicap--favoriser-le-retour-et-le-maintien-dans-lemploi,Favoriser le retour et le maintien dans l’emploi, handicap--gerer-le-depart-a-la-retraite-des-personnes-en-situation-de-handicap,Gérer le départ à la retraite des personnes en situation de handicap, handicap--mobilite-des-personnes-en-situation-de-handicap,Mobilité des personnes en situation de handicap, @@ -72,7 +72,7 @@ logement-hebergement,Logement et hébergement, logement-hebergement--besoin-dadapter-mon-logement,Besoin d’adapter mon logement, logement-hebergement--connaissance-de-ses-droits-et-interlocuteurs,Connaissance de ses droits et interlocuteurs, logement-hebergement--demenagement,Déménagement, -logement-hebergement--etre-accompagne-pour-se-loger,Etre accompagné(e) pour se loger, +logement-hebergement--etre-accompagne-pour-se-loger,Être accompagné(e) pour se loger, logement-hebergement--gerer-son-budget,Gérer son budget, logement-hebergement--mal-loges-sans-logis,Mal logé/sans logis, logement-hebergement--probleme-avec-son-logement,Problème avec son logement, @@ -84,10 +84,10 @@ mobilite--aides-a-la-reprise-demploi-ou-a-la-formation,Aides à la reprise d’e mobilite--apprendre-a-utiliser-un-deux-roues,Apprendre à utiliser un deux roues, mobilite--comprendre-et-utiliser-les-transports-en-commun,Comprendre et utiliser les transports en commun, 
mobilite--entretenir-reparer-son-vehicule,Entretenir ou réparer son véhicule, -mobilite--etre-accompagne-dans-son-parcours-mobilite,Etre accompagné(e) dans son parcours mobilité, +mobilite--etre-accompagne-dans-son-parcours-mobilite,Être accompagné(e) dans son parcours mobilité, mobilite--financer-mon-projet-mobilite,Financer mon projet mobilité, mobilite--louer-un-vehicule,"Louer un véhicule (voiture, vélo, scooter..)", -mobilite--preparer-son-permis-de-conduire-se-reentrainer-a-la-conduite,"Préparer son permis de conduire, se réentrainer à la conduite", +mobilite--preparer-son-permis-de-conduire-se-reentrainer-a-la-conduite,"Préparer son permis de conduire, se réentraîner à la conduite", numerique,Numérique, numerique--acceder-a-du-materiel,Accéder à du matériel, numerique--acceder-a-une-connexion-internet,Accéder à une connexion internet, diff --git a/pipeline/dbt/seeds/schema/types_cog.csv b/pipeline/dbt/seeds/schema/zones_de_diffusion_types.csv similarity index 100% rename from pipeline/dbt/seeds/schema/types_cog.csv rename to pipeline/dbt/seeds/schema/zones_de_diffusion_types.csv diff --git a/pipeline/requirements/airflow/base.in b/pipeline/requirements/airflow/base.in index 365d31148..302eb5ca6 100644 --- a/pipeline/requirements/airflow/base.in +++ b/pipeline/requirements/airflow/base.in @@ -1,2 +1,2 @@ -apache-airflow[amazon,postgres]==2.6.1 -psycopg2 \ No newline at end of file +apache-airflow[amazon,postgres]~=2.7.0 +psycopg2~=2.9.7 \ No newline at end of file diff --git a/pipeline/requirements/airflow/constraints.txt b/pipeline/requirements/airflow/constraints.txt index c5e06c2fe..a2dab6615 100644 --- a/pipeline/requirements/airflow/constraints.txt +++ b/pipeline/requirements/airflow/constraints.txt @@ -1,6 +1,6 @@ # -# This constraints file was automatically generated on 2023-05-15T11:02:06Z -# via "eager-upgrade" mechanism of PIP. For the "v2-6-test" branch of Airflow. 
+# This constraints file was automatically generated on 2023-08-18T14:48:29Z +# via "eager-upgrade" mechanism of PIP. For the "v2-7-test" branch of Airflow. # This variant of constraints install uses the HEAD of the branch version for 'apache-airflow' but installs # the providers from PIP-released packages at the moment of the constraint generation. # @@ -8,159 +8,192 @@ # We also use those constraints after "apache-airflow" is released and the constraints are tagged with # "constraints-X.Y.Z" tag to build the production image for that version. # -Authlib==1.2.0 +# This constraints file is meant to be used only in the "apache-airflow" installation command and not +# in all subsequent pip commands. By using a constraints.txt file, we ensure that solely the Airflow +# installation step is reproducible. Subsequent pip commands may install packages that would have +# been incompatible with the constraints used in Airflow reproducible installation step. Finally, pip +# commands that might change the installed version of apache-airflow should include "apache-airflow==X.Y.Z" +# in the list of install targets to prevent Airflow accidental upgrade or downgrade. +# +# Typical installation process of airflow for Python 3.8 is (with random selection of extras and custom +# dependencies added), usually consists of two steps: +# +# 1. Reproducible installation of airflow with selected providers (note constraints are used): +# +# pip install "apache-airflow[celery,cncf.kubernetes,google,amazon,snowflake]==X.Y.Z" \ +# --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-X.Y.Z/constraints-3.8.txt" +# +# 2. Installing own dependencies that are potentially not matching the constraints (note constraints are not +# used, and apache-airflow==X.Y.Z is used to make sure there is no accidental airflow upgrade/downgrade. 
+# +# pip install "apache-airflow==X.Y.Z" "snowflake-connector-python[pandas]==2.9.0" +# +Authlib==1.2.1 Babel==2.12.1 ConfigUpdater==3.1.1 -Deprecated==1.2.13 -Flask-AppBuilder==4.3.0 +Deprecated==1.2.14 +Flask-AppBuilder==4.3.3 Flask-Babel==2.0.0 Flask-Bcrypt==1.0.1 Flask-Caching==2.0.2 -Flask-JWT-Extended==4.4.4 +Flask-JWT-Extended==4.5.2 Flask-Limiter==3.3.1 Flask-Login==0.6.2 Flask-SQLAlchemy==2.5.1 Flask-Session==0.5.0 Flask-WTF==1.1.1 Flask==2.2.5 -GitPython==3.1.31 +GitPython==3.1.32 JPype1==1.4.1 JayDeBeApi==1.2.3 Jinja2==3.1.2 Mako==1.2.4 -Markdown==3.4.3 -MarkupSafe==2.1.2 -PyGithub==1.58.2 -PyHive==0.6.5 -PyJWT==2.7.0 +Markdown==3.4.4 +MarkupSafe==2.1.3 +PyGithub==1.59.1 +PyHive==0.7.0 +PyJWT==2.8.0 PyNaCl==1.5.0 -PyYAML==6.0 -Pygments==2.15.1 +PyYAML==6.0.1 +Pygments==2.16.1 SQLAlchemy-JSONField==1.0.1.post0 SQLAlchemy-Utils==0.41.1 -SQLAlchemy==1.4.48 +SQLAlchemy==1.4.49 SecretStorage==3.3.3 +Shapely==1.8.5.post1 Sphinx==5.3.0 -Unidecode==1.3.6 WTForms==3.0.1 Werkzeug==2.2.3 adal==1.2.7 -aiobotocore==2.5.0 -aiofiles==23.1.0 -aiohttp==3.8.4 +aiobotocore==2.5.4 +aiofiles==23.2.1 +aiohttp==3.8.5 aioitertools==0.11.0 aioresponses==0.7.4 aiosignal==1.3.1 alabaster==0.7.13 -alembic==1.10.4 +alembic==1.11.3 +alibabacloud-adb20211201==1.0.0 +alibabacloud-credentials==0.3.2 +alibabacloud-endpoint-util==0.0.3 +alibabacloud-gateway-spi==0.0.1 +alibabacloud-openapi-util==0.2.1 +alibabacloud-tea-openapi==0.3.7 +alibabacloud-tea-util==0.3.11 +alibabacloud-tea-xml==0.0.2 +alibabacloud-tea==0.3.3 aliyun-python-sdk-core==2.13.36 -aliyun-python-sdk-kms==2.16.0 +aliyun-python-sdk-kms==2.16.1 amqp==5.1.1 analytics-python==1.4.post1 ansiwrap==0.8.4 -anyio==3.6.2 -apache-airflow-providers-airbyte==3.2.1 -apache-airflow-providers-alibaba==2.3.0 -apache-airflow-providers-amazon==8.0.0 -apache-airflow-providers-apache-beam==5.0.0 -apache-airflow-providers-apache-cassandra==3.1.1 -apache-airflow-providers-apache-drill==2.3.2 -apache-airflow-providers-apache-druid==3.3.1 
-apache-airflow-providers-apache-flink==1.0.1 -apache-airflow-providers-apache-hdfs==3.2.1 -apache-airflow-providers-apache-hive==6.0.0 -apache-airflow-providers-apache-impala==1.0.0 -apache-airflow-providers-apache-kylin==3.1.0 -apache-airflow-providers-apache-livy==3.4.0 -apache-airflow-providers-apache-pig==4.0.0 -apache-airflow-providers-apache-pinot==4.0.1 -apache-airflow-providers-apache-spark==4.0.1 -apache-airflow-providers-apache-sqoop==3.1.1 -apache-airflow-providers-arangodb==2.1.1 -apache-airflow-providers-asana==2.1.0 -apache-airflow-providers-atlassian-jira==2.0.1 -apache-airflow-providers-celery==3.1.0 -apache-airflow-providers-cloudant==3.1.0 -apache-airflow-providers-cncf-kubernetes==6.1.0 -apache-airflow-providers-common-sql==1.4.0 -apache-airflow-providers-databricks==4.1.0 -apache-airflow-providers-datadog==3.2.0 -apache-airflow-providers-dbt-cloud==3.1.1 -apache-airflow-providers-dingding==3.1.0 -apache-airflow-providers-discord==3.1.0 -apache-airflow-providers-docker==3.6.0 -apache-airflow-providers-elasticsearch==4.4.0 -apache-airflow-providers-exasol==4.1.3 -apache-airflow-providers-facebook==3.1.0 -apache-airflow-providers-ftp==3.3.1 -apache-airflow-providers-github==2.2.1 -apache-airflow-providers-google==10.0.0 -apache-airflow-providers-grpc==3.1.0 -apache-airflow-providers-hashicorp==3.3.1 -apache-airflow-providers-http==4.3.0 -apache-airflow-providers-imap==3.1.1 -apache-airflow-providers-influxdb==2.1.0 -apache-airflow-providers-jdbc==3.3.0 -apache-airflow-providers-jenkins==3.2.1 -apache-airflow-providers-microsoft-azure==6.0.0 -apache-airflow-providers-microsoft-mssql==3.3.2 -apache-airflow-providers-microsoft-psrp==2.2.0 -apache-airflow-providers-microsoft-winrm==3.1.1 -apache-airflow-providers-mongo==3.1.1 -apache-airflow-providers-mysql==5.0.0 -apache-airflow-providers-neo4j==3.2.1 -apache-airflow-providers-odbc==3.2.1 -apache-airflow-providers-openfaas==3.1.0 -apache-airflow-providers-opsgenie==5.0.0 
-apache-airflow-providers-oracle==3.6.0 -apache-airflow-providers-pagerduty==3.1.0 -apache-airflow-providers-papermill==3.1.1 -apache-airflow-providers-plexus==3.1.0 -apache-airflow-providers-postgres==5.4.0 -apache-airflow-providers-presto==5.0.0 -apache-airflow-providers-qubole==3.3.1 -apache-airflow-providers-redis==3.1.0 -apache-airflow-providers-salesforce==5.3.0 -apache-airflow-providers-samba==4.1.0 -apache-airflow-providers-segment==3.1.0 -apache-airflow-providers-sendgrid==3.1.0 -apache-airflow-providers-sftp==4.2.4 -apache-airflow-providers-singularity==3.1.0 -apache-airflow-providers-slack==7.2.0 -apache-airflow-providers-smtp==1.0.1 -apache-airflow-providers-snowflake==4.0.5 -apache-airflow-providers-sqlite==3.3.2 -apache-airflow-providers-ssh==3.6.0 -apache-airflow-providers-tableau==4.1.0 -apache-airflow-providers-tabular==1.1.0 -apache-airflow-providers-telegram==4.0.0 -apache-airflow-providers-trino==5.0.0 -apache-airflow-providers-vertica==3.3.1 -apache-airflow-providers-yandex==3.3.0 -apache-airflow-providers-zendesk==4.2.0 -apache-beam==2.46.0 -apispec==5.2.2 +anyascii==0.3.2 +anyio==3.7.1 +apache-airflow-providers-airbyte==3.3.1 +apache-airflow-providers-alibaba==2.5.1 +apache-airflow-providers-amazon==8.5.1 +apache-airflow-providers-apache-beam==5.2.1 +apache-airflow-providers-apache-cassandra==3.2.1 +apache-airflow-providers-apache-drill==2.4.3 +apache-airflow-providers-apache-druid==3.5.0 +apache-airflow-providers-apache-flink==1.1.1 +apache-airflow-providers-apache-hdfs==4.1.0 +apache-airflow-providers-apache-hive==6.1.4 +apache-airflow-providers-apache-impala==1.1.2 +apache-airflow-providers-apache-kafka==1.1.2 +apache-airflow-providers-apache-kylin==3.2.1 +apache-airflow-providers-apache-livy==3.5.2 +apache-airflow-providers-apache-pig==4.1.1 +apache-airflow-providers-apache-pinot==4.1.2 +apache-airflow-providers-apache-spark==4.1.3 +apache-airflow-providers-apache-sqoop==4.0.0 +apache-airflow-providers-apprise==1.0.1 
+apache-airflow-providers-arangodb==2.2.1 +apache-airflow-providers-asana==2.2.2 +apache-airflow-providers-atlassian-jira==2.1.1 +apache-airflow-providers-celery==3.3.2 +apache-airflow-providers-cloudant==3.2.1 +apache-airflow-providers-cncf-kubernetes==7.4.2 +apache-airflow-providers-common-sql==1.7.0 +apache-airflow-providers-daskexecutor==1.0.0 +apache-airflow-providers-databricks==4.3.3 +apache-airflow-providers-datadog==3.3.1 +apache-airflow-providers-dbt-cloud==3.2.2 +apache-airflow-providers-dingding==3.2.1 +apache-airflow-providers-discord==3.3.0 +apache-airflow-providers-docker==3.7.3 +apache-airflow-providers-elasticsearch==5.0.0 +apache-airflow-providers-exasol==4.2.3 +apache-airflow-providers-facebook==3.2.1 +apache-airflow-providers-ftp==3.5.0 +apache-airflow-providers-github==2.3.1 +apache-airflow-providers-google==10.6.0 +apache-airflow-providers-grpc==3.2.1 +apache-airflow-providers-hashicorp==3.4.2 +apache-airflow-providers-http==4.5.0 +apache-airflow-providers-imap==3.3.0 +apache-airflow-providers-influxdb==2.2.1 +apache-airflow-providers-jdbc==4.0.1 +apache-airflow-providers-jenkins==3.3.1 +apache-airflow-providers-microsoft-azure==6.2.4 +apache-airflow-providers-microsoft-mssql==3.4.2 +apache-airflow-providers-microsoft-psrp==2.3.1 +apache-airflow-providers-microsoft-winrm==3.2.1 +apache-airflow-providers-mongo==3.2.1 +apache-airflow-providers-mysql==5.2.1 +apache-airflow-providers-neo4j==3.3.2 +apache-airflow-providers-odbc==4.0.0 +apache-airflow-providers-openfaas==3.2.1 +apache-airflow-providers-openlineage==1.0.1 +apache-airflow-providers-opsgenie==5.1.1 +apache-airflow-providers-oracle==3.7.2 +apache-airflow-providers-pagerduty==3.3.0 +apache-airflow-providers-papermill==3.2.1 +apache-airflow-providers-plexus==3.2.1 +apache-airflow-providers-postgres==5.6.0 +apache-airflow-providers-presto==5.1.2 +apache-airflow-providers-qubole==3.4.2 +apache-airflow-providers-redis==3.3.1 +apache-airflow-providers-salesforce==5.4.1 
+apache-airflow-providers-samba==4.2.1 +apache-airflow-providers-segment==3.2.1 +apache-airflow-providers-sendgrid==3.2.1 +apache-airflow-providers-sftp==4.5.0 +apache-airflow-providers-singularity==3.2.1 +apache-airflow-providers-slack==7.3.2 +apache-airflow-providers-smtp==1.3.0 +apache-airflow-providers-snowflake==4.4.2 +apache-airflow-providers-sqlite==3.4.3 +apache-airflow-providers-ssh==3.7.1 +apache-airflow-providers-tableau==4.2.1 +apache-airflow-providers-tabular==1.2.1 +apache-airflow-providers-telegram==4.1.1 +apache-airflow-providers-trino==5.2.1 +apache-airflow-providers-vertica==3.5.1 +apache-airflow-providers-zendesk==4.3.1 +apache-beam==2.49.0 +apispec==6.3.0 appdirs==1.4.4 -argcomplete==3.0.8 +apprise==1.4.5 +argcomplete==3.1.1 arrow==1.2.3 asana==3.2.1 -asgiref==3.6.0 +asgiref==3.7.2 asn1crypto==1.5.1 -astroid==2.15.5 +astroid==2.15.6 asttokens==2.2.1 -async-timeout==4.0.2 -asynctest==0.13.0 +async-timeout==4.0.3 atlasclient==1.0.0 -atlassian-python-api==3.36.0 +atlassian-python-api==3.41.0 attrs==23.1.0 -aws-sam-translator==1.66.0 +aws-sam-translator==1.73.0 aws-xray-sdk==2.12.0 -azure-batch==13.0.0 +azure-batch==14.0.0 azure-common==1.1.28 -azure-core==1.26.4 -azure-cosmos==4.3.1 +azure-core==1.29.2 +azure-cosmos==4.5.0 azure-datalake-store==0.0.53 -azure-identity==1.13.0 +azure-identity==1.14.0 azure-keyvault-secrets==4.7.0 azure-kusto-data==0.0.45 azure-mgmt-containerinstance==1.5.0 @@ -169,249 +202,256 @@ azure-mgmt-datafactory==1.1.0 azure-mgmt-datalake-nspkg==3.0.1 azure-mgmt-datalake-store==0.5.0 azure-mgmt-nspkg==3.0.2 -azure-mgmt-resource==23.0.0 +azure-mgmt-resource==23.0.1 azure-nspkg==3.0.2 -azure-servicebus==7.10.0 -azure-storage-blob==12.16.0 +azure-servicebus==7.11.1 +azure-storage-blob==12.17.0 azure-storage-common==2.1.0 -azure-storage-file-datalake==12.11.0 +azure-storage-file-datalake==12.12.0 azure-storage-file==2.1.0 azure-synapse-spark==0.7.0 backcall==0.2.0 backoff==1.10.0 bcrypt==4.0.1 beautifulsoup4==4.12.2 
-billiard==3.6.4.0 -bitarray==2.7.3 -black==23.1a1 +billiard==4.1.0 +bitarray==2.8.1 +black==23.7.0 bleach==6.0.0 blinker==1.6.2 -boto3==1.26.76 +boto3==1.28.17 boto==2.49.0 -botocore==1.29.76 +botocore==1.31.17 bowler==0.9.0 cachelib==0.9.0 -cachetools==5.3.0 -cassandra-driver==3.27.0 -cattrs==22.2.0 -celery==5.2.7 -certifi==2023.5.7 +cachetools==5.3.1 +cassandra-driver==3.28.0 +cattrs==23.1.2 +celery==5.3.1 +certifi==2023.7.22 cffi==1.15.1 -cfgv==3.3.1 -cfn-lint==0.77.5 +cfgv==3.4.0 +cfn-lint==0.77.10 cgroupspy==0.2.2 -chardet==5.1.0 -charset-normalizer==2.1.1 +chardet==5.2.0 +charset-normalizer==3.2.0 checksumdir==1.2.0 ciso8601==2.3.0 -click-default-group==1.2.2 +click-default-group==1.2.4 click-didyoumean==0.3.0 click-plugins==1.1.1 -click-repl==0.2.0 -click==8.1.3 +click-repl==0.3.0 +click==8.1.7 clickclick==20.10.2 cloudant==2.15.0 cloudpickle==2.2.1 colorama==0.4.6 colorlog==4.8.0 +confluent-kafka==2.2.0 connexion==2.14.2 -coverage==7.2.5 +coverage==7.3.0 crcmod==1.7 -cron-descriptor==1.3.0 -croniter==1.3.14 -cryptography==40.0.2 +cron-descriptor==1.4.0 +croniter==1.4.1 +cryptography==41.0.3 curlify==2.2.1 -dask==2023.4.1 -databricks-sql-connector==2.5.2 -datadog==0.45.0 +dask==2023.8.0 +databricks-sql-connector==2.9.2 +datadog==0.46.0 db-dtypes==1.1.1 decorator==5.1.1 defusedxml==0.7.1 deprecation==2.1.0 dill==0.3.1.1 -distlib==0.3.6 -distributed==2023.4.1 -dnspython==2.3.0 -docker==6.1.2 +distlib==0.3.7 +distributed==2023.8.0 +dnspython==2.4.2 +docker==6.1.3 docopt==0.6.2 -docutils==0.20 +docutils==0.20.1 ecdsa==0.18.0 -elasticsearch-dbapi==0.2.10 -elasticsearch-dsl==7.4.1 -elasticsearch==7.13.4 +elasticsearch==7.14.2 email-validator==1.3.1 entrypoints==0.4 eralchemy2==1.3.7 et-xmlfile==1.1.0 eventlet==0.33.3 -exceptiongroup==1.1.1 -execnet==1.9.0 +exceptiongroup==1.1.3 +execnet==2.0.2 executing==1.2.0 -facebook-business==16.0.2 -fastavro==1.7.4 +facebook-business==17.0.4 +fastavro==1.8.2 fasteners==0.18 -fastjsonschema==2.16.3 -filelock==3.12.0 
+fastjsonschema==2.18.0 +filelock==3.12.2 fissix==21.11.13 -flower==1.2.0 -frozenlist==1.3.3 -fsspec==2023.5.0 +flower==2.0.1 +frozenlist==1.4.0 +fsspec==2023.6.0 future==0.18.3 -gcloud-aio-auth==4.2.1 +gcloud-aio-auth==4.2.3 gcloud-aio-bigquery==6.3.0 -gcloud-aio-storage==8.2.0 -gcsfs==2023.5.0 +gcloud-aio-storage==8.3.0 +gcsfs==2023.6.0 geomet==0.2.1.post1 -gevent==22.10.2 +gevent==23.7.0 gitdb==4.0.10 -google-api-core==2.8.2 -google-api-python-client==1.12.11 +google-ads==21.3.0 +google-api-core==2.11.1 +google-api-python-client==2.97.0 google-auth-httplib2==0.1.0 -google-auth-oauthlib==0.8.0 -google-auth==2.18.0 -google-cloud-aiplatform==1.16.1 -google-cloud-appengine-logging==1.1.3 -google-cloud-audit-log==0.2.4 -google-cloud-automl==2.8.0 -google-cloud-bigquery-datatransfer==3.7.0 -google-cloud-bigquery-storage==2.14.1 -google-cloud-bigquery==2.34.4 -google-cloud-bigtable==2.11.1 -google-cloud-build==3.9.0 -google-cloud-compute==0.7.0 -google-cloud-container==2.11.1 -google-cloud-core==2.3.2 -google-cloud-datacatalog==3.9.0 -google-cloud-dataflow-client==0.5.4 -google-cloud-dataform==0.2.0 -google-cloud-dataplex==1.1.0 -google-cloud-dataproc-metastore==1.6.0 -google-cloud-dataproc==5.0.0 -google-cloud-dlp==3.8.0 -google-cloud-kms==2.12.0 -google-cloud-language==1.3.2 -google-cloud-logging==3.2.1 -google-cloud-memcache==1.4.1 -google-cloud-monitoring==2.11.0 -google-cloud-orchestration-airflow==1.4.1 -google-cloud-os-login==2.7.1 -google-cloud-pubsub==2.13.5 -google-cloud-redis==2.9.0 -google-cloud-resource-manager==1.6.0 -google-cloud-secret-manager==1.0.2 -google-cloud-spanner==1.19.3 -google-cloud-speech==1.3.4 -google-cloud-storage==2.9.0 -google-cloud-tasks==2.10.1 -google-cloud-texttospeech==1.0.3 -google-cloud-translate==1.7.2 -google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 -google-cloud-workflows==1.7.1 +google-auth-oauthlib==1.0.0 +google-auth==2.22.0 +google-cloud-aiplatform==1.30.1 +google-cloud-appengine-logging==1.3.1 
+google-cloud-audit-log==0.2.5 +google-cloud-automl==2.11.2 +google-cloud-bigquery-datatransfer==3.12.0 +google-cloud-bigquery-storage==2.22.0 +google-cloud-bigquery==3.11.4 +google-cloud-bigtable==2.21.0 +google-cloud-build==3.20.0 +google-cloud-compute==1.14.0 +google-cloud-container==2.30.0 +google-cloud-core==2.3.3 +google-cloud-datacatalog==3.15.0 +google-cloud-dataflow-client==0.8.4 +google-cloud-dataform==0.5.2 +google-cloud-dataplex==1.6.2 +google-cloud-dataproc-metastore==1.12.0 +google-cloud-dataproc==5.4.3 +google-cloud-dlp==3.12.2 +google-cloud-kms==2.19.1 +google-cloud-language==2.11.0 +google-cloud-logging==3.6.0 +google-cloud-memcache==1.7.2 +google-cloud-monitoring==2.15.1 +google-cloud-orchestration-airflow==1.9.1 +google-cloud-os-login==2.10.0 +google-cloud-pubsub==2.18.2 +google-cloud-redis==2.13.1 +google-cloud-resource-manager==1.10.3 +google-cloud-secret-manager==2.16.3 +google-cloud-spanner==3.40.1 +google-cloud-speech==2.21.0 +google-cloud-storage-transfer==1.9.1 +google-cloud-storage==2.10.0 +google-cloud-tasks==2.14.1 +google-cloud-texttospeech==2.14.1 +google-cloud-translate==3.12.0 +google-cloud-videointelligence==2.11.3 +google-cloud-vision==3.4.4 +google-cloud-workflows==1.11.0 google-crc32c==1.5.0 +google-re2==1.1 google-resumable-media==2.5.0 -googleapis-common-protos==1.56.4 +googleapis-common-protos==1.60.0 graphql-core==3.2.3 graphviz==0.20.1 greenlet==2.0.2 -grpc-google-iam-v1==0.12.4 +grpc-google-iam-v1==0.12.6 grpcio-gcp==0.2.2 -grpcio-status==1.48.2 -grpcio==1.54.2 +grpcio-status==1.57.0 +grpcio==1.57.0 gssapi==1.8.2 -gunicorn==20.1.0 +gunicorn==21.2.0 h11==0.14.0 -hdfs==2.7.0 +hdfs==2.7.2 hmsclient==0.1.1 httpcore==0.16.3 -httplib2==0.21.0 +httplib2==0.22.0 httpx==0.23.3 -humanize==4.6.0 -hvac==1.1.0 -identify==2.5.24 +humanize==4.8.0 +hvac==1.1.1 +identify==2.5.26 idna==3.4 -ijson==3.2.0.post0 +ijson==3.2.3 imagesize==1.4.1 -importlib-metadata==6.6.0 -importlib-resources==5.12.0 +importlib-metadata==6.8.0 
+importlib-resources==6.0.1 impyla==0.18.0 incremental==22.10.0 inflection==0.5.1 -influxdb-client==1.36.1 +influxdb-client==1.37.0 iniconfig==2.0.0 ipdb==0.13.13 -ipython==8.13.2 +ipython==8.14.0 isodate==0.6.1 itsdangerous==2.1.2 -jaraco.classes==3.2.3 -jedi==0.18.2 +jaraco.classes==3.3.0 +jedi==0.19.0 jeepney==0.8.0 -jira==3.5.0 +jira==3.5.2 jmespath==0.10.0 jschema-to-python==1.2.3 json-merge-patch==0.2 jsondiff==2.0.0 -jsonpatch==1.32 +jsonpatch==1.33 jsonpath-ng==1.5.3 -jsonpickle==3.0.1 -jsonpointer==2.3 -jsonschema-spec==0.1.4 -jsonschema==4.17.3 +jsonpickle==3.0.2 +jsonpointer==2.4 +jsonschema-spec==0.2.4 +jsonschema-specifications==2023.7.1 +jsonschema==4.19.0 junit-xml==1.9 -jupyter_client==8.2.0 -jupyter_core==5.3.0 -keyring==23.13.1 -kombu==5.2.4 +jupyter_client==8.3.0 +jupyter_core==5.3.1 +keyring==24.2.0 +kombu==5.3.1 krb5==0.5.0 kubernetes-asyncio==24.2.3 kubernetes==23.6.0 kylinpy==2.8.4 lazy-object-proxy==1.9.0 ldap3==2.9.1 -limits==3.4.0 +limits==3.5.0 linkify-it-py==2.0.2 locket==1.0.0 lockfile==0.12.2 -looker-sdk==23.8.1 -lxml==4.9.2 +looker-sdk==23.14.1 +lxml==4.9.3 lz4==4.3.2 -markdown-it-py==2.2.0 -marshmallow-enum==1.5.1 +markdown-it-py==3.0.0 marshmallow-oneofschema==3.0.1 marshmallow-sqlalchemy==0.26.1 -marshmallow==3.19.0 +marshmallow==3.20.1 matplotlib-inline==0.1.6 -mdit-py-plugins==0.3.5 +mdit-py-plugins==0.4.0 mdurl==0.1.2 mongomock==4.1.2 monotonic==1.6 -more-itertools==9.1.0 +more-itertools==10.1.0 moreorless==0.4.0 -moto==4.1.9 +moto==4.1.14 mpmath==1.3.0 msal-extensions==1.0.0 -msal==1.22.0 +msal==1.23.0 msgpack==1.0.5 msrest==0.7.1 msrestazure==0.6.4 multi-key-dict==2.0.3 multidict==6.0.4 -mypy-boto3-appflow==1.26.125 -mypy-boto3-rds==1.26.132 -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-appflow==1.28.16 +mypy-boto3-rds==1.28.19 +mypy-boto3-redshift-data==1.28.16 +mypy-boto3-s3==1.28.27 mypy-extensions==1.0.0 -mypy==1.0.0 -mysqlclient==2.1.1 -nbclient==0.7.4 -nbformat==5.8.0 -neo4j==5.8.0 +mypy==1.2.0 
+mysql-connector-python==8.1.0 +mysqlclient==2.2.0 +nbclient==0.8.0 +nbformat==5.9.2 +neo4j==5.11.0 networkx==3.1 nodeenv==1.8.0 -numpy==1.24.3 +numpy==1.24.4 oauthlib==3.2.2 objsize==0.6.1 -openapi-schema-validator==0.4.4 -openapi-spec-validator==0.5.6 +openapi-schema-validator==0.6.0 +openapi-spec-validator==0.6.0 +openlineage-integration-common==1.0.0 +openlineage-python==1.0.0 +openlineage_sql==1.0.0 openpyxl==3.1.2 opentelemetry-api==1.15.0 opentelemetry-exporter-otlp-proto-grpc==1.15.0 @@ -422,83 +462,83 @@ opentelemetry-proto==1.15.0 opentelemetry-sdk==1.15.0 opentelemetry-semantic-conventions==0.36b0 opsgenie-sdk==2.1.5 -oracledb==1.3.1 +oracledb==1.4.0 ordered-set==4.1.0 -orjson==3.8.12 +orjson==3.9.5 oscrypto==1.3.0 -oss2==2.17.0 -packaging==21.3 -pandas-gbq==0.17.9 -pandas==1.5.3 +oss2==2.18.1 +packaging==23.1 +pandas-gbq==0.19.2 +pandas==2.0.3 papermill==2.4.0 -paramiko==3.1.0 +paramiko==3.3.1 parso==0.8.3 partd==1.4.0 pathable==0.4.3 -pathspec==0.9.0 +pathspec==0.11.2 pbr==5.11.1 -pdpyras==5.0.1 +pdpyras==5.1.1 pendulum==2.1.2 pexpect==4.8.0 pickleshare==0.7.5 -pinotdb==0.4.14 -pipdeptree==2.7.1 +pinotdb==0.5.0 +pipdeptree==2.13.0 pipx==1.2.0 pkginfo==1.9.6 -platformdirs==3.5.1 -pluggy==1.0.0 +platformdirs==3.8.1 +pluggy==1.2.0 ply==3.11 plyvel==1.5.0 portalocker==2.7.0 -pre-commit==3.3.1 +pre-commit==3.3.3 presto-python-client==0.8.3 prison==0.2.1 -prometheus-client==0.16.0 -prompt-toolkit==3.0.38 -proto-plus==1.19.6 -protobuf==3.20.0 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.21.12 psutil==5.9.5 -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.0 +py-partiql-parser==0.3.6 py4j==0.10.9.7 -pyOpenSSL==23.1.1 -pyarrow==9.0.0 +pyOpenSSL==23.2.0 +pyarrow==11.0.0 pyasn1-modules==0.2.8 pyasn1==0.4.8 pycountry==22.3.5 pycparser==2.21 -pycryptodome==3.17 -pycryptodomex==3.17 -pydantic==1.10.7 -pydata-google-auth==1.8.0 +pycryptodome==3.18.0 
+pycryptodomex==3.18.0 +pydantic==1.10.12 +pydata-google-auth==1.8.2 pydot==1.4.2 pydruid==0.6.5 pyenchant==3.2.2 pyexasol==0.25.2 -pygraphviz==1.10 +pygraphviz==1.11 pyhcl==0.4.4 pykerberos==1.2.4 -pymongo==3.13.0 -pymssql==2.2.7 +pymongo==4.4.1 +pymssql==2.2.8 pyodbc==4.0.39 -pyparsing==3.0.9 +pyparsing==3.1.1 pypsrp==0.8.1 -pyrsistent==0.19.3 -pyspark==3.4.0 -pyspnego==0.9.0 -pytest-asyncio==0.21.0 +pyspark==3.4.1 +pyspnego==0.9.1 +pytest-asyncio==0.21.1 pytest-capture-warnings==0.0.4 -pytest-cov==4.0.0 +pytest-cov==4.1.0 pytest-httpx==0.21.3 pytest-instafail==0.5.0 -pytest-rerunfailures==11.1.2 +pytest-mock==3.11.1 +pytest-rerunfailures==12.0 pytest-timeouts==1.2.1 -pytest-xdist==3.3.0 -pytest==7.3.1 -python-arango==7.5.7 +pytest-xdist==3.3.1 +pytest==7.4.0 +python-arango==7.6.0 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 @@ -512,70 +552,72 @@ python-telegram-bot==20.2 pytz==2023.3 pytzdata==2020.1 pywinrm==0.4.3 -pyzmq==25.0.2 +pyzmq==25.1.1 qds-sdk==1.16.1 reactivex==4.0.4 -readme-renderer==37.3 -redis==3.5.3 -redshift-connector==2.0.910 -regex==2023.5.5 +readme-renderer==40.0 +redis==5.0.0 +redshift-connector==2.0.913 +referencing==0.30.2 +regex==2023.8.8 requests-file==1.5.1 requests-kerberos==0.14.0 -requests-mock==1.10.0 +requests-mock==1.11.0 requests-ntlm==1.2.0 requests-oauthlib==1.3.1 requests-toolbelt==1.0.0 -requests==2.30.0 -responses==0.23.1 +requests==2.31.0 +responses==0.23.3 rfc3339-validator==0.1.4 rfc3986==1.5.0 +rich-argparse==1.2.0 rich-click==1.6.1 -rich==13.3.5 -rich_argparse==1.1.0 +rich==13.5.2 +rpds-py==0.9.2 rsa==4.9 -ruff==0.0.267 -s3transfer==0.6.1 +ruff==0.0.285 +s3transfer==0.6.2 sarif-om==1.0.4 sasl==0.3.1 scramp==1.4.4 scrapbook==0.5.0 -semver==3.0.0 +semver==3.0.1 sendgrid==6.10.0 sentinels==1.0.0 -sentry-sdk==1.22.2 +sentry-sdk==1.29.2 setproctitle==1.3.2 -simple-salesforce==1.12.3 +simple-salesforce==1.12.4 six==1.16.0 slack-sdk==3.21.3 smbprotocol==1.10.1 smmap==5.0.0 -snakebite-py3==3.0.5 sniffio==1.3.0 
snowballstemmer==2.2.0 -snowflake-connector-python==3.0.3 +snowflake-connector-python==3.1.0 snowflake-sqlalchemy==1.4.7 sortedcontainers==2.4.0 soupsieve==2.4.1 sphinx-airflow-theme==0.0.12 sphinx-argparse==0.4.0 -sphinx-autoapi==2.1.0 +sphinx-autoapi==2.1.1 sphinx-copybutton==0.5.2 sphinx-jinja==2.0.2 -sphinx-rtd-theme==1.2.0 -sphinxcontrib-applehelp==1.0.4 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==2.0.1 +sphinx-rtd-theme==1.2.2 +sphinxcontrib-applehelp==1.0.7 +sphinxcontrib-devhelp==1.0.5 +sphinxcontrib-htmlhelp==2.0.4 sphinxcontrib-httpdomain==1.8.1 sphinxcontrib-jquery==4.1 sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-qthelp==1.0.6 sphinxcontrib-redoc==1.6.0 -sphinxcontrib-serializinghtml==1.1.5 +sphinxcontrib-serializinghtml==1.1.8 sphinxcontrib-spelling==8.0.0 spython==0.3.0 -sqlalchemy-bigquery==1.6.1 +sqlalchemy-bigquery==1.8.0 sqlalchemy-drill==1.1.2 sqlalchemy-redshift==0.8.14 +sqlalchemy-spanner==1.6.2 sqlparse==0.4.4 sshpubkeys==3.3.1 sshtunnel==0.4.0 @@ -583,69 +625,70 @@ stack-data==0.6.2 starkbank-ecdsa==2.2.0 statsd==4.0.1 sympy==1.12 -tableauserverclient==0.24 +tableauserverclient==0.25 tabulate==0.9.0 -tblib==1.7.0 -tenacity==8.2.2 +tblib==2.0.0 +tenacity==8.2.3 termcolor==2.3.0 text-unidecode==1.3 textwrap3==0.9.2 thrift-sasl==0.4.3 thrift==0.16.0 -time-machine==2.9.0 +time-machine==2.12.0 tomli==2.0.1 +tomlkit==0.12.1 toolz==0.12.0 -tornado==6.3.2 -towncrier==22.12.0 -tqdm==4.65.0 +tornado==6.3.3 +towncrier==23.6.0 +tqdm==4.66.1 traitlets==5.9.0 -trino==0.323.0 +trino==0.326.0 twine==4.0.2 -types-Deprecated==1.2.9.2 -types-Markdown==3.4.2.9 -types-PyMySQL==1.0.19.7 -types-PyYAML==6.0.12.9 -types-boto==2.49.18.8 +types-Deprecated==1.2.9.3 +types-Markdown==3.4.2.10 +types-PyMySQL==1.1.0.1 +types-PyYAML==6.0.12.11 +types-boto==2.49.18.9 types-certifi==2021.10.8.3 -types-croniter==1.3.2.9 -types-docutils==0.20.0.1 -types-paramiko==3.0.0.10 -types-protobuf==4.23.0.1 -types-pyOpenSSL==23.1.0.3 
-types-python-dateutil==2.8.19.13 -types-python-slugify==8.0.0.2 -types-pytz==2023.3.0.0 -types-redis==4.5.5.2 -types-requests==2.30.0.0 -types-setuptools==67.7.0.2 -types-tabulate==0.9.0.2 +types-croniter==1.4.0.1 +types-docutils==0.20.0.3 +types-paramiko==3.3.0.0 +types-protobuf==4.24.0.1 +types-pyOpenSSL==23.2.0.2 +types-python-dateutil==2.8.19.14 +types-python-slugify==8.0.0.3 +types-pytz==2023.3.0.1 +types-redis==4.6.0.4 +types-requests==2.31.0.2 +types-setuptools==68.1.0.0 +types-tabulate==0.9.0.3 types-termcolor==1.1.6.2 -types-toml==0.10.8.6 -types-urllib3==1.26.25.13 -typing_extensions==4.5.0 -tzlocal==5.0 +types-toml==0.10.8.7 +types-urllib3==1.26.25.14 +typing_extensions==4.7.1 +tzdata==2023.3 +tzlocal==5.0.1 uc-micro-py==1.0.2 unicodecsv==0.14.1 -uritemplate==3.0.1 -urllib3==1.26.15 -userpath==1.8.0 -vertica-python==1.3.2 +uritemplate==4.1.1 +urllib3==1.26.16 +userpath==1.9.0 +vertica-python==1.3.4 vine==5.0.0 -virtualenv==20.23.0 +virtualenv==20.24.1 volatile==2.1.0 watchtower==2.0.1 wcwidth==0.2.6 webencodings==0.5.1 -websocket-client==1.5.1 +websocket-client==1.6.1 wrapt==1.15.0 xmltodict==0.13.0 -yamllint==1.31.0 -yandexcloud==0.212.0 +yamllint==1.32.0 yarl==1.9.2 zeep==4.2.1 -zenpy==2.0.25 +zenpy==2.0.27 zict==3.0.0 -zipp==3.15.0 -zope.event==4.6 +zipp==3.16.2 +zope.event==5.0 zope.interface==6.0 zstandard==0.21.0 diff --git a/pipeline/requirements/airflow/requirements.txt b/pipeline/requirements/airflow/requirements.txt index febcce28b..bf01bdcac 100644 --- a/pipeline/requirements/airflow/requirements.txt +++ b/pipeline/requirements/airflow/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=requirements/airflow/requirements.txt --resolver=backtracking requirements/airflow/requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-http @@ -12,59 +12,65 @@ aiosignal==1.3.1 # via # -c requirements/airflow/constraints.txt # aiohttp -alembic==1.10.4 +alembic==1.11.3 # via # 
-c requirements/airflow/constraints.txt # apache-airflow -anyio==3.6.2 +anyio==3.7.1 # via # -c requirements/airflow/constraints.txt # httpcore -apache-airflow[amazon,postgres]==2.6.1 +apache-airflow[amazon,postgres]==2.7.0 # via # -r requirements/airflow/base.in # apache-airflow-providers-amazon + # apache-airflow-providers-common-sql + # apache-airflow-providers-ftp + # apache-airflow-providers-http + # apache-airflow-providers-imap # apache-airflow-providers-postgres -apache-airflow-providers-amazon==8.0.0 + # apache-airflow-providers-sqlite +apache-airflow-providers-amazon==8.5.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-common-sql==1.4.0 +apache-airflow-providers-common-sql==1.7.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # apache-airflow-providers-amazon # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-ftp==3.3.1 +apache-airflow-providers-ftp==3.5.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-http==4.3.0 +apache-airflow-providers-http==4.5.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-imap==3.1.1 + # apache-airflow-providers-amazon +apache-airflow-providers-imap==3.3.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-postgres==5.4.0 +apache-airflow-providers-postgres==5.6.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -apache-airflow-providers-sqlite==3.3.2 +apache-airflow-providers-sqlite==3.4.3 # via # -c requirements/airflow/constraints.txt # apache-airflow -apispec[yaml]==5.2.2 +apispec[yaml]==6.3.0 # via # -c requirements/airflow/constraints.txt # flask-appbuilder -argcomplete==3.0.8 +argcomplete==3.1.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -asgiref==3.6.0 +asgiref==3.7.2 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -74,7 +80,7 @@ 
asn1crypto==1.5.1 # via # -c requirements/airflow/constraints.txt # scramp -async-timeout==4.0.2 +async-timeout==4.0.3 # via # -c requirements/airflow/constraints.txt # aiohttp @@ -85,10 +91,16 @@ attrs==23.1.0 # apache-airflow # cattrs # jsonschema + # referencing babel==2.12.1 # via # -c requirements/airflow/constraints.txt # flask-babel +backoff==1.10.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http beautifulsoup4==4.12.2 # via # -c requirements/airflow/constraints.txt @@ -97,13 +109,13 @@ blinker==1.6.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -boto3==1.26.76 +boto3==1.28.17 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.29.76 +botocore==1.31.17 # via # -c requirements/airflow/constraints.txt # boto3 @@ -114,11 +126,11 @@ cachelib==0.9.0 # -c requirements/airflow/constraints.txt # flask-caching # flask-session -cattrs==22.2.0 +cattrs==23.1.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -certifi==2023.5.7 +certifi==2023.7.22 # via # -c requirements/airflow/constraints.txt # httpcore @@ -128,12 +140,12 @@ cffi==1.15.1 # via # -c requirements/airflow/constraints.txt # cryptography -charset-normalizer==2.1.1 +charset-normalizer==3.2.0 # via # -c requirements/airflow/constraints.txt # aiohttp # requests -click==8.1.3 +click==8.1.7 # via # -c requirements/airflow/constraints.txt # clickclick @@ -159,15 +171,15 @@ connexion[flask]==2.14.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -cron-descriptor==1.3.0 +cron-descriptor==1.4.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -croniter==1.3.14 +croniter==1.4.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -cryptography==40.0.2 +cryptography==41.0.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -175,20 +187,21 @@ 
decorator==5.1.1 # via # -c requirements/airflow/constraints.txt # jsonpath-ng -deprecated==1.2.13 +deprecated==1.2.14 # via # -c requirements/airflow/constraints.txt # apache-airflow # limits + # opentelemetry-api dill==0.3.1.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -dnspython==2.3.0 +dnspython==2.4.2 # via # -c requirements/airflow/constraints.txt # email-validator -docutils==0.20 +docutils==0.20.1 # via # -c requirements/airflow/constraints.txt # python-daemon @@ -196,9 +209,10 @@ email-validator==1.3.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder -exceptiongroup==1.1.1 +exceptiongroup==1.1.3 # via # -c requirements/airflow/constraints.txt + # anyio # cattrs flask==2.2.5 # via @@ -214,7 +228,7 @@ flask==2.2.5 # flask-session # flask-sqlalchemy # flask-wtf -flask-appbuilder==4.3.0 +flask-appbuilder==4.3.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -226,7 +240,7 @@ flask-caching==2.0.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -flask-jwt-extended==4.4.4 +flask-jwt-extended==4.5.2 # via # -c requirements/airflow/constraints.txt # flask-appbuilder @@ -252,11 +266,20 @@ flask-wtf==1.1.1 # -c requirements/airflow/constraints.txt # apache-airflow # flask-appbuilder -frozenlist==1.3.3 +frozenlist==1.4.0 # via # -c requirements/airflow/constraints.txt # aiohttp # aiosignal +google-re2==1.1 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow +googleapis-common-protos==1.60.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http graphviz==0.20.1 # via # -c requirements/airflow/constraints.txt @@ -265,7 +288,11 @@ greenlet==2.0.2 # via # -c requirements/airflow/constraints.txt # sqlalchemy -gunicorn==20.1.0 +grpcio==1.57.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc +gunicorn==21.2.0 # via # -c 
requirements/airflow/constraints.txt # apache-airflow @@ -289,7 +316,7 @@ idna==3.4 # requests # rfc3986 # yarl -importlib-resources==5.12.0 +importlib-resources==6.0.1 # via # -c requirements/airflow/constraints.txt # limits @@ -320,17 +347,21 @@ jsonpath-ng==1.5.3 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -jsonschema==4.17.3 +jsonschema==4.19.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # connexion # flask-appbuilder +jsonschema-specifications==2023.7.1 + # via + # -c requirements/airflow/constraints.txt + # jsonschema lazy-object-proxy==1.9.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -limits==3.4.0 +limits==3.5.0 # via # -c requirements/airflow/constraints.txt # flask-limiter @@ -343,7 +374,7 @@ lockfile==0.12.2 # -c requirements/airflow/constraints.txt # apache-airflow # python-daemon -lxml==4.9.2 +lxml==4.9.3 # via # -c requirements/airflow/constraints.txt # redshift-connector @@ -351,17 +382,17 @@ mako==1.2.4 # via # -c requirements/airflow/constraints.txt # alembic -markdown==3.4.3 +markdown==3.4.4 # via # -c requirements/airflow/constraints.txt # apache-airflow -markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # mdit-py-plugins # rich -markupsafe==2.1.2 +markupsafe==2.1.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -369,17 +400,12 @@ markupsafe==2.1.2 # mako # werkzeug # wtforms -marshmallow==3.19.0 +marshmallow==3.20.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder - # marshmallow-enum # marshmallow-oneofschema # marshmallow-sqlalchemy -marshmallow-enum==1.5.1 - # via - # -c requirements/airflow/constraints.txt - # flask-appbuilder marshmallow-oneofschema==3.0.1 # via # -c requirements/airflow/constraints.txt @@ -388,7 +414,7 @@ marshmallow-sqlalchemy==0.26.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder -mdit-py-plugins==0.3.5 
+mdit-py-plugins==0.4.0 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -401,32 +427,71 @@ multidict==6.0.4 # -c requirements/airflow/constraints.txt # aiohttp # yarl -mypy-boto3-appflow==1.26.125 +mypy-boto3-appflow==1.28.16 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -mypy-boto3-rds==1.26.132 +mypy-boto3-rds==1.28.19 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-redshift-data==1.28.16 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon +mypy-boto3-s3==1.28.27 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow-providers-amazon +opentelemetry-api==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk +opentelemetry-exporter-otlp==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # apache-airflow +opentelemetry-exporter-otlp-proto-grpc==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp +opentelemetry-proto==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.15.0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.36b0 + # via + # -c requirements/airflow/constraints.txt + # opentelemetry-sdk ordered-set==4.1.0 # via # -c requirements/airflow/constraints.txt # flask-limiter -packaging==21.3 +packaging==23.1 # via # -c requirements/airflow/constraints.txt # apache-airflow + # apispec # connexion + # gunicorn # limits # marshmallow # 
redshift-connector # sqlalchemy-redshift -pathspec==0.9.0 +pathspec==0.11.2 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -434,7 +499,7 @@ pendulum==2.1.2 # via # -c requirements/airflow/constraints.txt # apache-airflow -pluggy==1.0.0 +pluggy==1.2.0 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -446,13 +511,18 @@ prison==0.2.1 # via # -c requirements/airflow/constraints.txt # flask-appbuilder +protobuf==4.21.12 + # via + # -c requirements/airflow/constraints.txt + # googleapis-common-protos + # opentelemetry-proto psutil==5.9.5 # via # -c requirements/airflow/constraints.txt # apache-airflow -psycopg2==2.9.6 +psycopg2==2.9.7 # via -r requirements/airflow/base.in -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-postgres @@ -460,29 +530,21 @@ pycparser==2.21 # via # -c requirements/airflow/constraints.txt # cffi -pydantic==1.10.7 +pydantic==1.10.12 # via # -c requirements/airflow/constraints.txt # apache-airflow -pygments==2.15.1 +pygments==2.16.1 # via # -c requirements/airflow/constraints.txt # apache-airflow # rich -pyjwt==2.7.0 +pyjwt==2.8.0 # via # -c requirements/airflow/constraints.txt # apache-airflow # flask-appbuilder # flask-jwt-extended -pyparsing==3.0.9 - # via - # -c requirements/airflow/constraints.txt - # packaging -pyrsistent==0.19.3 - # via - # -c requirements/airflow/constraints.txt - # jsonschema python-daemon==3.0.1 # via # -c requirements/airflow/constraints.txt @@ -513,21 +575,27 @@ pytzdata==2020.1 # via # -c requirements/airflow/constraints.txt # pendulum -pyyaml==6.0 +pyyaml==6.0.1 # via # -c requirements/airflow/constraints.txt # apispec # clickclick # connexion -redshift-connector==2.0.910 +redshift-connector==2.0.913 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-amazon -requests==2.30.0 +referencing==0.30.2 + # via + # -c requirements/airflow/constraints.txt + # jsonschema + # 
jsonschema-specifications +requests==2.31.0 # via # -c requirements/airflow/constraints.txt # apache-airflow-providers-http # connexion + # opentelemetry-exporter-otlp-proto-http # redshift-connector # requests-toolbelt requests-toolbelt==1.0.0 @@ -542,17 +610,22 @@ rfc3986[idna2008]==1.5.0 # via # -c requirements/airflow/constraints.txt # httpx -rich==13.3.5 +rich==13.5.2 # via # -c requirements/airflow/constraints.txt # apache-airflow # flask-limiter # rich-argparse -rich-argparse==1.1.0 +rich-argparse==1.2.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -s3transfer==0.6.1 +rpds-py==0.9.2 + # via + # -c requirements/airflow/constraints.txt + # jsonschema + # referencing +s3transfer==0.6.2 # via # -c requirements/airflow/constraints.txt # boto3 @@ -581,7 +654,7 @@ soupsieve==2.4.1 # via # -c requirements/airflow/constraints.txt # beautifulsoup4 -sqlalchemy==1.4.48 +sqlalchemy==1.4.49 # via # -c requirements/airflow/constraints.txt # alembic @@ -612,7 +685,7 @@ tabulate==0.9.0 # via # -c requirements/airflow/constraints.txt # apache-airflow -tenacity==8.2.2 +tenacity==8.2.3 # via # -c requirements/airflow/constraints.txt # apache-airflow @@ -624,13 +697,16 @@ text-unidecode==1.3 # via # -c requirements/airflow/constraints.txt # python-slugify -typing-extensions==4.5.0 +typing-extensions==4.7.1 # via # -c requirements/airflow/constraints.txt # alembic # apache-airflow + # asgiref + # cattrs # flask-limiter # limits + # opentelemetry-sdk # pydantic uc-micro-py==1.0.2 # via @@ -640,7 +716,7 @@ unicodecsv==0.14.1 # via # -c requirements/airflow/constraints.txt # apache-airflow -urllib3==1.26.15 +urllib3==1.26.16 # via # -c requirements/airflow/constraints.txt # botocore diff --git a/pipeline/requirements/dev/requirements.in b/pipeline/requirements/dev/requirements.in index 27ae5a3e8..165b0fbb6 100644 --- a/pipeline/requirements/dev/requirements.in +++ b/pipeline/requirements/dev/requirements.in @@ -1,7 +1,7 @@ -r ../airflow/base.in -r 
../tasks/python/requirements.in -pytest black pre-commit +pytest tox diff --git a/pipeline/requirements/dev/requirements.txt b/pipeline/requirements/dev/requirements.txt index 58bc698f3..17780236b 100644 --- a/pipeline/requirements/dev/requirements.txt +++ b/pipeline/requirements/dev/requirements.txt @@ -2,17 +2,17 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=requirements/dev/requirements.txt --resolver=backtracking requirements/dev/requirements.in +# pip-compile requirements/dev/requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via apache-airflow-providers-http aiosignal==1.3.1 # via aiohttp -alembic==1.11.1 +alembic==1.12.0 # via apache-airflow -anyio==3.7.0 +anyio==4.0.0 # via httpcore -apache-airflow[amazon,postgres]==2.6.1 +apache-airflow[amazon,postgres]==2.7.0 # via # -r requirements/dev/../airflow/base.in # apache-airflow-providers-amazon @@ -22,29 +22,31 @@ apache-airflow[amazon,postgres]==2.6.1 # apache-airflow-providers-imap # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-amazon==8.1.0 +apache-airflow-providers-amazon==8.6.0 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow -apache-airflow-providers-common-sql==1.5.1 +apache-airflow-providers-common-sql==1.7.1 # via # apache-airflow # apache-airflow-providers-amazon # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-ftp==3.4.1 +apache-airflow-providers-ftp==3.5.1 # via apache-airflow -apache-airflow-providers-http==4.4.1 - # via apache-airflow -apache-airflow-providers-imap==3.2.1 +apache-airflow-providers-http==4.5.1 + # via + # apache-airflow + # apache-airflow-providers-amazon +apache-airflow-providers-imap==3.3.1 # via apache-airflow -apache-airflow-providers-postgres==5.5.0 +apache-airflow-providers-postgres==5.6.0 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow 
-apache-airflow-providers-sqlite==3.4.1 +apache-airflow-providers-sqlite==3.4.3 # via apache-airflow -apispec[yaml]==5.2.2 +apispec[yaml]==6.3.0 # via flask-appbuilder argcomplete==3.1.1 # via apache-airflow @@ -55,7 +57,7 @@ asgiref==3.7.2 # apache-airflow-providers-http asn1crypto==1.5.1 # via scramp -async-timeout==4.0.2 +async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via @@ -64,21 +66,31 @@ attrs==23.1.0 # cattrs # fiona # jsonschema + # referencing babel==2.12.1 # via flask-babel +backoff==2.2.1 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +backports-datetime-fromisoformat==2.0.0 + # via htmldate beautifulsoup4==4.12.2 - # via redshift-connector -black==23.3.0 + # via + # -r requirements/dev/../tasks/python/requirements.in + # redshift-connector +black==23.7.0 # via -r requirements/dev/requirements.in blinker==1.6.2 # via apache-airflow -boto3==1.26.151 +boto3==1.28.40 # via # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.29.151 +botocore==1.31.40 # via + # apache-airflow-providers-amazon # boto3 # redshift-connector # s3transfer @@ -90,7 +102,7 @@ cachetools==5.3.1 # via tox cattrs==23.1.2 # via apache-airflow -certifi==2023.5.7 +certifi==2023.7.22 # via # fiona # httpcore @@ -100,17 +112,17 @@ certifi==2023.5.7 # trafilatura cffi==1.15.1 # via cryptography -cfgv==3.3.1 +cfgv==3.4.0 # via pre-commit -chardet==5.1.0 +chardet==5.2.0 # via tox -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via # aiohttp # htmldate # requests # trafilatura -click==8.1.3 +click==8.1.7 # via # black # click-plugins @@ -139,9 +151,9 @@ courlan==0.9.3 # via trafilatura cron-descriptor==1.4.0 # via apache-airflow -croniter==1.3.15 +croniter==1.4.1 # via apache-airflow -cryptography==41.0.1 +cryptography==41.0.3 # via apache-airflow dateparser==1.1.8 # via htmldate @@ -151,11 +163,12 @@ deprecated==1.2.14 # via # apache-airflow # limits -dill==0.3.6 + # opentelemetry-api +dill==0.3.7 # via 
apache-airflow -distlib==0.3.6 +distlib==0.3.7 # via virtualenv -dnspython==2.3.0 +dnspython==2.4.2 # via email-validator docutils==0.20.1 # via python-daemon @@ -163,12 +176,12 @@ email-validator==1.3.1 # via flask-appbuilder et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.1.1 +exceptiongroup==1.1.3 # via # anyio # cattrs # pytest -filelock==3.12.1 +filelock==3.12.3 # via # tox # virtualenv @@ -187,7 +200,7 @@ flask==2.2.5 # flask-session # flask-sqlalchemy # flask-wtf -flask-appbuilder==4.3.0 +flask-appbuilder==4.3.3 # via apache-airflow flask-babel==2.0.0 # via flask-appbuilder @@ -195,7 +208,7 @@ flask-caching==2.0.2 # via apache-airflow flask-jwt-extended==4.5.2 # via flask-appbuilder -flask-limiter==3.3.1 +flask-limiter==3.5.0 # via flask-appbuilder flask-login==0.6.2 # via @@ -209,29 +222,37 @@ flask-wtf==1.1.1 # via # apache-airflow # flask-appbuilder -frozenlist==1.3.3 +frozenlist==1.4.0 # via # aiohttp # aiosignal -geoalchemy2==0.13.3 +geoalchemy2==0.14.1 # via -r requirements/dev/../tasks/python/requirements.in geopandas==0.13.2 # via -r requirements/dev/../tasks/python/requirements.in +google-re2==1.1 + # via apache-airflow +googleapis-common-protos==1.60.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http graphviz==0.20.1 # via apache-airflow greenlet==2.0.2 # via sqlalchemy -gunicorn==20.1.0 +grpcio==1.57.0 + # via opentelemetry-exporter-otlp-proto-grpc +gunicorn==21.2.0 # via apache-airflow h11==0.14.0 # via httpcore -htmldate==1.4.3 +htmldate==1.5.0 # via trafilatura -httpcore==0.17.2 +httpcore==0.17.3 # via httpx httpx==0.24.1 # via apache-airflow -identify==2.5.24 +identify==2.5.27 # via pre-commit idna==3.4 # via @@ -240,10 +261,12 @@ idna==3.4 # httpx # requests # yarl -importlib-resources==5.12.0 +importlib-resources==6.0.1 # via limits inflection==0.5.1 - # via connexion + # via + # connexion + # pyairtable iniconfig==2.0.0 # via pytest itsdangerous==2.1.2 @@ -264,18 +287,20 @@ jmespath==1.0.1 # 
botocore jsonpath-ng==1.5.3 # via apache-airflow-providers-amazon -jsonschema==4.17.3 +jsonschema==4.19.0 # via # apache-airflow # connexion # flask-appbuilder +jsonschema-specifications==2023.7.1 + # via jsonschema justext==3.0.0 # via trafilatura langcodes==3.3.0 # via courlan lazy-object-proxy==1.9.0 # via apache-airflow -limits==3.5.0 +limits==3.6.0 # via flask-limiter linkify-it-py==2.0.2 # via apache-airflow @@ -283,7 +308,7 @@ lockfile==0.12.2 # via # apache-airflow # python-daemon -lxml==4.9.2 +lxml==4.9.3 # via # htmldate # justext @@ -291,9 +316,9 @@ lxml==4.9.2 # trafilatura mako==1.2.4 # via alembic -markdown==3.4.3 +markdown==3.4.4 # via apache-airflow -markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via # apache-airflow # mdit-py-plugins @@ -305,14 +330,11 @@ markupsafe==2.1.3 # mako # werkzeug # wtforms -marshmallow==3.19.0 +marshmallow==3.20.1 # via # flask-appbuilder - # marshmallow-enum # marshmallow-oneofschema # marshmallow-sqlalchemy -marshmallow-enum==1.5.1 - # via flask-appbuilder marshmallow-oneofschema==3.0.1 # via apache-airflow marshmallow-sqlalchemy==0.26.1 @@ -325,33 +347,57 @@ multidict==6.0.4 # via # aiohttp # yarl -mypy-boto3-appflow==1.26.145 +mypy-boto3-appflow==1.28.38 # via apache-airflow-providers-amazon -mypy-boto3-rds==1.26.144 +mypy-boto3-rds==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-redshift-data==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-s3==1.26.127 +mypy-boto3-s3==1.28.36 # via apache-airflow-providers-amazon mypy-extensions==1.0.0 # via black nodeenv==1.8.0 # via pre-commit -numpy==1.24.3 +numpy==1.25.2 # via # pandas # shapely openpyxl==3.1.2 # via -r requirements/dev/../tasks/python/requirements.in +opentelemetry-api==1.15.0 + # via + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk +opentelemetry-exporter-otlp==1.15.0 + # via apache-airflow 
+opentelemetry-exporter-otlp-proto-grpc==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-proto==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.36b0 + # via opentelemetry-sdk ordered-set==4.1.0 # via flask-limiter packaging==23.1 # via # apache-airflow + # apispec # black # connexion # geoalchemy2 # geopandas + # gunicorn # limits # marshmallow # pyproject-api @@ -359,62 +405,66 @@ packaging==23.1 # redshift-connector # sqlalchemy-redshift # tox -pandas==2.0.2 +pandas==2.1.0 # via # -r requirements/dev/../tasks/python/requirements.in # geopandas -pathspec==0.9.0 +pathspec==0.11.2 # via # apache-airflow # black pendulum==2.1.2 # via apache-airflow -platformdirs==3.5.3 +platformdirs==3.10.0 # via # black # tox # virtualenv -pluggy==1.0.0 +pluggy==1.3.0 # via # apache-airflow # pytest # tox ply==3.11 # via jsonpath-ng -pre-commit==3.3.2 +pre-commit==3.4.0 # via -r requirements/dev/requirements.in prison==0.2.1 # via flask-appbuilder +protobuf==4.24.2 + # via + # googleapis-common-protos + # opentelemetry-proto psutil==5.9.5 # via apache-airflow -psycopg2==2.9.6 +psycopg2==2.9.7 # via # -r requirements/dev/../airflow/base.in # -r requirements/dev/../tasks/python/requirements.in -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via apache-airflow-providers-postgres -pyairtable==1.5.0 +pyairtable==2.1.0.post1 # via -r requirements/dev/../tasks/python/requirements.in pycparser==2.21 # via cffi -pydantic==1.10.9 - # via apache-airflow -pygments==2.15.1 +pydantic==1.10.12 + # via + # apache-airflow + # pyairtable +pygments==2.16.1 # via # apache-airflow # rich -pyjwt==2.7.0 +pyjwt==2.8.0 # via # apache-airflow # flask-appbuilder # flask-jwt-extended -pyproj==3.5.0 
+pyproj==3.6.0 # via geopandas -pyproject-api==1.5.1 +pyproject-api==1.6.1 # via tox -pyrsistent==0.19.3 - # via jsonschema -pytest==7.3.2 +pytest==7.4.1 # via -r requirements/dev/requirements.in python-daemon==3.0.1 # via apache-airflow @@ -434,7 +484,7 @@ python-slugify==8.0.1 # via # apache-airflow # python-nvd3 -pytz==2023.3 +pytz==2023.3.post1 # via # dateparser # flask-babel @@ -442,21 +492,26 @@ pytz==2023.3 # redshift-connector pytzdata==2020.1 # via pendulum -pyyaml==6.0 +pyyaml==6.0.1 # via # apispec # clickclick # connexion # pre-commit -redshift-connector==2.0.911 +redshift-connector==2.0.913 # via apache-airflow-providers-amazon -regex==2023.6.3 +referencing==0.30.2 + # via + # jsonschema + # jsonschema-specifications +regex==2023.8.8 # via dateparser requests==2.31.0 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow-providers-http # connexion + # opentelemetry-exporter-otlp-proto-http # pyairtable # redshift-connector # requests-toolbelt @@ -464,14 +519,18 @@ requests-toolbelt==1.0.0 # via apache-airflow-providers-http rfc3339-validator==0.1.4 # via apache-airflow -rich==13.4.1 +rich==13.5.2 # via # apache-airflow # flask-limiter # rich-argparse -rich-argparse==1.1.1 +rich-argparse==1.3.0 # via apache-airflow -s3transfer==0.6.1 +rpds-py==0.10.2 + # via + # jsonschema + # referencing +s3transfer==0.6.2 # via boto3 scramp==1.4.4 # via redshift-connector @@ -491,9 +550,9 @@ sniffio==1.3.0 # anyio # httpcore # httpx -soupsieve==2.4.1 +soupsieve==2.5 # via beautifulsoup4 -sqlalchemy==1.4.48 +sqlalchemy==1.4.49 # via # -r requirements/dev/../tasks/python/requirements.in # alembic @@ -515,7 +574,7 @@ sqlparse==0.4.4 # via apache-airflow-providers-common-sql tabulate==0.9.0 # via apache-airflow -tenacity==8.2.2 +tenacity==8.2.3 # via # -r requirements/dev/../tasks/python/requirements.in # apache-airflow @@ -531,20 +590,27 @@ tomli==2.0.1 # pyproject-api # pytest # tox -tox==4.6.0 +tox==4.11.1 # via -r 
requirements/dev/requirements.in -tqdm==4.65.0 +tqdm==4.66.1 # via -r requirements/dev/../tasks/python/requirements.in trafilatura==1.6.1 # via -r requirements/dev/../tasks/python/requirements.in -typing-extensions==4.6.3 +typing-extensions==4.7.1 # via # alembic # apache-airflow # asgiref # cattrs + # filelock # flask-limiter # limits + # mypy-boto3-appflow + # mypy-boto3-rds + # mypy-boto3-redshift-data + # mypy-boto3-s3 + # opentelemetry-sdk + # pyairtable # pydantic tzdata==2023.3 # via pandas @@ -563,7 +629,7 @@ urllib3==1.26.16 # pyairtable # requests # trafilatura -virtualenv==20.23.0 +virtualenv==20.24.4 # via # pre-commit # tox diff --git a/pipeline/requirements/tasks/dbt/requirements.in b/pipeline/requirements/tasks/dbt/requirements.in index ea174e9af..344fb97f4 100644 --- a/pipeline/requirements/tasks/dbt/requirements.in +++ b/pipeline/requirements/tasks/dbt/requirements.in @@ -1,2 +1,2 @@ -dbt-core -dbt-postgres \ No newline at end of file +dbt-core~=1.6.1 +dbt-postgres~=1.6.1 \ No newline at end of file diff --git a/pipeline/requirements/tasks/dbt/requirements.txt b/pipeline/requirements/tasks/dbt/requirements.txt index c2ef4427e..bd3798323 100644 --- a/pipeline/requirements/tasks/dbt/requirements.txt +++ b/pipeline/requirements/tasks/dbt/requirements.txt @@ -4,86 +4,102 @@ # # pip-compile --output-file=requirements/tasks/dbt/requirements.txt --resolver=backtracking requirements/tasks/dbt/requirements.in # -agate==1.7.0 - # via dbt-core +agate==1.7.1 + # via + # dbt-core + # dbt-postgres attrs==23.1.0 # via jsonschema babel==2.12.1 # via agate -certifi==2023.5.7 +certifi==2023.7.22 # via requests cffi==1.15.1 # via dbt-core -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via requests -click==8.1.3 - # via dbt-core +click==8.1.7 + # via + # dbt-core + # dbt-semantic-interfaces colorama==0.4.6 # via dbt-core -dbt-core==1.5.1 +dbt-core==1.6.1 # via # -r requirements/tasks/dbt/requirements.in # dbt-postgres dbt-extractor==0.4.1 # via dbt-core 
-dbt-postgres==1.5.1 +dbt-postgres==1.6.1 # via -r requirements/tasks/dbt/requirements.in -future==0.18.3 - # via parsedatetime +dbt-semantic-interfaces==0.2.0 + # via dbt-core hologram==0.0.16 # via dbt-core idna==3.4 # via # dbt-core # requests +importlib-metadata==6.8.0 + # via dbt-semantic-interfaces isodate==0.6.1 # via # agate # dbt-core jinja2==3.1.2 - # via dbt-core -jsonschema==4.17.3 - # via hologram + # via + # dbt-core + # dbt-semantic-interfaces +jsonschema==3.2.0 + # via + # dbt-semantic-interfaces + # hologram leather==0.3.4 # via agate logbook==1.5.3 # via dbt-core markupsafe==2.1.3 - # via - # jinja2 - # werkzeug -mashumaro[msgpack]==3.6 + # via jinja2 +mashumaro[msgpack]==3.8.1 # via dbt-core minimal-snowplow-tracker==0.0.2 # via dbt-core +more-itertools==8.14.0 + # via dbt-semantic-interfaces msgpack==1.0.5 # via mashumaro -networkx==2.8.8 +networkx==3.1 # via dbt-core packaging==23.1 # via dbt-core -parsedatetime==2.4 +parsedatetime==2.6 # via agate -pathspec==0.11.1 +pathspec==0.11.2 # via dbt-core -protobuf==4.23.2 +protobuf==4.24.2 # via dbt-core -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via dbt-postgres pycparser==2.21 # via cffi +pydantic==1.10.12 + # via dbt-semantic-interfaces pyrsistent==0.19.3 # via jsonschema python-dateutil==2.8.2 - # via hologram + # via + # dbt-semantic-interfaces + # hologram python-slugify==8.0.1 # via agate pytimeparse==1.1.8 # via agate pytz==2023.3 # via dbt-core -pyyaml==6.0 - # via dbt-core +pyyaml==6.0.1 + # via + # dbt-core + # dbt-semantic-interfaces requests==2.31.0 # via # dbt-core @@ -91,18 +107,26 @@ requests==2.31.0 six==1.16.0 # via # isodate + # jsonschema # leather # minimal-snowplow-tracker # python-dateutil -sqlparse==0.4.3 +sqlparse==0.4.4 # via dbt-core text-unidecode==1.3 # via python-slugify -typing-extensions==4.6.3 +typing-extensions==4.7.1 # via # dbt-core + # dbt-semantic-interfaces # mashumaro -urllib3==2.0.3 - # via requests -werkzeug==2.3.6 - # via dbt-core + # pydantic 
+urllib3==1.26.16 + # via + # dbt-core + # requests +zipp==3.16.2 + # via importlib-metadata + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/pipeline/requirements/tasks/pipx/requirements.in b/pipeline/requirements/tasks/pipx/requirements.in index 868cf8b3f..89cd6a74c 100644 --- a/pipeline/requirements/tasks/pipx/requirements.in +++ b/pipeline/requirements/tasks/pipx/requirements.in @@ -1 +1 @@ -pipx \ No newline at end of file +pipx~=1.2 \ No newline at end of file diff --git a/pipeline/requirements/tasks/pipx/requirements.txt b/pipeline/requirements/tasks/pipx/requirements.txt index 20105e3b6..87a8d7b7c 100644 --- a/pipeline/requirements/tasks/pipx/requirements.txt +++ b/pipeline/requirements/tasks/pipx/requirements.txt @@ -6,11 +6,11 @@ # argcomplete==3.1.1 # via pipx -click==8.1.3 +click==8.1.7 # via userpath packaging==23.1 # via pipx pipx==1.2.0 # via -r requirements/tasks/pipx/requirements.in -userpath==1.8.0 +userpath==1.9.0 # via pipx diff --git a/pipeline/requirements/tasks/python/requirements.in b/pipeline/requirements/tasks/python/requirements.in index f3950fbcd..9a40fe714 100644 --- a/pipeline/requirements/tasks/python/requirements.in +++ b/pipeline/requirements/tasks/python/requirements.in @@ -1,16 +1,17 @@ apache-airflow-providers-postgres apache-airflow-providers-amazon +beautifulsoup4~=4.12.2 GeoAlchemy2 -geopandas -openpyxl!=3.1.1 -pandas -psycopg2 -pyairtable -requests +geopandas~=0.13.2 +openpyxl~=3.1.2 +pandas~=2.1.0 +psycopg2~=2.9.7 +pyairtable~=2.1 +requests~=2.31 SQLAlchemy -tenacity -tqdm -trafilatura +tenacity~=8.2 +tqdm~=4.66 +trafilatura~=1.6 urllib3 -xlsxwriter \ No newline at end of file +xlsxwriter~=3.1.2 \ No newline at end of file diff --git a/pipeline/requirements/tasks/python/requirements.txt b/pipeline/requirements/tasks/python/requirements.txt index 8a630a21e..56da60a45 100644 --- a/pipeline/requirements/tasks/python/requirements.txt +++ 
b/pipeline/requirements/tasks/python/requirements.txt @@ -2,17 +2,17 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=requirements/tasks/python/requirements.txt --resolver=backtracking requirements/tasks/python/requirements.in +# pip-compile requirements/tasks/python/requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.5 # via apache-airflow-providers-http aiosignal==1.3.1 # via aiohttp -alembic==1.11.1 +alembic==1.12.0 # via apache-airflow -anyio==3.7.0 +anyio==4.0.0 # via httpcore -apache-airflow==2.6.1 +apache-airflow==2.7.0 # via # apache-airflow-providers-amazon # apache-airflow-providers-common-sql @@ -21,25 +21,27 @@ apache-airflow==2.6.1 # apache-airflow-providers-imap # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-amazon==8.1.0 +apache-airflow-providers-amazon==8.6.0 # via -r requirements/tasks/python/requirements.in -apache-airflow-providers-common-sql==1.5.1 +apache-airflow-providers-common-sql==1.7.1 # via # apache-airflow # apache-airflow-providers-amazon # apache-airflow-providers-postgres # apache-airflow-providers-sqlite -apache-airflow-providers-ftp==3.4.1 +apache-airflow-providers-ftp==3.5.1 # via apache-airflow -apache-airflow-providers-http==4.4.1 - # via apache-airflow -apache-airflow-providers-imap==3.2.1 +apache-airflow-providers-http==4.5.1 + # via + # apache-airflow + # apache-airflow-providers-amazon +apache-airflow-providers-imap==3.3.1 # via apache-airflow -apache-airflow-providers-postgres==5.5.0 +apache-airflow-providers-postgres==5.6.0 # via -r requirements/tasks/python/requirements.in -apache-airflow-providers-sqlite==3.4.1 +apache-airflow-providers-sqlite==3.4.3 # via apache-airflow -apispec[yaml]==5.2.2 +apispec[yaml]==6.3.0 # via flask-appbuilder argcomplete==3.1.1 # via apache-airflow @@ -50,7 +52,7 @@ asgiref==3.7.2 # apache-airflow-providers-http asn1crypto==1.5.1 # via scramp -async-timeout==4.0.2 
+async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via @@ -59,19 +61,29 @@ attrs==23.1.0 # cattrs # fiona # jsonschema + # referencing babel==2.12.1 # via flask-babel +backoff==2.2.1 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +backports-datetime-fromisoformat==2.0.0 + # via htmldate beautifulsoup4==4.12.2 - # via redshift-connector + # via + # -r requirements/tasks/python/requirements.in + # redshift-connector blinker==1.6.2 # via apache-airflow -boto3==1.26.151 +boto3==1.28.40 # via # apache-airflow-providers-amazon # redshift-connector # watchtower -botocore==1.29.151 +botocore==1.31.40 # via + # apache-airflow-providers-amazon # boto3 # redshift-connector # s3transfer @@ -81,7 +93,7 @@ cachelib==0.9.0 # flask-session cattrs==23.1.2 # via apache-airflow -certifi==2023.5.7 +certifi==2023.7.22 # via # fiona # httpcore @@ -91,13 +103,13 @@ certifi==2023.5.7 # trafilatura cffi==1.15.1 # via cryptography -charset-normalizer==3.1.0 +charset-normalizer==3.2.0 # via # aiohttp # htmldate # requests # trafilatura -click==8.1.3 +click==8.1.7 # via # click-plugins # clickclick @@ -123,9 +135,9 @@ courlan==0.9.3 # via trafilatura cron-descriptor==1.4.0 # via apache-airflow -croniter==1.3.15 +croniter==1.4.1 # via apache-airflow -cryptography==41.0.1 +cryptography==41.0.3 # via apache-airflow dateparser==1.1.8 # via htmldate @@ -135,9 +147,10 @@ deprecated==1.2.14 # via # apache-airflow # limits -dill==0.3.6 + # opentelemetry-api +dill==0.3.7 # via apache-airflow -dnspython==2.3.0 +dnspython==2.4.2 # via email-validator docutils==0.20.1 # via python-daemon @@ -145,7 +158,7 @@ email-validator==1.3.1 # via flask-appbuilder et-xmlfile==1.1.0 # via openpyxl -exceptiongroup==1.1.1 +exceptiongroup==1.1.3 # via # anyio # cattrs @@ -164,7 +177,7 @@ flask==2.2.5 # flask-session # flask-sqlalchemy # flask-wtf -flask-appbuilder==4.3.0 +flask-appbuilder==4.3.3 # via apache-airflow flask-babel==2.0.0 # via flask-appbuilder @@ -172,7 
+185,7 @@ flask-caching==2.0.2 # via apache-airflow flask-jwt-extended==4.5.2 # via flask-appbuilder -flask-limiter==3.3.1 +flask-limiter==3.5.0 # via flask-appbuilder flask-login==0.6.2 # via @@ -186,25 +199,33 @@ flask-wtf==1.1.1 # via # apache-airflow # flask-appbuilder -frozenlist==1.3.3 +frozenlist==1.4.0 # via # aiohttp # aiosignal -geoalchemy2==0.13.3 +geoalchemy2==0.14.1 # via -r requirements/tasks/python/requirements.in geopandas==0.13.2 # via -r requirements/tasks/python/requirements.in +google-re2==1.1 + # via apache-airflow +googleapis-common-protos==1.60.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http graphviz==0.20.1 # via apache-airflow greenlet==2.0.2 # via sqlalchemy -gunicorn==20.1.0 +grpcio==1.57.0 + # via opentelemetry-exporter-otlp-proto-grpc +gunicorn==21.2.0 # via apache-airflow h11==0.14.0 # via httpcore -htmldate==1.4.3 +htmldate==1.5.0 # via trafilatura -httpcore==0.17.2 +httpcore==0.17.3 # via httpx httpx==0.24.1 # via apache-airflow @@ -215,10 +236,12 @@ idna==3.4 # httpx # requests # yarl -importlib-resources==5.12.0 +importlib-resources==6.0.1 # via limits inflection==0.5.1 - # via connexion + # via + # connexion + # pyairtable itsdangerous==2.1.2 # via # apache-airflow @@ -237,18 +260,20 @@ jmespath==1.0.1 # botocore jsonpath-ng==1.5.3 # via apache-airflow-providers-amazon -jsonschema==4.17.3 +jsonschema==4.19.0 # via # apache-airflow # connexion # flask-appbuilder +jsonschema-specifications==2023.7.1 + # via jsonschema justext==3.0.0 # via trafilatura langcodes==3.3.0 # via courlan lazy-object-proxy==1.9.0 # via apache-airflow -limits==3.5.0 +limits==3.6.0 # via flask-limiter linkify-it-py==2.0.2 # via apache-airflow @@ -256,7 +281,7 @@ lockfile==0.12.2 # via # apache-airflow # python-daemon -lxml==4.9.2 +lxml==4.9.3 # via # htmldate # justext @@ -264,9 +289,9 @@ lxml==4.9.2 # trafilatura mako==1.2.4 # via alembic -markdown==3.4.3 +markdown==3.4.4 # via apache-airflow 
-markdown-it-py==2.2.0 +markdown-it-py==3.0.0 # via # apache-airflow # mdit-py-plugins @@ -278,14 +303,11 @@ markupsafe==2.1.3 # mako # werkzeug # wtforms -marshmallow==3.19.0 +marshmallow==3.20.1 # via # flask-appbuilder - # marshmallow-enum # marshmallow-oneofschema # marshmallow-sqlalchemy -marshmallow-enum==1.5.1 - # via flask-appbuilder marshmallow-oneofschema==3.0.1 # via apache-airflow marshmallow-sqlalchemy==0.26.1 @@ -298,71 +320,99 @@ multidict==6.0.4 # via # aiohttp # yarl -mypy-boto3-appflow==1.26.145 +mypy-boto3-appflow==1.28.38 # via apache-airflow-providers-amazon -mypy-boto3-rds==1.26.144 +mypy-boto3-rds==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-redshift-data==1.26.109 +mypy-boto3-redshift-data==1.28.36 # via apache-airflow-providers-amazon -mypy-boto3-s3==1.26.127 +mypy-boto3-s3==1.28.36 # via apache-airflow-providers-amazon -numpy==1.24.3 +numpy==1.25.2 # via # pandas # shapely openpyxl==3.1.2 # via -r requirements/tasks/python/requirements.in +opentelemetry-api==1.15.0 + # via + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk +opentelemetry-exporter-otlp==1.15.0 + # via apache-airflow +opentelemetry-exporter-otlp-proto-grpc==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.15.0 + # via opentelemetry-exporter-otlp +opentelemetry-proto==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.15.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.36b0 + # via opentelemetry-sdk ordered-set==4.1.0 # via flask-limiter packaging==23.1 # via # apache-airflow + # apispec # connexion # geoalchemy2 # geopandas + # gunicorn # limits # marshmallow # redshift-connector # sqlalchemy-redshift -pandas==2.0.2 +pandas==2.1.0 # via # -r requirements/tasks/python/requirements.in # geopandas 
-pathspec==0.9.0 +pathspec==0.11.2 # via apache-airflow pendulum==2.1.2 # via apache-airflow -pluggy==1.0.0 +pluggy==1.3.0 # via apache-airflow ply==3.11 # via jsonpath-ng prison==0.2.1 # via flask-appbuilder +protobuf==4.24.2 + # via + # googleapis-common-protos + # opentelemetry-proto psutil==5.9.5 # via apache-airflow -psycopg2==2.9.6 +psycopg2==2.9.7 # via -r requirements/tasks/python/requirements.in -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.7 # via apache-airflow-providers-postgres -pyairtable==1.5.0 +pyairtable==2.1.0.post1 # via -r requirements/tasks/python/requirements.in pycparser==2.21 # via cffi -pydantic==1.10.9 - # via apache-airflow -pygments==2.15.1 +pydantic==1.10.12 + # via + # apache-airflow + # pyairtable +pygments==2.16.1 # via # apache-airflow # rich -pyjwt==2.7.0 +pyjwt==2.8.0 # via # apache-airflow # flask-appbuilder # flask-jwt-extended -pyproj==3.5.0 +pyproj==3.6.0 # via geopandas -pyrsistent==0.19.3 - # via jsonschema python-daemon==3.0.1 # via apache-airflow python-dateutil==2.8.2 @@ -381,7 +431,7 @@ python-slugify==8.0.1 # via # apache-airflow # python-nvd3 -pytz==2023.3 +pytz==2023.3.post1 # via # dateparser # flask-babel @@ -389,20 +439,25 @@ pytz==2023.3 # redshift-connector pytzdata==2020.1 # via pendulum -pyyaml==6.0 +pyyaml==6.0.1 # via # apispec # clickclick # connexion -redshift-connector==2.0.911 +redshift-connector==2.0.913 # via apache-airflow-providers-amazon -regex==2023.6.3 +referencing==0.30.2 + # via + # jsonschema + # jsonschema-specifications +regex==2023.8.8 # via dateparser requests==2.31.0 # via # -r requirements/tasks/python/requirements.in # apache-airflow-providers-http # connexion + # opentelemetry-exporter-otlp-proto-http # pyairtable # redshift-connector # requests-toolbelt @@ -410,14 +465,18 @@ requests-toolbelt==1.0.0 # via apache-airflow-providers-http rfc3339-validator==0.1.4 # via apache-airflow -rich==13.4.1 +rich==13.5.2 # via # apache-airflow # flask-limiter # rich-argparse -rich-argparse==1.1.1 
+rich-argparse==1.3.0 # via apache-airflow -s3transfer==0.6.1 +rpds-py==0.10.2 + # via + # jsonschema + # referencing +s3transfer==0.6.2 # via boto3 scramp==1.4.4 # via redshift-connector @@ -437,9 +496,9 @@ sniffio==1.3.0 # anyio # httpcore # httpx -soupsieve==2.4.1 +soupsieve==2.5 # via beautifulsoup4 -sqlalchemy==1.4.48 +sqlalchemy==1.4.49 # via # -r requirements/tasks/python/requirements.in # alembic @@ -461,7 +520,7 @@ sqlparse==0.4.4 # via apache-airflow-providers-common-sql tabulate==0.9.0 # via apache-airflow -tenacity==8.2.2 +tenacity==8.2.3 # via # -r requirements/tasks/python/requirements.in # apache-airflow @@ -471,11 +530,11 @@ text-unidecode==1.3 # via python-slugify tld==0.13 # via courlan -tqdm==4.65.0 +tqdm==4.66.1 # via -r requirements/tasks/python/requirements.in trafilatura==1.6.1 # via -r requirements/tasks/python/requirements.in -typing-extensions==4.6.3 +typing-extensions==4.7.1 # via # alembic # apache-airflow @@ -483,6 +542,12 @@ typing-extensions==4.6.3 # cattrs # flask-limiter # limits + # mypy-boto3-appflow + # mypy-boto3-rds + # mypy-boto3-redshift-data + # mypy-boto3-s3 + # opentelemetry-sdk + # pyairtable # pydantic tzdata==2023.3 # via pandas diff --git a/pipeline/scripts/update_schema_seeds.py b/pipeline/scripts/update_schema_seeds.py index 89f96dcb0..5805f8bb6 100644 --- a/pipeline/scripts/update_schema_seeds.py +++ b/pipeline/scripts/update_schema_seeds.py @@ -7,7 +7,7 @@ BASE_URL = os.environ.get( "BASE_URL", - "https://raw.githubusercontent.com/betagouv/data-inclusion-schema/latest/schemas/extra/", # noqa: E501 + "https://raw.githubusercontent.com/gip-inclusion/data-inclusion-schema/latest/schemas/extra/", # noqa: E501 ) OUTPUT_DIR = Path(__file__).parent.parent / "dbt" / "seeds" / "schema" diff --git a/pipeline/src/data_inclusion/scripts/tasks/grist.py b/pipeline/src/data_inclusion/scripts/tasks/grist.py new file mode 100644 index 000000000..9abe72a5d --- /dev/null +++ b/pipeline/src/data_inclusion/scripts/tasks/grist.py @@ 
-0,0 +1,119 @@ +import logging +import re + +import requests + +logger = logging.getLogger(__name__) + + +def log_and_raise(resp: requests.Response, *args, **kwargs): + try: + resp.raise_for_status() + except requests.HTTPError as err: + logger.error(resp.json()) + raise err + + +class GristClient: + def __init__(self, base_url: str, token: str) -> None: + self.base_url = base_url.rstrip("/") + self.session = requests.Session() + self.session.hooks["response"] = [log_and_raise] + self.session.headers.update({"Authorization": f"Bearer {token}"}) + + def _create_document(self, workspace_id: str, document_name: str) -> str: + return self.session.post( + self.base_url + f"/workspaces/{workspace_id}/docs", + json={"name": document_name}, + ).json() + + def create_document(self, workspace_id: str, document_name: str) -> str: + workspace_dict = self.describe_workspace(workspace_id=workspace_id) + + existing_document_dict = next( + ( + document_dict + for document_dict in workspace_dict["docs"] + if document_dict["name"] == document_name + ), + None, + ) + + if existing_document_dict is not None: + logger.warning( + f"A document with name '{document_name}' already exists in workspace." 
+ ) + return existing_document_dict["id"] + + return self._create_document( + workspace_id=workspace_id, document_name=document_name + ) + + def _create_table(self, document_id: str, table_name: str, columns: list) -> str: + return self.session.post( + self.base_url + f"/docs/{document_id}/tables", + json={"tables": [{"id": table_name, "columns": columns}]}, + ).json()["tables"][0]["id"] + + def list_tables(self, document_id: str) -> list: + return self.session.get( + self.base_url + f"/docs/{document_id}/tables", + ).json()["tables"] + + def create_table(self, document_id: str, table_name: str, columns: list) -> str: + tables_list = self.list_tables(document_id=document_id) + + existing_table_dict = next( + ( + table_dict + for table_dict in tables_list + if table_dict["id"] == table_name + ), + None, + ) + + if existing_table_dict is not None: + logger.warning( + f"A table with name '{table_name}' already exists in document." + ) + return existing_table_dict["id"] + + return self._create_table( + document_id=document_id, table_name=table_name, columns=columns + ) + + def describe_workspace(self, workspace_id: str): + # https://support.getgrist.com/api/#tag/workspaces/paths/~1workspaces~1%7BworkspaceId%7D/get + return self.session.get(self.base_url + f"/workspaces/{workspace_id}").json() + + def download_table_content_as_csv(self, document_id: str, table_id: str) -> bytes: + # https://support.getgrist.com/api/#tag/docs/paths/~1docs~1%7BdocId%7D~1download~1csv/get + return self.session.get( + self.base_url + f"/docs/{document_id}/download/csv", + params={"tableId": table_id}, + ).content + + def add_records(self, document_id: str, table_id: str, records: list): + # https://support.getgrist.com/api/#tag/records/paths/~1docs~1%7BdocId%7D~1tables~1%7BtableId%7D~1records/post + return self.session.post( + self.base_url + f"/docs/{document_id}/tables/{table_id}/records", + json={"records": records}, + ) + + +def extract(url: str, token: str, **kwargs) -> bytes: + match 
= re.search( + r"(?P<base_url>.+)/docs/(?P<document_id>\w+)/download/csv\?.*tableId=(?P<table_id>\w+)", # noqa: E501 + url, + ) + + if match is None: + raise Exception("Invalid url") + + base_url, document_id, table_id = match.groups() + + grist_client = GristClient(base_url=base_url, token=token) + + return grist_client.download_table_content_as_csv( + document_id=document_id, table_id=table_id + ) diff --git a/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py b/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py index 45a102f00..8a570c5de 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py +++ b/pipeline/src/data_inclusion/scripts/tasks/mediation_numerique.py @@ -13,7 +13,7 @@ def get_resources_url_from_dataset_url(dataset_url: str) -> dict[str, str]: data_inclusion_resources = [ resource_data for resource_data in dataset_data["resources"] - if resource_data["schema"]["name"] == "betagouv/data-inclusion-schema" + if resource_data["schema"]["name"] == "gip-inclusion/data-inclusion-schema" ] # identify urls based on resource titles diff --git a/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py b/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py index bbb34ed85..c0c1e9022 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py +++ b/pipeline/src/data_inclusion/scripts/tasks/mes_aides.py @@ -10,7 +10,8 @@ def extract(url: str, token: str, **kwargs) -> bytes: base_id = url.split("/")[-3] table_name = url.split("/")[-2] - table = pyairtable.Table(api_key=token, base_id=base_id, table_name=table_name) + api = pyairtable.Api(api_key=token) + table = api.table(base_id=base_id, table_name=table_name) data = table.all() with io.StringIO() as buf: diff --git a/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py new file mode 100644 index 000000000..672c702bf --- /dev/null +++ b/pipeline/src/data_inclusion/scripts/tasks/reseau_alpha.py @@ -0,0 +1,300 @@ +import io 
+import json +import logging +import tarfile +import time +from pathlib import Path +from typing import Optional + +import bs4 +import numpy as np +import pandas as pd +import requests +import trafilatura +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +def log_and_raise(resp: requests.Response, *args, **kwargs): + try: + resp.raise_for_status() + except requests.HTTPError as err: + logger.error(resp.json()) + raise err + + +def extract_structures(url: str, **kwargs) -> bytes: + url = url.lstrip("/") + + session = requests.Session() + session.hooks["response"] = [log_and_raise] + + response = session.get(url + "/cartographie.json") + data = response.json() + + structures_df = pd.DataFrame.from_records(data["structures"]) + + with io.BytesIO() as out_buf: + with tarfile.open(fileobj=out_buf, mode="w:gz") as tar: + with io.BytesIO(response.content) as buf: + tar_info = tarfile.TarInfo("metadata.json") + tar_info.size = len(response.content) + tar.addfile(tar_info, buf) + + for _, row in tqdm(structures_df.iterrows()): + response = session.get(row.url) + + with io.BytesIO(response.content) as buf: + tar_info = tarfile.TarInfo(f"{row.id}.html") + tar_info.size = len(response.content) + tar.addfile(tar_info, buf) + + time.sleep(0.1) + return out_buf.getvalue() + + +def extract_formations(url: str, **kwargs) -> bytes: + url = url.lstrip("/") + + session = requests.Session() + session.hooks["response"] = [log_and_raise] + + response = session.get(url + "/cartographie.json") + data = response.json() + + formations_df = pd.json_normalize( + data["structures"], + record_path="formations", + meta="id", + meta_prefix="structure_", + max_level=0, + ) + + with io.BytesIO() as out_buf: + with tarfile.open(fileobj=out_buf, mode="w:gz") as tar: + with io.BytesIO(response.content) as buf: + tar_info = tarfile.TarInfo("metadata.json") + tar_info.size = len(response.content) + tar.addfile(tar_info, buf) + + for _, row in tqdm(formations_df.iterrows()): + response = 
session.get(row.url) + + with io.BytesIO(response.content) as buf: + tar_info = tarfile.TarInfo(f"{row.id}.html") + tar_info.size = len(response.content) + tar.addfile(tar_info, buf) + + time.sleep(0.1) + return out_buf.getvalue() + + +def scrap_structure_html(html_path: Path) -> dict: + with html_path.open() as f: + soup = bs4.BeautifulSoup(f, features="lxml") + data = {} + + NODE_BY_CONTENT_NAME = { + "adresse": soup.select_one(".adresse"), + "date_maj": soup.find(class_="structures-dates").find( + string=lambda text: "Date de la dernière modification :" in text + ), + "telephone": soup.select_one(".telephone > a"), + "site_web": soup.select_one(".contact-content").find( + string=lambda t: t.startswith("http://") + ), + "courriel": soup.select_one(".email > a:nth-child(1)"), + } + + for content_name, node in NODE_BY_CONTENT_NAME.items(): + data[f"content__{content_name}"] = html_to_markdown(node) + + return data + + +def scrap_formation_html(html_path: Path) -> dict: + def get_parent(node): + return node.parent if node is not None else None + + with html_path.open() as f: + soup = bs4.BeautifulSoup(f, features="lxml") + data = {} + + contenu_et_objectifs_selector = ( + ".container > .row:nth-child(2) > div:nth-child(1) > .row:nth-child(1)" + ) + public_attendu_selector = ( + ".container > .row:nth-child(2) > div:nth-child(1) > .row:nth-child(2)" + ) + inscription_selector = ( + ".container > .row:nth-child(2) > div:nth-child(2) > .row:nth-child(1)" + ) + informations_pratiques_selector = ( + ".container > .row:nth-child(2) > div:nth-child(2) > .row:nth-child(3)" + ) + + NODE_BY_CONTENT_NAME = { + "date_maj": soup.select_one(".entete").find( + string=lambda text: "Date de la dernière modification :" in text + ), + "contenu_et_objectifs__titre": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(2)" + ), + "contenu_et_objectifs__objectifs": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(3)" + ), + 
"contenu_et_objectifs__niveau": soup.select_one( + f"{contenu_et_objectifs_selector} > div:nth-child(4)" + ), + "public_attendu__niveau": soup.select_one( + f"{public_attendu_selector} > div:nth-child(2)" + ), + "public_attendu__competences": soup.select_one( + f"{public_attendu_selector} > div:nth-child(3)" + ), + "public_attendu__type_de_public": soup.select_one( + f"{public_attendu_selector} > div:nth-child(4)" + ), + "inscription__informations_en_ligne": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Informations en ligne" in text + ) + ) + ), + "inscription__places": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Places disponibles" in text + ) + ) + ), + "inscription__entree_sortie": get_parent( + get_parent( + soup.select_one(inscription_selector).find( + string=lambda text: "Entrée / sortie permanente" in text + ) + ) + ), + "contact_inscription__adresse": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-home")) + ), + "contact_inscription__contact": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-user")) + ), + "contact_inscription__telephone": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-phone")) + ), + "contact_inscription__courriel": get_parent( + get_parent(soup.select_one("#formation-inscription .fa-inbox")) + ), + "informations_pratiques__etendue": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Étendue de la formation" in text + ) + ) + ), + "informations_pratiques__volume": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Volume horaire" in text + ) + ) + ), + "informations_pratiques__cout": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: ( + "Adhésion annuelle à la structure obligatoire" + ) + in text + ) + ) + ), + 
"informations_pratiques__prise_en_charge": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Coût d'inscription à la formation" in text + ) + ) + ), + "informations_pratiques__remuneration": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Rémunération" in text + ) + ) + ), + "informations_pratiques__garde": get_parent( + get_parent( + soup.select_one(informations_pratiques_selector).find( + string=lambda text: "Garde d'enfant" in text + ) + ) + ), + "lieux_et_horaires_formation__adresse": soup.select_one( + "#lieux-formation .lieu-formation .adresse" + ), + "lieux_et_horaires_formation__horaires": "\n".join( + soup.select_one("#lieux-formation").find_all( + string=lambda text: "de" in text and "à" in text + ) + ), + } + + for content_name, node in NODE_BY_CONTENT_NAME.items(): + data[f"content__{content_name}"] = html_to_markdown(node) + + return data + + +def html_to_markdown(s) -> Optional[str]: + if s is None or s == "": + return s + return trafilatura.extract(trafilatura.load_html("<html>" + str(s) + "</html>")) + + +def read_structures(path: Path) -> pd.DataFrame: + with tarfile.open(path, "r:gz") as tar: + tar.extractall(path=path.parent) + + with (path.parent / "metadata.json").open() as f: + df = pd.DataFrame.from_records(json.load(f)["structures"]) + + df = df.join( + df.apply( + lambda row: scrap_structure_html(html_path=path.parent / f"{row.id}.html"), + axis=1, + result_type="expand", + ) + ) + df = df.replace({np.nan: None}) + + return df + + +def read_formations(path: Path) -> pd.DataFrame: + with tarfile.open(path, "r:gz") as tar: + tar.extractall(path=path.parent) + + with (path.parent / "metadata.json").open() as f: + df = pd.json_normalize( + json.load(f)["structures"], + record_path="formations", + meta="id", + meta_prefix="structure_", + max_level=0, + ) + + df = df.join( + df.apply( + lambda row: scrap_formation_html(html_path=path.parent / 
f"{row.id}.html"), + axis=1, + result_type="expand", + ) + ) + df = df.replace({np.nan: None}) + + return df diff --git a/pipeline/src/data_inclusion/scripts/tasks/soliguide.py b/pipeline/src/data_inclusion/scripts/tasks/soliguide.py index af3b2150a..029eb3046 100644 --- a/pipeline/src/data_inclusion/scripts/tasks/soliguide.py +++ b/pipeline/src/data_inclusion/scripts/tasks/soliguide.py @@ -110,7 +110,6 @@ def html_to_markdown(s: Optional[str]) -> Optional[str]: def read(path: Path) -> pd.DataFrame: - # utils.read_json is enough # but this adds the conversion of descriptions from html to markdown # should eventually be implemented as a python dbt model diff --git a/siretisation/django/annotation/migrations/0001_initial.py b/siretisation/django/annotation/migrations/0001_initial.py index a020dbc5f..da887ce09 100644 --- a/siretisation/django/annotation/migrations/0001_initial.py +++ b/siretisation/django/annotation/migrations/0001_initial.py @@ -7,7 +7,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [ diff --git a/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py b/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py index 789ca3c0f..8ddad4348 100644 --- a/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py +++ b/siretisation/django/annotation/migrations/0002_annotation_closed_annotation_irrelevant_and_more.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0001_initial"), ] diff --git a/siretisation/django/annotation/migrations/0003_annotation_created_by.py b/siretisation/django/annotation/migrations/0003_annotation_created_by.py index 54e8ae34f..eeca94094 100644 --- a/siretisation/django/annotation/migrations/0003_annotation_created_by.py +++ b/siretisation/django/annotation/migrations/0003_annotation_created_by.py @@ -6,7 +6,6 @@ class 
Migration(migrations.Migration): - dependencies = [ migrations.swappable_dependency(settings.AUTH_USER_MODEL), ("annotation", "0002_annotation_closed_annotation_irrelevant_and_more"), diff --git a/siretisation/django/annotation/migrations/0004_dataset_slug.py b/siretisation/django/annotation/migrations/0004_dataset_slug.py index 488d5716f..5780835c4 100644 --- a/siretisation/django/annotation/migrations/0004_dataset_slug.py +++ b/siretisation/django/annotation/migrations/0004_dataset_slug.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0003_annotation_created_by"), ] diff --git a/siretisation/django/annotation/migrations/0005_enable_unaccent.py b/siretisation/django/annotation/migrations/0005_enable_unaccent.py index 5072f937c..cb8772e04 100644 --- a/siretisation/django/annotation/migrations/0005_enable_unaccent.py +++ b/siretisation/django/annotation/migrations/0005_enable_unaccent.py @@ -3,7 +3,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0004_dataset_slug"), ] diff --git a/siretisation/django/annotation/migrations/0006_ds_priority_settings.py b/siretisation/django/annotation/migrations/0006_ds_priority_settings.py index 2e5cb16c4..6fffcfdcc 100644 --- a/siretisation/django/annotation/migrations/0006_ds_priority_settings.py +++ b/siretisation/django/annotation/migrations/0006_ds_priority_settings.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0005_enable_unaccent"), ] diff --git a/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py b/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py index f1daf4b76..bcd03efd6 100644 --- a/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py +++ b/siretisation/django/annotation/migrations/0007_datasetrow_similar_address.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", 
"0006_ds_priority_settings"), ] diff --git a/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py b/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py index d67faa837..85e03f3e2 100644 --- a/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py +++ b/siretisation/django/annotation/migrations/0008_dataset_show_nearby_cnfs_permanences_and_more.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0007_datasetrow_similar_address"), ] diff --git a/siretisation/django/annotation/migrations/0009_source.py b/siretisation/django/annotation/migrations/0009_source.py index 77d03fee3..4e932fba3 100644 --- a/siretisation/django/annotation/migrations/0009_source.py +++ b/siretisation/django/annotation/migrations/0009_source.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("annotation", "0008_dataset_show_nearby_cnfs_permanences_and_more"), ] diff --git a/siretisation/django/annotation/migrations/0010_data_from_warehouse.py b/siretisation/django/annotation/migrations/0010_data_from_warehouse.py index 9494f83e7..29e135c3b 100644 --- a/siretisation/django/annotation/migrations/0010_data_from_warehouse.py +++ b/siretisation/django/annotation/migrations/0010_data_from_warehouse.py @@ -15,7 +15,6 @@ def migrate_data(apps, _) -> None: class Migration(migrations.Migration): - dependencies = [ ("annotation", "0009_source"), ] diff --git a/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py b/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py index 5aaf4a798..966cbb631 100644 --- a/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py +++ b/siretisation/django/cnfs/migrations/0001_add_cnfs_permanences.py @@ -7,7 +7,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [] diff --git 
a/siretisation/django/matching/migrations/0001_initial.py b/siretisation/django/matching/migrations/0001_initial.py index e44910bf2..ba338a80b 100644 --- a/siretisation/django/matching/migrations/0001_initial.py +++ b/siretisation/django/matching/migrations/0001_initial.py @@ -1,14 +1,14 @@ # Generated by Django 4.1.3 on 2023-01-31 18:34 -from django.conf import settings +import uuid + import django.contrib.postgres.fields -from django.db import migrations, models import django.db.models.deletion -import uuid +from django.conf import settings +from django.db import migrations, models class Migration(migrations.Migration): - initial = True dependencies = [ diff --git a/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py b/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py index 3f939c957..cded5f515 100644 --- a/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py +++ b/siretisation/django/matching/migrations/0002_add_no_matching_row_flag.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("matching", "0001_initial"), ] diff --git a/siretisation/django/sirene/migrations/0001_initial.py b/siretisation/django/sirene/migrations/0001_initial.py index 1e073dfe7..5364f1928 100644 --- a/siretisation/django/sirene/migrations/0001_initial.py +++ b/siretisation/django/sirene/migrations/0001_initial.py @@ -6,7 +6,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [] diff --git a/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py b/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py index 99d39e699..24d5ceb6c 100644 --- a/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py +++ b/siretisation/django/sirene/migrations/0002_remove_establishment_full_text_trgm_idx_and_more.py @@ -5,7 +5,6 @@ class Migration(migrations.Migration): - 
dependencies = [ ("sirene", "0001_initial"), ] diff --git a/siretisation/django/sirene/migrations/0003_add_postgis.py b/siretisation/django/sirene/migrations/0003_add_postgis.py index 4efa86317..b699fb484 100644 --- a/siretisation/django/sirene/migrations/0003_add_postgis.py +++ b/siretisation/django/sirene/migrations/0003_add_postgis.py @@ -3,7 +3,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0002_remove_establishment_full_text_trgm_idx_and_more"), ] diff --git a/siretisation/django/sirene/migrations/0004_add_codenaf_table.py b/siretisation/django/sirene/migrations/0004_add_codenaf_table.py index 5e945f03d..8273e5c38 100644 --- a/siretisation/django/sirene/migrations/0004_add_codenaf_table.py +++ b/siretisation/django/sirene/migrations/0004_add_codenaf_table.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0003_add_postgis"), ] diff --git a/siretisation/django/sirene/migrations/0005_import_codenaf.py b/siretisation/django/sirene/migrations/0005_import_codenaf.py index 61c8a5c5c..6d1b2353c 100644 --- a/siretisation/django/sirene/migrations/0005_import_codenaf.py +++ b/siretisation/django/sirene/migrations/0005_import_codenaf.py @@ -78,7 +78,6 @@ def import_naf(apps, _) -> None: class Migration(migrations.Migration): - dependencies = [ ("sirene", "0004_add_codenaf_table"), ] diff --git a/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py b/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py index e80dcd812..33b6f3286 100644 --- a/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py +++ b/siretisation/django/sirene/migrations/0006_alter_establishment_ape.py @@ -6,7 +6,6 @@ class Migration(migrations.Migration): - dependencies = [ ("sirene", "0005_import_codenaf"), ] diff --git a/siretisation/django/users/migrations/0001_initial.py b/siretisation/django/users/migrations/0001_initial.py index dfdf49fcb..906102260 100644 --- 
a/siretisation/django/users/migrations/0001_initial.py +++ b/siretisation/django/users/migrations/0001_initial.py @@ -5,7 +5,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [ @@ -41,7 +40,7 @@ class Migration(migrations.Migration): "is_active", models.BooleanField( default=True, - help_text="Designates whether this user should be treated as active. Unselect this instead of deleting accounts.", + help_text="Designates whether this user should be treated as active. Unselect this instead of deleting accounts.", # noqa: E501 verbose_name="active", ), ), @@ -51,7 +50,7 @@ class Migration(migrations.Migration): "groups", models.ManyToManyField( blank=True, - help_text="The groups this user belongs to. A user will get all permissions granted to each of their groups.", + help_text="The groups this user belongs to. A user will get all permissions granted to each of their groups.", # noqa: E501 related_name="user_set", related_query_name="user", to="auth.group",