diff --git a/analyse/.template.env b/analyse/.template.env deleted file mode 100644 index bdd2044a..00000000 --- a/analyse/.template.env +++ /dev/null @@ -1,12 +0,0 @@ -SIRENE_DATABASE_URL= -SOLIGUIDE_API_TOKEN= -MES_AIDES_API_KEY= -ITOU_API_TOKEN= -ODSPEP_FILE_URL= -SIAO_FILE_URL= -FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c894fcfdfd45 -CD72_FILE_URL= -CD93_FILE_URL= -CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/665776ae-fa25-46ab-9bfd-c4241866f03f/download/annuaire_sociale_fixe.csv -CD62_FILE_URL= -RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 \ No newline at end of file diff --git a/analyse/Makefile b/analyse/Makefile new file mode 100644 index 00000000..9be63f10 --- /dev/null +++ b/analyse/Makefile @@ -0,0 +1,12 @@ +PIP_COMPILE := pipx run uv pip compile pyproject.toml --quiet + +ifeq ($(filter upgrade,$(MAKECMDGOALS)),upgrade) +PIP_COMPILE += --upgrade +endif + +.PHONY: all base uv upgrade + +all: base + +base: + $(PIP_COMPILE) --output-file=requirements/requirements.txt diff --git a/analyse/README.md b/analyse/README.md index 3e3f68a7..1de30782 100644 --- a/analyse/README.md +++ b/analyse/README.md @@ -1,35 +1,14 @@ -# `data-inclusion-analyse` - -Analyses des jeux de données des partenaires de data.inclusion - -## Analyses - -| Partenaire | Jeu de données | Date | Notebook | -| --------------- | ------------------------ | ---------- | ------------------------------------------------------------ | -| 1jeune1solution | benefits | (api) | [Notebook](./notebooks/1j1s/benefits.ipynb) | -| cd35 | annuaire social | (api) | [Notebook](./notebooks/cd35/annuaire_social.ipynb) | -| cd62 | wikisol 62 | 24/05/2022 | [Notebook](./notebooks/cd62/analyse-cd62.ipynb) | -| odspep | ressources partenariales | 14/06/2022 | [Notebook](./notebooks/odspep/analyse.ipynb) | -| siao | base siao | 26/07/2022 | [Notebook](./notebooks/siao/analyse.ipynb) | -| soliguide | lieux et services | (api) | [Notebook](./notebooks/soliguide/analyse.ipynb) | -| Mes Aides | garages solidaires | (api) | [Notebook](./notebooks/garages_solidaires/analyse-gs.ipynb) | -| FINESS | finess | (api) | [Notebook](./notebooks/finess/analyse.ipynb) | -| Etab. Publics | etablissements publics | (api) | [Notebook](./notebooks/etablissements-publics/Analyse.ipynb) | -| cd93 | organismes de formation | 01/10/2022 | [Notebook](./notebooks/cd93/analyse.ipynb) | +# `analyse` ## Contribuer ```bash -# Create a new virtualenv in the project's root directory -python3 -m venv .venv --prompt analyse - -# Activate the environment +# init virtual env +python -m venv .venv source .venv/bin/activate pip install -U pip setuptools wheel +pip install -r requirements/requirements.txt -# Install dependencies -pip install -r requirements.txt - -# Setup hook to clean notebook outputs +# setup hook to clean notebook outputs git config --local include.path ../analyse/.gitconfig ``` diff --git a/analyse/notebooks/1j1s/benefits.ipynb b/analyse/notebooks/1j1s/benefits.ipynb deleted file mode 100644 index 9b9a9016..00000000 --- a/analyse/notebooks/1j1s/benefits.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "40256f98", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import requests\n", - "from typing import Dict, List" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "124231e1", - "metadata": {}, - "outputs": [], - "source": [ - "class APIClient:\n", - " def __init__(self, base_url: str):\n", - " self.base_url = base_url\n", - "\n", - " def list_benefits(self) -> List[Dict]:\n", - " return requests.get(self.base_url + \"/benefits\").json()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae9c9806", - "metadata": {}, - "outputs": [], - "source": [ - "data = APIClient(base_url=\"https://mes-aides.1jeune1solution.beta.gouv.fr/api/\").list_benefits()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7cabeb76", - "metadata": {}, - "outputs": [], - "source": [ - "benefits_df = pd.DataFrame.from_dict(data)\n", - "benefits_df = benefits_df.set_index(\"id\")\n", - "benefits_df = benefits_df.replace(\"\", None)\n", - "institutions_df = pd.json_normalize(benefits_df.institution, sep=\"_\")\n", - "institutions_df = institutions_df.replace(\"\", None)\n", - "institutions_df = institutions_df.drop_duplicates(subset=[\"id\"])\n", - "institutions_df = institutions_df.set_index(\"id\")" - ] - }, - { - "cell_type": "markdown", - "id": "8f2f4003", - "metadata": {}, - "source": [ - "### Quantités de données" - ] - }, - { - "cell_type": "markdown", - "id": "5a0ee036", - "metadata": {}, - "source": [ - "#### Nombre de benefits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86ad3862", - "metadata": {}, - "outputs": [], - "source": [ - "benefits_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "id": "1a54eaae", - "metadata": {}, - "source": [ - "#### Nombre d'institutions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "baac972f", - "metadata": {}, - "outputs": [], - "source": [ - "institutions_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "id": "180a58f1", - "metadata": {}, - "source": [ - "### Institutions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c8924d5", - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)" - ] - }, - { - "cell_type": "markdown", - "id": "523cb1bb", - "metadata": {}, - "source": [ - "#### Institutions - Taux de remplissage des champs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "385d1932", - "metadata": {}, - "outputs": [], - "source": [ - "compute_field_occupancy_rates(institutions_df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "a5b8a8e7", - "metadata": {}, - "source": [ - "#### Institutions - Types de structures\n", - "\n", - "* le champs `type` doit être utilisé pour mapper le type de structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a6f6e33", - "metadata": {}, - "outputs": [], - "source": [ - "institutions_df.type.value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "262c6887", - "metadata": {}, - "source": [ - "### Benefits" - ] - }, - { - "cell_type": "markdown", - "id": "620a7728", - "metadata": {}, - "source": [ - "#### Benefits - Taux de remplissage des champs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57e8b41c", - "metadata": {}, - "outputs": [], - "source": [ - "compute_field_occupancy_rates(benefits_df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "4d1661ba", - "metadata": {}, - "source": [ - "#### Benefits - Profils" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f9722fd", - "metadata": {}, - "outputs": [], - "source": [ - "benefits_df.profils.value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "da4b954b", - "metadata": {}, - "source": [ - "#### Benefits - Conditions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9acf9b1b", - "metadata": {}, - "outputs": [], - "source": [ - "benefits_df.conditions.value_counts().to_frame()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/agefiph/api-services.ipynb b/analyse/notebooks/agefiph/api-services.ipynb deleted file mode 100644 index e7d43b09..00000000 --- a/analyse/notebooks/agefiph/api-services.ipynb +++ /dev/null @@ -1,276 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "45495455", - "metadata": {}, - "source": [ - "Le but de ce notebook est de générer des structures et services pour l'offre de service de l'agefiph.\n", - "\n", - "Ce notebook prend 2 sources de données :\n", - "\n", - "* une liste de strucures issu d'un tableau grist maintenu par data.inclusion,\n", - "* une liste de services issu de l'api de l'agefiph.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "517ed275", - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import numpy as np\n", - "import pandas as pd\n", - "import trafilatura\n", - "import hashlib\n", - "from uuid import UUID\n", - "\n", - "\n", - "def html_to_markdown(s: str):\n", - " if s is None or s == \"\":\n", - " return s\n", - " return trafilatura.extract(trafilatura.load_html(\"\" + s + \"\"))\n", - "\n", - "\n", - "# https://grist.incubateur.net/o/datainclusion/bWqnEafQaLgc/Partage-de-donnes-AGEFIPH-Mars-Avril-2023/p/4\n", - "STRUCTURES_TABLE_URL = \"https://grist.incubateur.net/o/datainclusion/api/docs/bWqnEafQaLgcTvFv7rv6hF/download/csv?tableId=Structures\"\n", - "SERVICES_API_URL = \"https://www.agefiph.fr/jsonapi/node/aide_service\"\n", - "\n", - "HEADERS = {\"User-Agent\": \"data-inclusion\"}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d20e16b", - "metadata": {}, - "outputs": [], - "source": [ - "raw_structures_df = pd.read_csv(STRUCTURES_TABLE_URL, dtype=str).replace([np.nan, \"\"], None)\n", - "raw_services_df = pd.json_normalize(\n", - " requests.get(SERVICES_API_URL, headers=HEADERS).json()[\"data\"]\n", - ").replace([np.nan, \"\"], None)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15a0ca22", - "metadata": {}, - "outputs": [], - "source": [ - "raw_structures_df = raw_structures_df\n", - "\n", - "raw_structures_df.info()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7bd7e018", - "metadata": {}, - "outputs": [], - "source": [ - "raw_services_df = raw_services_df[\n", - " [\n", - " \"id\",\n", - " \"attributes.created\",\n", - " \"attributes.changed\",\n", - " \"attributes.title\",\n", - " \"attributes.field_titre_card_employeur\",\n", - " \"attributes.field_essentiel_ph.processed\",\n", - " \"attributes.field_essentiel_employeur.processed\",\n", - " \"attributes.field_texte_brut_long\",\n", - " \"relationships.field_thematique.data\",\n", - " ]\n", - "]\n", - "\n", - "raw_services_df.info()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5ad9896", - "metadata": {}, - "outputs": [], - "source": [ - "pd.json_normalize(\n", - " raw_services_df.rename(columns={\"id\": \"service_id\"}).to_dict(orient=\"records\"),\n", - " record_path=\"relationships.field_thematique.data\",\n", - " meta=\"service_id\",\n", - ").info()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2ad9881", - "metadata": {}, - "outputs": [], - "source": [ - "DI_THEMATIQUES_BY_AGEFIPH_THEMATIQUE_ID = {\n", - " # Source https://grist.incubateur.net/o/datainclusion/uVsB8pabQGoe/Thmatiques/p/13\n", - " \"4e08047f-b0ed-431a-9182-61e8e61b1486\": \"handicap--favoriser-le-retour-et-le-maintien-dans-lemploi\",\n", - " \"11618ce3-e59b-404f-8eb2-5763215464f2\": \"handicap--favoriser-le-retour-et-le-maintien-dans-lemploi\",\n", - " \"60c25ci7-61sc-89a9-ny54-126hslf808a2\": \"handicap--connaissance-des-droits-des-travailleurs\",\n", - " \"51be0003-13d8-4ffa-9923-248e7aa4a227\": None,\n", - " \"ddf0fa87-2ee0-481c-a258-96985b7826c3\": None,\n", - " \"cb2c9fec-c190-4e2f-aeee-6da818109bf8\": \"handicap--favoriser-le-retour-et-le-maintien-dans-lemploi\",\n", - " \"78b28acb-803e-4b06-ab77-58dabfbd8571\": \"handicap--adaptation-au-poste-de-travail\",\n", - " \"366eb399-1e6c-4609-8066-d1504fae2a8e\": None,\n", - " \"907a8c33-5c56-49d3-bd64-a736a9ceac76\": None,\n", - " \"5d8c88d8-db03-4f27-b517-d7016896b01a\": None,\n", - " \"fb5e6180-290b-4216-ba68-624d25defa3a\": \"handicap--favoriser-le-retour-et-le-maintien-dans-lemploi\",\n", - " \"03228d62-2a59-49d8-8443-b25cb2e684b9\": \"accompagnement-social-et-professionnel-personnalise--definition-du-projet-professionnel\",\n", - " \"f9ab3e06-af51-463a-aaf7-7b04a28e047f\": \"se-former--trouver-sa-formation\",\n", - " \"aeab1d68-4e89-4e2a-a612-d8645e3999d8\": \"creation-activite--definir-son-projet-de-creation-dentreprise\",\n", - " \"f4551558-8315-4708-8357-5ecc89751bc6\": \"handicap--faire-reconnaitre-un-handicap\",\n", - " \"4b8b0473-52c2-4a21-956d-d7d68a7053b5\": None,\n", - "}\n", - "\n", - "\n", - "def map_service(row) -> dict:\n", - " service = {}\n", - " service[\"id\"] = row[\"id\"]\n", - " service[\"date_creation\"] = row[\"attributes.created\"]\n", - " service[\"date_maj\"] = row[\"attributes.changed\"]\n", - " service[\"nom\"] = row[\"attributes.title\"]\n", - " service[\"contact_public\"] = True\n", - " service[\"presentation_resume\"] = row[\"attributes.field_titre_card_employeur\"]\n", - "\n", - " service[\"presentation_detail\"] = \"\"\n", - " if row[\"attributes.field_essentiel_ph.processed\"] is not None:\n", - " service[\"presentation_detail\"] += (\n", - " \"

Pour la personne handicapée :

\"\n", - " + row[\"attributes.field_essentiel_ph.processed\"]\n", - " )\n", - " if row[\"attributes.field_essentiel_employeur.processed\"] is not None:\n", - " service[\"presentation_detail\"] += (\n", - " \"

Pour l'employeur :

\"\n", - " + row[\"attributes.field_essentiel_employeur.processed\"]\n", - " )\n", - " if row[\"attributes.field_texte_brut_long\"] is not None:\n", - " service[\"presentation_detail\"] = (\n", - " row[\"attributes.field_texte_brut_long\"] + service[\"presentation_detail\"]\n", - " )\n", - " service[\"presentation_detail\"] = html_to_markdown(service[\"presentation_detail\"])\n", - " service[\"presentation_detail\"] = service[\"presentation_detail\"] or None\n", - "\n", - " service[\"thematiques\"] = list(\n", - " set(\n", - " [\n", - " v\n", - " for v in [\n", - " DI_THEMATIQUES_BY_AGEFIPH_THEMATIQUE_ID[\n", - " agefiph_thematique_data[\"id\"]\n", - " ]\n", - " for agefiph_thematique_data in row[\n", - " \"relationships.field_thematique.data\"\n", - " ]\n", - " ]\n", - " if v is not None\n", - " ]\n", - " )\n", - " )\n", - "\n", - " return service\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f98dd4d", - "metadata": {}, - "outputs": [], - "source": [ - "template_services_df = raw_services_df.apply(map_service, axis=1, result_type=\"expand\")\n", - "\n", - "template_services_df.info()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0680158", - "metadata": {}, - "outputs": [], - "source": [ - "# Cartesian products\n", - "services_df = (\n", - " raw_structures_df[\n", - " [\n", - " \"id\",\n", - " \"courriel\",\n", - " \"telephone\",\n", - " \"adresse\",\n", - " \"commune\",\n", - " \"code_postal\",\n", - " \"code_insee\",\n", - " ]\n", - " ]\n", - " .rename(columns={\"id\": \"structure_id\"})\n", - " .join(template_services_df, how=\"cross\")\n", - ")\n", - "\n", - "# Making service id unique across all regions\n", - "services_df = services_df.assign(\n", - " id=services_df.apply(\n", - " lambda row: str(\n", - " UUID(\n", - " hex=hashlib.md5((row[\"structure_id\"] + row[\"id\"]).encode()).hexdigest()\n", - " )\n", - " ),\n", - " axis=1,\n", - " )\n", - ")\n", - "\n", - "services_df.info()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecfc54c4", - "metadata": {}, - "outputs": [], - "source": [ - "services_df.to_json(\"services.json\", orient=\"records\", force_ascii=False)\n", - "raw_structures_df.to_json(\"structures.json\", orient=\"records\", force_ascii=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85c609c1", - "metadata": {}, - "outputs": [], - "source": [ - "services_df\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/cd35/annuaire_social.ipynb b/analyse/notebooks/cd35/annuaire_social.ipynb deleted file mode 100644 index 3a30ade0..00000000 --- a/analyse/notebooks/cd35/annuaire_social.ipynb +++ /dev/null @@ -1,241 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "4cd03fc3", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import dotenv\n", - "import numpy as np\n", - "import pandas as pd\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7211ba2b", - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebf55fb8", - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())" - ] - }, - { - "cell_type": "markdown", - "id": "769a1643", - "metadata": {}, - "source": [ - "## état du fichier\n", - "\n", - "* des mauvaises lignes\n", - "* utf-8 mais des éléments non reconnus\n", - "* orienté structures\n", - "* des institutions publiques à priori\n", - "* pas de SIRET, de SIREN ou de code_insee\n", - "* contact plutôt bien renseigné" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa2a2769", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\n", - " os.environ[\"CD35_FILE_URL\"],\n", - " sep=\";\",\n", - " encoding_errors=\"replace\",\n", - " on_bad_lines=\"warn\",\n", - " dtype=str,\n", - ")\n", - "df = df.replace([\"\", np.nan], None)\n", - "df.sample(5)\n" - ] - }, - { - "cell_type": "markdown", - "id": "280f381d", - "metadata": {}, - "source": [ - "#### Nombre de lignes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08a6c77d", - "metadata": {}, - "outputs": [], - "source": [ - "df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "id": "ee752efe", - "metadata": {}, - "source": [ - "#### Taux de remplissage des champs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4da1947c", - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99634222", - "metadata": {}, - "outputs": [], - "source": [ - "# taux de remplissage des champs\n", - "compute_field_occupancy_rates(df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "3f0423a8", - "metadata": {}, - "source": [ - "## typologie de structure\n", - "\n", - "via `ORG_SIGLE`\n", - "\n", - "problèmes:\n", - "* présent à 69%\n", - "* beaucoup de valeurs cependant." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28a9ac03", - "metadata": {}, - "outputs": [], - "source": [ - "# les 20 sigles les + fréquents\n", - "df.ORG_SIGLE.value_counts(dropna=False)[:20].to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4fe2559c", - "metadata": {}, - "outputs": [], - "source": [ - "df.ORG_DESC.str.len().median()" - ] - }, - { - "cell_type": "markdown", - "id": "821f0df5", - "metadata": {}, - "source": [ - "### Départements" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6bd60658", - "metadata": {}, - "outputs": [], - "source": [ - "df.ORG_CP.map(lambda x: str(int(x))[:2] if not pd.isna(x) else x).value_counts(dropna=False).to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc983a13", - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df = df.apply(\n", - " lambda row: {\n", - " \"pmi\": (\"maternelle\" in row.ORG_NOM.lower() and \"infantile\" in row.ORG_NOM.lower()) or \"pmi\" in row.ORG_NOM.lower(),\n", - " \"ccas\": pd.notnull(row.ORG_SIGLE) and \"CCAS\" == row.ORG_SIGLE,\n", - " \"saad\": pd.notnull(row.ORG_SIGLE) and \"SAAD\" == row.ORG_SIGLE,\n", - " \"ehpad\": pd.notnull(row.ORG_SIGLE) and \"EHPAD\" == row.ORG_SIGLE,\n", - " \"epiceries\": \"épiceries\" in row.ORG_NOM.lower(),\n", - " \"admr\": \"admr\" in row.ORG_NOM.lower(),\n", - " \"cmp\": \"cmpp\" in row.ORG_NOM.lower() or (\"centre\" in row.ORG_NOM.lower() and \"psycho\" in row.ORG_NOM.lower()),\n", - " \"espace_jeux\": \"espace\" in row.ORG_NOM.lower() and \"jeu\" in row.ORG_NOM.lower(),\n", - " \"halte_garderie\": \"halte-garderie\" in row.ORG_NOM.lower(),\n", - " \"mairie\": \"mairie\" in row.ORG_NOM.lower(),\n", - " \"pae\": \"pae\" in row.ORG_NOM.lower(),\n", - " \"crèche\": \"crèche\" in row.ORG_NOM.lower(),\n", - " \"sessad\": \"sessad\" in row.ORG_NOM.lower(),\n", - " },\n", - " axis=\"columns\",\n", - " result_type=\"expand\",\n", - ")\n", - "\n", - "df[~categories_flags_structures_df.any(axis=\"columns\")].sort_values(\"ORG_NOM\").sample(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c7b869e", - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df.sum().sort_values(ascending=False).plot(\n", - " kind=\"bar\", grid=True, rot=35, figsize=(20, 8)\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4 (main, Apr 24 2022, 15:44:04) [GCC 11.2.0]" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/cd62/analyse-cd62.ipynb b/analyse/notebooks/cd62/analyse-cd62.ipynb deleted file mode 100644 index 89c5e04f..00000000 --- a/analyse/notebooks/cd62/analyse-cd62.ipynb +++ /dev/null @@ -1,86 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "b0d545c5", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import dotenv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9dabce51", - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None\n", - "plt.rc(\"figure\", figsize=[12, 8])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84ce3602", - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "\n", - "df = pd.read_csv(\n", - " os.environ[\"CD62_FILE_URL\"],\n", - " sep=\";\",\n", - " dtype=str,\n", - " encoding=\"latin\",\n", - ")\n", - "\n", - "df = df.replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d7099dd", - "metadata": {}, - "outputs": [], - "source": [ - "df.sample(5)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/cd72/analyse.ipynb b/analyse/notebooks/cd72/analyse.ipynb deleted file mode 100644 index 51ff9c71..00000000 --- a/analyse/notebooks/cd72/analyse.ipynb +++ /dev/null @@ -1,650 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import dotenv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None\n", - "plt.rc(\"figure\", figsize=[12, 8])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "\n", - "df = pd.read_excel(\n", - " os.environ[\"CD72_FILE_URL\"], sheet_name=\"Structures\", dtype=str\n", - ").replace([np.nan, \"\"], None)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.sample(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"ID Structure\"].isna()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"ID Structure\"].nunique()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### siret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"SIRET\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"SIRET\"].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"SIRET\"].sample(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"SIRET\"].value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"SIRET\"].duplicated(keep=False) & df[\"SIRET\"].notna()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Nom Structure\"].sample(20).to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Nom Structure\"].duplicated().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Nom Structure\"].isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### commune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Ville\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Ville\"].map(lambda s: \"cedex\" in s.lower() if s is not None else False)][\"Ville\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### code_postal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Code postal\"].value_counts().head(10).to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Code postal\"].sample(20).to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Code postal\"].isna()][[\"Adresse\", \"Code postal\", \"Ville\", \"Nom Structure\"]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Quelques structures sans données adresses" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### code_insee" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "à géocoder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Adresse\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Adresse\"].sample(20).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### complement_adresse" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### longitude" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### latitude" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "absents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Type de structure\"].value_counts().to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Typologie structure\"].value_counts().to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Typologie structure\"].isna()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.schema import models\n", - "\n", - "di_typologie_by_input_typologie = {\n", - " \"Associations\": models.Typologie.ASSO,\n", - " \"Autre\": models.Typologie.Autre,\n", - " \"Organisme de formations\": models.Typologie.OF,\n", - " \"Structures porteuses d’ateliers et chantiers d’insertion (ACI)\": models.Typologie.ACI,\n", - " \"Pôle emploi\": models.Typologie.PE,\n", - " \"Centre social\": models.Typologie.CS,\n", - " \"Centres communaux d’action sociale (CCAS)\": models.Typologie.CCAS,\n", - " \"Communautés de Commune\": models.Typologie.CC,\n", - " \"Groupements d'employeurs pour l'insertion et la qualification (GEIQ)\": models.Typologie.GEIQ,\n", - " \"Municipalités\": models.Typologie.MUNI,\n", - " \"Entreprise d'insertion (EI)\": models.Typologie.EI,\n", - " \"Associations intermédiaires (AI)\": models.Typologie.AI,\n", - " \"Maison de quartier\": models.Typologie.MQ,\n", - " \"Mission Locale\": models.Typologie.ML,\n", - " \"Maison des jeunes et de la culture\": models.Typologie.MJC,\n", - " \"Résidence sociale / FJT - Foyer de Jeunes Travailleurs\": models.Typologie.RS_FJT,\n", - " \"Entreprise de travail temporaire d'insertion (ETTI)\": models.Typologie.ETTI,\n", - " \"Points et bureaux information jeunesse (PIJ/BIJ)\": models.Typologie.PIJ_BIJ,\n", - " \"Chambres consulaires (CCI, CMA, CA)\": models.Typologie.Autre,\n", - " \"Directions de l’Economie, de l’Emploi, du Travail et des Solidarités (DEETS)\": models.Typologie.DEETS,\n", - " \"Plans locaux pour l’insertion et l’emploi (PLIE)\": models.Typologie.PLIE,\n", - " \"Bibliothèque / Médiathèque\": models.Typologie.BIB,\n", - " \"Centres d’information sur les droits des femmes et des familles (CIDFF)\": models.Typologie.CIDFF,\n", - " \"Conseils Départementaux (CD)\": models.Typologie.CD,\n", - " \"Caisses d’allocation familiale (CAF)\": models.Typologie.CAF,\n", - " \"Agence nationale pour la formation professionnelle des adultes (AFPA)\": models.Typologie.AFPA,\n", - " \"Préfecture, Sous-Préfecture\": models.Typologie.PREF,\n", - " \"Région\": models.Typologie.REG,\n", - " \"Services pénitentiaires d’insertion et de probation (SPIP)\": models.Typologie.SPIP,\n", - " \"Union Départementale d’Aide aux Familles (UDAF)\": models.Typologie.UDAF,\n", - " None: models.Typologie.Autre,\n", - " }\n", - "\n", - "df[\"Typologie structure\"].map(di_typologie_by_input_typologie).value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### telephone" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Téléphone accueil\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Téléphone accueil\"].isna() & df[\"Téléphone principal\"].notna()][[\"Téléphone principal\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[[\"Téléphone accueil\", \"Téléphone principal\"]].sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Prendre en prio l'accueil puis le principal" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### courriel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"E-mail accueil\"].notna()][\"E-mail accueil\"].sample(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"E-mail accueil\"].isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### site_web" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Site Internet\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Site Internet\"].notna()][\"Site Internet\"].sample(10).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Description\"].notna()][\"Description\"].map(lambda s: len(s) > 280).sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### presentation_detail" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### date_maj" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Mis à jour le :\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dateutil.parser import parse\n", - "\n", - "df[\"Mis à jour le :\"].map(lambda s: parse(s) if s is not None else None).hist(bins=100)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### structure_parente" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### lien_source" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### horaires_ouverture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Horaires\"].notna()][\"Horaires\"].to_frame().sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### accessibilite" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### labels_nationaux" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### labels_autres" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### thematiques" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "SIRETs dupliqués ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"SIRET\"].value_counts().head(10).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "taux de remplissage des champs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "compute_field_occupancy_rates(df).to_frame()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4 (main, Apr 24 2022, 15:44:04) [GCC 11.2.0]" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/cd93/analyse.ipynb b/analyse/notebooks/cd93/analyse.ipynb deleted file mode 100644 index 397ce0fe..00000000 --- a/analyse/notebooks/cd93/analyse.ipynb +++ /dev/null @@ -1,754 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import dotenv\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None\n", - "plt.rc(\"figure\", figsize=[12, 8])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df = pd.read_csv(\n", - " os.environ[\"CD93_FILE_URL\"],\n", - " sep=\";\",\n", - " encoding=\"latin1\",\n", - " encoding_errors=\"replace\",\n", - " on_bad_lines=\"warn\",\n", - " dtype=str,\n", - ")\n", - "raw_df = raw_df.replace([np.nan, \"\"], None)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.sample(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "compute_field_occupancy_rates(raw_df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Structures" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Numéro de convention\"].value_counts().to_frame().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Numéro de convention\"].isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* attention : champ libre" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### siret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"N° SIRET\"].to_frame().sample(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"N° SIRET\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"N° SIRET\"].value_counts().to_frame().head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"N° SIRET\"].value_counts().to_frame(name=\"# de duplications\").hist(bins=100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### rna" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Porteur de projet\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Porteur de projet\"].sample(10).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### commune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Commune du lieu de l'action\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Commune du lieu de l'action\"].sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### code_postal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"CP du lieu de l'action\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"CP du lieu de l'action\"].sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### code_insee" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Adresse principale du lieu de l'action\"].sample(10).to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Adresse principale du lieu de l'action\"].isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### complement_adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### longitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### latitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### telephone" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### courriel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Email de la personne à contacter pour le suivi de l'action\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Email de la personne à contacter pour le suivi de l'action\"].sample(10).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### site_web" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Site Web du porteur de projet\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Site Web du porteur de projet\"].sample(10).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[[\"Résumé\", \"Intitulé de l'action\"]].sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### presentation_detail" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### date_maj" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### structure_parente" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### lien_source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### horaires_ouverture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### accessibilite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### labels_nationaux" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### labels_autres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[\"Type d'action\"].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Services" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### prise_rdv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### frais" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### frais_autres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_df = pd.DataFrame()\n", - "output_df = output_df.assign(siret=raw_df[\"N° SIRET\"])\n", - "output_df = output_df.assign(nom=raw_df[\"Porteur de projet\"])\n", - "output_df = output_df.assign(adresse=raw_df[\"Adresse principale du lieu de l'action\"])\n", - "output_df = output_df.assign(code_postal=raw_df[\"CP du lieu de l'action\"])\n", - "output_df = output_df.assign(commune=raw_df[\"Commune du lieu de l'action\"])\n", - "output_df = output_df.assign(\n", - " courriel=raw_df[\"Email de la personne à contacter pour le suivi de l'action\"]\n", - ")\n", - "output_df = output_df.assign(lien_source=raw_df[\"Site Web du porteur de projet\"])\n", - "output_df = output_df.assign(typologie=raw_df[\"Type d'action\"])\n", - "output_df = output_df.assign(presentation_detail=raw_df[\"Résumé\"])\n", - "\n", - "output_df.to_json(\"cd93.json\", orient=\"records\", force_ascii=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4 (main, Apr 24 2022, 15:44:04) [GCC 11.2.0]" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/documentation/sources.ipynb b/analyse/notebooks/documentation/sources.ipynb deleted file mode 100644 index f008f561..00000000 --- a/analyse/notebooks/documentation/sources.ipynb +++ /dev/null @@ -1,93 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "import jinja2\n", - "import numpy as np\n", - "import pandas as pd\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Purpose\n", - "\n", - "This notebook is used to generate documentation about our sources." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download data from grist\n", - "BASE_URL = \"https://grist.incubateur.anct.gouv.fr/o/datainclusion/api/docs/vho3ZujRYH5vnYf2bvqTNA/download/csv\"\n", - "\n", - "sources_df = pd.read_csv(BASE_URL + \"?tableId=Sources\")\n", - "ressources_df = pd.read_csv(BASE_URL + \"?tableId=Ressources\")\n", - "\n", - "DIR = Path(\".\")\n", - "TEMPLATE_PATH = DIR / \"sources.md\"\n", - "OUTPUT_PATH = DIR / \"build\" / \"sources.md\"\n", - "OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - "def process_grist_data(df: pd.DataFrame) -> [dict]:\n", - " df = df.replace([np.nan], '').replace([True],'✅').replace([False],'🚫')\n", - " if 'lien_donnees_producteurs' in df.columns:\n", - " df['lien_donnees_producteurs'] = df['lien_donnees_producteurs'].apply(lambda x: f\"[lien]({x})\" if len(x) > 0 else \"\")\n", - " return df.to_dict(orient=\"records\")\n", - "\n", - "# prepare template context\n", - "source_dict_list = process_grist_data(sources_df)\n", - "ressource_dict_list = process_grist_data(ressources_df)\n", - "for source_dict in source_dict_list:\n", - " source_dict[\"ressources\"] = [\n", - " ressource_dict\n", - " for ressource_dict in ressource_dict_list\n", - " if ressource_dict[\"source\"] == source_dict[\"id\"]\n", - " ]\n", - "\n", - "# render template\n", - "template_loader = jinja2.FileSystemLoader(searchpath=DIR)\n", - "template_environment = jinja2.Environment(loader=template_loader)\n", - "template = template_environment.get_template(str(TEMPLATE_PATH))\n", - "with (OUTPUT_PATH).open(\"w\") as file:\n", - " file.write(template.render(sources=source_dict_list))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "vscode": { - "interpreter": { - "hash": "0a605734d949a2010a20d990a303989c986e5cf043e82395c74cff7e62d85c33" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/documentation/sources.md b/analyse/notebooks/documentation/sources.md deleted file mode 100644 index cc32a5ac..00000000 --- a/analyse/notebooks/documentation/sources.md +++ /dev/null @@ -1,37 +0,0 @@ -# sources - -{% for source in sources %} - -## {{ source.nom }} - -{{ source.description|default('*Pas encore de description*', true) }} - -| | | -| ------------------------------------------------------------ | ------------------------------------------------------------------------ | -| Type d'usagers accompagnés | {{ source.type_usager }} | -| Lien vers le site/outil/portail | {% if source.lien_source | length %}[lien]({{ source.lien_source }}){% endif %} | -| Thématiques principales abordées | {{ source.thematiques | default('', true) }} | -| Lien vers les statistiques publiques de la source | {% if source.lien_stats_publiques | length %}[lien]({{ source.lien_stats_publiques }}){% endif %} | -| Identifiant de la source dans les données (colonne `source`) | `{{ source.id }}` | -| Date/fréquence de récupération par data.inclusion | {{ source.frequence_recuperation }} {{ source.date_derniere_recuperation | default('', true) }} | - -{% for ressource in source.ressources %} - -#### {{ source.nom }} : {{ ressource.id }} - -{{ ressource.description }} - -| | | -| ------------------------------------------------------------------ | ------------------------------------------ | -| Lien vers les données d'origine | {{ ressource.lien_donnees_producteurs }} | -| Types de données | {{ ressource.types_donnees }} | -| Ces données sont disponibles dans [l'API data.inclusion](https://www.data.inclusion.beta.gouv.fr/api/lapi-data-inclusion) | {{ ressource.api }} | -| Ces données sont disponibles en [Open Data](https://www.data.inclusion.beta.gouv.fr/open-data/acceder-aux-donnees) | {{ ressource.open_data }} | -| Ces données sont disponibles dans [notre outil de siretisation](https://www.data.inclusion.beta.gouv.fr/schemas-de-donnees-de-loffre/siretisation) | {{ ressource.siretisation_automatisable }} | -| Ces données sont disponibles dans notre outil de correspondance | {{ ressource.correspondances }} | -| Nous historicisons les données d'origine | {{ ressource.historisation }} | - -{% endfor %} - -_______ -{% endfor %} \ No newline at end of file diff --git a/analyse/notebooks/etablissements-publics/analyse.ipynb b/analyse/notebooks/etablissements-publics/analyse.ipynb index 74703340..66fc1952 100644 --- a/analyse/notebooks/etablissements-publics/analyse.ipynb +++ b/analyse/notebooks/etablissements-publics/analyse.ipynb @@ -1591,7 +1591,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.11.6" }, "toc": { "base_numbering": 1, diff --git a/analyse/notebooks/finess/analyse.ipynb b/analyse/notebooks/finess/analyse.ipynb deleted file mode 100644 index 639bf6b7..00000000 --- a/analyse/notebooks/finess/analyse.ipynb +++ /dev/null @@ -1,738 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "57314959", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.017194Z", - "start_time": "2022-09-13T18:47:36.833545Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from pyproj import Transformer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55b292c0", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.020842Z", - "start_time": "2022-09-13T18:47:38.018336Z" - } - }, - "outputs": [], - "source": [ - "pd.options.display.max_columns = None\n", - "plt.rc(\"figure\", figsize=[12, 4])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99912d1c", - "metadata": {}, - "outputs": [], - "source": [ - "import dotenv\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n" - ] - }, - { - "cell_type": "markdown", - "id": "4cf1f60c", - "metadata": {}, - "source": [ - "## Analyse" - ] - }, - { - "cell_type": "markdown", - "id": "28a6c754", - "metadata": {}, - "source": [ - "👉 RÉSUMÉ EN FIN DE PAGE" - ] - }, - { - "cell_type": "markdown", - "id": "bfb774c5", - "metadata": {}, - "source": [ - "Documentation du fichier : https://www.data.gouv.fr/fr/datasets/r/d06a0924-9931-4a60-83b6-93abdb6acfd6" - ] - }, - { - "cell_type": "markdown", - "id": "90de2472", - "metadata": {}, - "source": [ - "### Importation des données bruts" - ] - }, - { - "cell_type": "markdown", - "id": "04f3494a", - "metadata": {}, - "source": [ - "Le fichier utilisé n'est pas le fichier brut produit par FINESS (https://www.data.gouv.fr/fr/datasets/finess-extraction-du-fichier-des-etablissements/), mais une version pré-nettoyée via ce [script](https://github.com/taniki/notebooks/blob/master/finess/clean.ipynb)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85953194", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.583316Z", - "start_time": "2022-09-13T18:47:38.022775Z" - } - }, - "outputs": [], - "source": [ - "raw_df = pd.read_csv(\n", - " os.environ.get(\"FINESS_FILE_URL\"),\n", - " sep=\",\",\n", - " index_col=0, # la 1ère ligne du fichier est le numéro de ligne\n", - " on_bad_lines=\"warn\",\n", - " dtype=str,\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "432e8ac3", - "metadata": {}, - "source": [ - "### Aperçu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be5f3319", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.608199Z", - "start_time": "2022-09-13T18:47:38.586253Z" - } - }, - "outputs": [], - "source": [ - "raw_df.sample(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0841a538", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75327f3e", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.630558Z", - "start_time": "2022-09-13T18:47:38.611014Z" - } - }, - "outputs": [], - "source": [ - "# Le fichier contient des catégories d'établissements.\n", - "# Seul un sous-ensemble de ces catégories nous intéresse.\n", - "# cf https://www.notion.so/dora-beta/Analyse-des-donn-es-FINESS-75b23111f35a4057a97ff4e2bb1fa78f\n", - "\n", - "raw_df = raw_df[\n", - " raw_df[\"categagretab\"].isin(['4301', '4302', '4303', '4501', '4601', '4602', '4607'])\n", - " | (raw_df[\"categagretab\"] == \"2202\") & (raw_df[\"categetab\"] == \"228\")\n", - " | (raw_df[\"categagretab\"] == \"2206\") & (raw_df[\"categetab\"] == \"636\")\n", - "]\n", - "\n", - "raw_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9524192", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.634670Z", - "start_time": "2022-09-13T18:47:38.631808Z" - } - }, - "outputs": [], - "source": [ - "# Labels sont issus de la documentation FINESS\n", - "# cf https://www.notion.so/dora-beta/Analyse-des-donn-es-FINESS-75b23111f35a4057a97ff4e2bb1fa78f\n", - "\n", - "categories = {\n", - " '4301': \"Etab. et Services d'Hébergement pour Adultes Handicapés\",\n", - " '4302': \"Services de Travail Protégé pour Adultes Handicapés\",\n", - " '4303': \"Services de Réinsertion Prof pour Adultes Handicapés\",\n", - " '4501': \"Etablissements de l'Aide Sociale à l'Enfance\",\n", - " '4601': \"Etablissements pour Adultes et Familles en Difficulté\",\n", - " '4602': \"Autres Etablissements Sociaux d'Hébergement et d'Accueil\",\n", - " '4607': \"Logements en Structure Collective\",\n", - "}\n", - "\n", - "sub_categories = {\n", - " '228': \"Centre Planification ou Education Familiale Ctre.Planif.Educ.Fam\",\n", - " '636': \"Centre de soins et de prévention Centre soins prév.\",\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "56c6c622", - "metadata": {}, - "source": [ - "### nettoyage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "086049a6", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df = raw_df.replace([\"\", np.nan], None)" - ] - }, - { - "cell_type": "markdown", - "id": "17139648", - "metadata": {}, - "source": [ - "### Taux de remplissage des champs de structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "555b1d31", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.701781Z", - "start_time": "2022-09-13T18:47:38.638269Z" - } - }, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)\n", - "\n", - "compute_field_occupancy_rates(raw_df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "91e5c329", - "metadata": {}, - "source": [ - "### Identifiant local ?" - ] - }, - { - "cell_type": "markdown", - "id": "e4c90b2b", - "metadata": {}, - "source": [ - "2 champs potentiels : `nofinesset` et `nofinessej`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72213154", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.nofinessej.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a724f11c", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.nofinesset.nunique()" - ] - }, - { - "cell_type": "markdown", - "id": "0833a8f7", - "metadata": {}, - "source": [ - "conclusion : `nofinessej` dupliqué et `nofinesset` unique -> `nofinesset` identifiant local" - ] - }, - { - "cell_type": "markdown", - "id": "ff68e8d8", - "metadata": {}, - "source": [ - "### Code insee ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba137a67", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.assign(foo=raw_df.ligneacheminement.str.extract(r\"\\d{5} (.*?)(?= CEDEX|$)\"))[[\"foo\", \"ligneacheminement\"]][raw_df.ligneacheminement.str.contains(\"CEDEX\")]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5ea54cb", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[[\"departement\", \"commune\", \"ligneacheminement\"]].sample(5)\n", - "raw_df[raw_df.nofinesset == \"2A0001269\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1714ece0", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[raw_df.nofinesset == \"970407573\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a162c51a", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df[raw_df.nofinesset == \"970407573\"].iloc[0].to_json(force_ascii=False)" - ] - }, - { - "cell_type": "markdown", - "id": "3547d96b", - "metadata": {}, - "source": [ - "conclusion: departement + commune = code_insee" - ] - }, - { - "cell_type": "markdown", - "id": "ad71b128", - "metadata": {}, - "source": [ - "### Date de màj ?" - ] - }, - { - "cell_type": "markdown", - "id": "9bb33d6d", - "metadata": {}, - "source": [ - "2 champs potentiels : `maj` et `datemaj`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70e0bc40", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.datemaj.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cf5008f8", - "metadata": {}, - "outputs": [], - "source": [ - "plt.rc(\"figure\", figsize=[12, 4])\n", - "sns.set(style=\"darkgrid\")\n", - "sns.histplot(data=raw_df.maj.apply(pd.to_datetime, errors=\"coerce\"), bins=20)" - ] - }, - { - "cell_type": "markdown", - "id": "c51f683e", - "metadata": {}, - "source": [ - "conclusion:\n", - "* `datemaj`: la même date `2022-07-04` --> pas super fiable...\n", - "* `maj`: plus intéressant, mais quelle diff entre les 2 champs ?" - ] - }, - { - "cell_type": "markdown", - "id": "1131c043", - "metadata": {}, - "source": [ - "### SIRET ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "616ce9e5", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.siret.value_counts().head(10).to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0933da59", - "metadata": {}, - "outputs": [], - "source": [ - "# \"31723624800017\" -> Emmaüs\n", - "# \"77568030900611\" -> Coallia\n", - "# \"78805803000016\" -> Adoma\n", - "# ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aba99338", - "metadata": {}, - "outputs": [], - "source": [ - "raw_df.siret.drop_duplicates(keep=False).count()" - ] - }, - { - "cell_type": "markdown", - "id": "cdabfa1f", - "metadata": {}, - "source": [ - "* Pas mal de réseaux et d'antennes\n", - "* 9363 structures uniques" - ] - }, - { - "cell_type": "markdown", - "id": "1bf4f9b1", - "metadata": {}, - "source": [ - "### Nettoyage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "270a1bbc", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:38.895389Z", - "start_time": "2022-09-13T18:47:38.708325Z" - } - }, - "outputs": [], - "source": [ - "df = raw_df.copy()\n", - "\n", - "# Traitement des dates\n", - "df[[\"dateouv\", \"dateautor\", \"maj\", \"datemaj\"]] = df[\n", - " [\"dateouv\", \"dateautor\", \"maj\", \"datemaj\"]\n", - "].apply(pd.to_datetime, errors=\"coerce\")\n", - "\n", - "# Reconstruction des adresses\n", - "df[\"adresse\"] = (\n", - " df.numvoie.fillna(0.0).astype(int).astype(str).replace(0, None)\n", - " + \" \"\n", - " + df.typvoie\n", - " + \" \"\n", - " + df.voie\n", - ")\n", - "\n", - "# Dissocier les codes postaux des noms de ville\n", - "df[[\"cp\", \"commune\"]] = df.ligneacheminement.str.split(\" \", 1, expand=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de217a64", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:39.020715Z", - "start_time": "2022-09-13T18:47:38.896725Z" - } - }, - "outputs": [], - "source": [ - "# Conversion des coordonnées du système géodésique local vers WGS84\n", - "EPSG = {\n", - " # Default value : 2154,\n", - " 'GUADELOUPE': 2970,\n", - " 'MARTINIQUE': 2970,\n", - " 'GUYANE': 2972,\n", - " 'LA REUNION': 2975,\n", - " 'MAYOTTE': 4471,\n", - " 'SAINT PIERRE ET MIQUELON': 4467\n", - "}\n", - "\n", - "def coordinates_to_wgs84(df, from_epsg):\n", - " transformer = Transformer.from_crs(\"epsg:\" + str(from_epsg), \"epsg:4326\", always_xy=True)\n", - " latitude, longitude = transformer.transform(df['coordxet'], df['coordyet'])\n", - " return latitude, longitude\n", - "\n", - "# Transformer les coordonnées de la métropole\n", - "wgs84 = coordinates_to_wgs84(df, 2154)\n", - "mask = ~df.libdepartement.isin(EPSG.values())\n", - "df.loc[mask, 'longitude'] = wgs84[0][mask]\n", - "df.loc[mask, 'latitude'] = wgs84[1][mask]\n", - "\n", - "# Transformer les coordonnées des Territoires d'Outre-Mer\n", - "for location, code in EPSG.items():\n", - " wgs84 = coordinates_to_wgs84(df, code)\n", - " mask = df.libdepartement == location\n", - " df.loc[mask, 'longitude'] = wgs84[0][mask]\n", - " df.loc[mask, 'latitude'] = wgs84[1][mask]" - ] - }, - { - "cell_type": "markdown", - "id": "572ad45f", - "metadata": {}, - "source": [ - "# Analyse des typologies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e44fc80c", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:53.264480Z", - "start_time": "2022-09-13T18:47:53.244886Z" - } - }, - "outputs": [], - "source": [ - "from data_inclusion.schema import models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e95565b", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:55.072006Z", - "start_time": "2022-09-13T18:47:53.921721Z" - } - }, - "outputs": [], - "source": [ - "# Recherche des typologies remarquables\n", - "for typ in models.Typologie:\n", - " if df['rs'].apply(lambda s: typ.value.lower() in s.lower().split()).any():\n", - " print('')\n", - " print(typ.value)\n", - " print(df[df['rs'].apply(lambda s: typ.value.lower() in s.lower().split())].sample(1).rs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd397c29", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:43.047716Z", - "start_time": "2022-09-13T18:47:40.302788Z" - } - }, - "outputs": [], - "source": [ - "categories_flags_places_df = (\n", - " df.rs.str.lower()\n", - " .apply(\n", - " lambda s: {\n", - " \"ase\": \"ase\" in s.split(),\n", - " \"association\": \"association\" in s.split() or \"asso\" in s.split(),\n", - " \"cada\": \"cada\" in s.split(),\n", - " \"cava\": \"cava\" in s.split(),\n", - " \"ccas\": \"ccas\" in s.split(),\n", - " \"chrs\": \"chrs\" in s.split(),\n", - " \"chu\": \"chu\" in s.split(),\n", - " \"cias\": \"cias\" in s.split(),\n", - " \"cidff\": \"cidff\" in s.split(),\n", - " \"csapa\": \"csapa\" in s.split(),\n", - " \"ea\": \"ea\" in s.split(),\n", - " \"esat\": \"esat\" in s.split(),\n", - " \"huda\": \"huda\" in s.split(),\n", - " \"mde\": \"mde\" in s.split(),\n", - " \"mdef\": \"mdef\" in s.split(),\n", - " \"mjc\": \"mjc\" in s.split(),\n", - " \"msa\": \"msa\" in s.split(),\n", - " \"pension\": \"pension\" in s.split(),\n", - " \"prevention\": \"prevention\" in s.split(),\n", - " \"cph\": \"cph\" in s.split(),\n", - " \"udaf\": \"udaf\" in s.split(),\n", - " },\n", - " )\n", - " .apply(pd.Series)\n", - " .assign(\n", - " na=lambda df: df.apply(\n", - " lambda row: ~row.any(), axis=\"columns\", result_type=\"expand\"\n", - " )\n", - " )\n", - ")\n", - "\n", - "categories_flags_places_df.sum().sort_values(ascending=False).plot(kind=\"bar\", grid=True, rot=35, figsize=(20, 8))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05c87cb7", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:43.395839Z", - "start_time": "2022-09-13T18:47:43.049068Z" - } - }, - "outputs": [], - "source": [ - "plt.rc(\"figure\", figsize=[12, 8])\n", - "sns.countplot(\n", - " data=df.fillna('Inconnu'),\n", - " y='libcategetab',\n", - " order = df['libcategetab'].fillna('Inconnu').value_counts().index\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "24e16f8e", - "metadata": {}, - "source": [ - "### Etude des répartitions géographiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e684ec8", - "metadata": { - "ExecuteTime": { - "end_time": "2022-09-13T18:47:44.101298Z", - "start_time": "2022-09-13T18:47:43.397023Z" - } - }, - "outputs": [], - "source": [ - "plt.rc(\"figure\", figsize=[12, 16])\n", - "sns.countplot(\n", - " data=df.fillna('Inconnu'),\n", - " y='libdepartement',\n", - " order = df['libdepartement'].fillna('Inconnu').value_counts().index\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6c51abe2", - "metadata": {}, - "source": [ - "## Résumé" - ] - }, - { - "cell_type": "markdown", - "id": "c4af94f3", - "metadata": {}, - "source": [ - "\n", - "\n", - "
✅ Dataset de structures\n", - "
✅ 100% de dates de création et de mises à jour\n", - "
✅ 100% d'identifiants uniques ('nofinessej')\n", - "
✅ 100% de noms\n", - "
✅ 100% de communes\n", - "
✅ Les champs peuvent être majoritairement remplis proprement\n", - "
\n", - "
⚠️ 84% de SIRET\n", - "
⚠️ Voies et noms de rue spécifiques à considérer\n", - "
⚠️ Nombreuses typologies présentes, demande une analyse plus fine\n", - "
\n", - "
❌ Adresses parfois partielles\n", - "
❌ Aucun rna" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/grist/template.ipynb b/analyse/notebooks/grist/template.ipynb deleted file mode 100644 index 71d160ed..00000000 --- a/analyse/notebooks/grist/template.ipynb +++ /dev/null @@ -1,199 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -e ../../../pipeline\n", - "%pip install -e ../../../../data-inclusion-schema\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import dotenv\n", - "import pandas as pd\n", - "\n", - "from data_inclusion.scripts.tasks import grist\n", - "from data_inclusion import schema" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "\n", - "GRIST_API_TOKEN = os.environ[\"GRIST_API_TOKEN\"]\n", - "GRIST_API_URL = \"https://grist.incubateur.net/api\"\n", - "WORKSPACE_ID = \"27\"\n", - "DOCUMENT_NAME = \"template\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grist_client = grist.GristClient(base_url=GRIST_API_URL, token=GRIST_API_TOKEN)\n", - "\n", - "document_id = grist_client.create_document(\n", - " workspace_id=WORKSPACE_ID, document_name=DOCUMENT_NAME\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for referentiel in [\n", - " \"frais\",\n", - " \"labels_nationaux\",\n", - " \"modes_accueil\",\n", - " \"modes_orientation_accompagnateur\",\n", - " \"modes_orientation_beneficiaire\",\n", - " \"profils\",\n", - " \"thematiques\",\n", - " \"typologies_de_services\",\n", - " \"typologies_de_structures\",\n", - " \"zones_de_diffusion_types\",\n", - "]:\n", - " table_id = grist_client.create_table(\n", - " document_id=document_id,\n", - " table_name=referentiel.capitalize(),\n", - " columns=[\n", - " {\"id\": \"value\", \"fields\": {\"label\": \"valeur\", \"type\": \"Text\"}},\n", - " {\"id\": \"label\", \"fields\": {\"label\": \"label\", \"type\": \"Text\"}},\n", - " ],\n", - " )\n", - "\n", - " referentiel_df = pd.read_csv(\n", - " f\"../../../pipeline/dbt/seeds/schema/{referentiel}.csv\",\n", - " dtype=str,\n", - " )\n", - "\n", - " # attention: pas idempotent\n", - "\n", - " grist_client.add_records(\n", - " document_id=document_id,\n", - " table_id=table_id,\n", - " records=[\n", - " {\"fields\": value_dict}\n", - " for value_dict in referentiel_df[[\"value\", \"label\"]].to_dict(\n", - " orient=\"records\"\n", - " )\n", - " ],\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "\n", - "def get_column_type(field) -> str:\n", - " match_referentiel = re.search(\n", - " r\"data_inclusion.schema.(?P\\w+)\", str(field.annotation)\n", - " )\n", - "\n", - " if match_referentiel is not None:\n", - " return \"Ref:\" + match_referentiel.group(\"referentiel\").capitalize()\n", - " elif \"float\" in str(field.annotation):\n", - " return \"Numeric\"\n", - " elif \"bool\" in str(field.annotation):\n", - " return \"Bool\"\n", - " elif \"date\" in str(field.annotation):\n", - " return \"DateTime:Europe/Paris\"\n", - "\n", - " return \"Text\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grist_columns = [\n", - " {\n", - " \"id\": field_name,\n", - " \"fields\": {\n", - " \"label\": field_name,\n", - " \"type\": get_column_type(field_info),\n", - " # \"visibleCol\": TODO\n", - " },\n", - " }\n", - " for field_name, field_info in schema.Structure.model_fields.items()\n", - "]\n", - "\n", - "grist_client.create_table(\n", - " document_id=document_id,\n", - " table_name=\"Structures\",\n", - " columns=grist_columns,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grist_columns = [\n", - " {\n", - " \"id\": field_name,\n", - " \"fields\": {\n", - " \"label\": field_name,\n", - " \"type\": get_column_type(field_info),\n", - " # \"visibleCol\": TODO\n", - " },\n", - " }\n", - " for field_name, field_info in schema.Service.model_fields.items()\n", - "]\n", - "\n", - "grist_client.create_table(\n", - " document_id=document_id,\n", - " table_name=\"Services\",\n", - " columns=grist_columns,\n", - ")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/mes-aides/garages-solidaires.ipynb b/analyse/notebooks/mes-aides/garages-solidaires.ipynb deleted file mode 100644 index fd22f256..00000000 --- a/analyse/notebooks/mes-aides/garages-solidaires.ipynb +++ /dev/null @@ -1,392 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "f8535e93", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:41:01.600570Z", - "start_time": "2022-08-16T14:41:01.596317Z" - } - }, - "outputs": [], - "source": [ - "# Imports\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from dotenv import load_dotenv, find_dotenv\n", - "from pyairtable import Table\n", - "\n", - "config = load_dotenv(find_dotenv())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "812cfaba", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:45:52.065382Z", - "start_time": "2022-08-16T14:45:52.062933Z" - } - }, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "plt.rc(\"figure\", figsize=[12, 4])" - ] - }, - { - "cell_type": "markdown", - "id": "61863810", - "metadata": {}, - "source": [ - "### Import des données brutes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a544e8e", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:41:14.366749Z", - "start_time": "2022-08-16T14:41:03.119366Z" - } - }, - "outputs": [], - "source": [ - "# Fetch data from Airtable API\n", - "table = Table(\n", - " api_key=os.getenv(\"API_KEY\"),\n", - " base_id=os.getenv(\"BASE_ID\"),\n", - " table_name=os.getenv(\"TABLE_NAME\")\n", - ")\n", - "\n", - "# Loading into a dataframe\n", - "data = [col['fields'] for col in table.all()]\n", - "raw_df = pd.DataFrame(data)" - ] - }, - { - "cell_type": "markdown", - "id": "f523c2c2", - "metadata": {}, - "source": [ - "### Aperçu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca0eadde", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:41:14.399016Z", - "start_time": "2022-08-16T14:41:14.371500Z" - } - }, - "outputs": [], - "source": [ - "raw_df.sample(3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5bd7275a", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:41:43.903825Z", - "start_time": "2022-08-16T14:41:43.857636Z" - } - }, - "outputs": [], - "source": [ - "desc_df = raw_df.describe().drop(['top'], axis=0).transpose()\n", - "desc_df.columns = ['Décompte', 'Valeurs uniques', 'Décompte valeur de la plus fréquente']\n", - "desc_df" - ] - }, - { - "cell_type": "markdown", - "id": "3c6a5914", - "metadata": {}, - "source": [ - "### Nettoyage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c980645", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:42:12.417099Z", - "start_time": "2022-08-16T14:42:12.403361Z" - } - }, - "outputs": [], - "source": [ - "# Date column to datetime\n", - "raw_df[['Créé le', 'Modifié le']] = raw_df[['Créé le', 'Modifié le']].apply(pd.to_datetime, errors='coerce')\n", - "raw_df = raw_df.replace([\"\", np.nan], None)" - ] - }, - { - "cell_type": "markdown", - "id": "691bbfc2", - "metadata": {}, - "source": [ - "### Taux de remplissage des champs de structures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47970d6c", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:43:15.210922Z", - "start_time": "2022-08-16T14:43:15.201914Z" - } - }, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)\n", - "\n", - "compute_field_occupancy_rates(raw_df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "2cc86427", - "metadata": {}, - "source": [ - "### Distribution du premier référencement" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c0d268f", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:46:09.473200Z", - "start_time": "2022-08-16T14:46:09.308306Z" - } - }, - "outputs": [], - "source": [ - "sns.set(style=\"darkgrid\")\n", - "sns.histplot(data=raw_df, x='Créé le', bins=20)" - ] - }, - { - "cell_type": "markdown", - "id": "be694e35", - "metadata": {}, - "source": [ - "### Distribution de la dernière mise à jour" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f011f3a", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:46:11.768033Z", - "start_time": "2022-08-16T14:46:11.519845Z" - } - }, - "outputs": [], - "source": [ - "sns.histplot(data=raw_df, x='Modifié le', bins=10)" - ] - }, - { - "cell_type": "markdown", - "id": "0f1a249b", - "metadata": {}, - "source": [ - "### Typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03cb76f3", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:46:14.379763Z", - "start_time": "2022-08-16T14:46:14.230998Z" - } - }, - "outputs": [], - "source": [ - "sns.countplot(\n", - " data=raw_df.fillna('Type inconnu'),\n", - " y='Type',\n", - " order = raw_df['Type'].fillna('Type inconnu').value_counts().index\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e0a4547a", - "metadata": {}, - "source": [ - "### Répartition géographique" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a5bc08b", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:46:16.243743Z", - "start_time": "2022-08-16T14:46:16.072046Z" - } - }, - "outputs": [], - "source": [ - "sns.countplot(\n", - " data=raw_df.fillna('Inconnu'),\n", - " y='Région Nom',\n", - " order = raw_df['Région Nom'].fillna('Inconnu').value_counts().index\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f07f08c6", - "metadata": {}, - "source": [ - "### Répartition des partenaires" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e1512f4", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T14:46:18.982731Z", - "start_time": "2022-08-16T14:46:18.856057Z" - } - }, - "outputs": [], - "source": [ - "sns.countplot(\n", - " data=raw_df.fillna('Inconnu'),\n", - " y='Partenaire Nom',\n", - " order = raw_df['Partenaire Nom'].fillna('Inconnu').value_counts().index\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b1c8a668", - "metadata": {}, - "source": [ - "### Répartition des services" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36dc05ae", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T16:37:18.375218Z", - "start_time": "2022-08-16T16:37:18.364288Z" - } - }, - "outputs": [], - "source": [ - "def get_unique(df, col):\n", - " unique_values = []\n", - " for el in df[col]:\n", - " if not el is None:\n", - " for sub_el in el:\n", - " if not sub_el in unique_values:\n", - " unique_values.append(sub_el)\n", - " return unique_values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aecd1144", - "metadata": { - "ExecuteTime": { - "end_time": "2022-08-16T16:43:59.771261Z", - "start_time": "2022-08-16T16:43:59.754460Z" - } - }, - "outputs": [], - "source": [ - "print(\"Services : \", get_unique(raw_df, 'Services'))\n", - "print(\"Véhicules traités : \", get_unique(raw_df, 'Types de véhicule'))\n", - "print(\"% sans services indiqués : \",\n", - " raw_df['Services'].isna().sum()/raw_df['Services'].count()*100)\n", - "print(\"% sans véhicules indiqués : \",\n", - " raw_df['Types de véhicule'].isna().sum()/raw_df['Types de véhicule'].count())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "254991d0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "792c9d9a8f1724810e690d2630ae6c136f67c36719c7aa6bb230be23f85b2432" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/monenfant/extract.ipynb b/analyse/notebooks/monenfant/extract.ipynb deleted file mode 100644 index e2508153..00000000 --- a/analyse/notebooks/monenfant/extract.ipynb +++ /dev/null @@ -1,173 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extraction des données monenfant.fr\n", - "\n", - "L'extraction se fait manuellement à partir de la [recherche du site de monenfant.fr](https://monenfant.fr/que-recherchez-vous/).\n", - "\n", - "Il faut :\n", - "\n", - "* ouvrir préalablemet la console du navigateur sur la page \"network\",\n", - "* sélectionner une catégorie (e.g. \"Mode d'accueil\") puis une typologie (e.g. Crêches),\n", - "* remplir le captcha,\n", - "* indiquer une ville et augmenter le rayon de recherche (30km),\n", - "* lancer la recherche,\n", - "* dans la console du navigateur, repérer l'appel à la ressource `que-recherchez-vous` qui contient les résultats au format json (généralement le dernier et le plus gros),\n", - "* sauvegarder la réponse au format json dans le dossier `data` de ce dossier.\n", - "\n", - "La recherche :\n", - "\n", - "* se fait autour d'une ville, dans un rayon de 30km maximum,\n", - "* est limitée à 2000 résultats.\n", - "\n", - "Donc, pour récupérer les données pour un département:\n", - "\n", - "* faire plusieurs recherches pour des villes qui permettent de recouvrir le département,\n", - "* fusionner les résultats,\n", - "* filtrer les résultats pour exclure les données qui ne sont pas dans le département visé.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| département | villes à rechercher |\n", - "|--------------|----------------------------------------------|\n", - "| Essonne | Etampes, Arpajon |\n", - "| Aude | Bram, Lagrasse, Peyriac-de-Mer, Quillan |\n", - "| Drôme | Peyrins, Bouvante, Crest, Taulignan, Séderon |\n", - "| Haute-Savoie | Annecy, Onnion, Passy |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "from pathlib import Path\n", - "\n", - "import pandas as pd\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import time\n", - "from datetime import datetime\n", - "\n", - "now = datetime.now()\n", - "today = now.strftime(\"%d/%m/%Y\")\n", - "\n", - "\n", - "def get_creche(row) -> dict:\n", - " url = \"https://monenfant.fr/web/guest/que-recherchez-vous?p_p_id=fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=%2Frecherche%2Frechercher&p_p_cacheability=cacheLevelPage&_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_cmd=get_structure_details\"\n", - "\n", - " headers = {\n", - " \"User-Agent\": \"data.inclusion@beta.gouv.fr\",\n", - " \"Accept\": \"application/json, text/javascript, */*; q=0.01\",\n", - " \"Accept-Language\": \"fr,en-US;q=0.7,en;q=0.3\",\n", - " \"Content-Type\": \"application/x-www-form-urlencoded; charset=UTF-8\",\n", - " \"X-Requested-With\": \"XMLHttpRequest\",\n", - " \"Origin\": \"https://monenfant.fr\",\n", - " \"DNT\": \"1\",\n", - " \"Connection\": \"keep-alive\",\n", - " \"Referer\": \"https://monenfant.fr/que-recherchez-vous/mode-d-accueil\",\n", - " \"Sec-Fetch-Dest\": \"empty\",\n", - " \"Sec-Fetch-Mode\": \"cors\",\n", - " \"Sec-Fetch-Site\": \"same-origin\",\n", - " \"Sec-GPC\": \"1\",\n", - " }\n", - "\n", - " data = {\n", - " \"_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_id\": row.resultId,\n", - " \"_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_dureeRecherche\": \"345\",\n", - " \"_fr_monenfant_recherche_portlet_RecherchePortlet_INSTANCE_VnedXuapLnSM_dateDebutRecherche\": today,\n", - " }\n", - "\n", - " time.sleep(0.4)\n", - "\n", - " return requests.post(url, headers=headers, data=data).json()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.concat(\n", - " [\n", - " pd.DataFrame(json.load(file.open())[\"results\"])\n", - " for file in Path(\"./data\").glob(\"*.json\")\n", - " ],\n", - " ignore_index=True,\n", - ")\n", - "\n", - "df = df[\n", - " df.adresse.str.contains(r\"91\\d{3}\") # Essonne\n", - " | df.adresse.str.contains(r\"11\\d{3}\") # Aude\n", - " | df.adresse.str.contains(r\"74\\d{3}\") # Drôme\n", - " | df.adresse.str.contains(r\"26\\d{3}\") # Haute-Savoie\n", - "]\n", - "\n", - "df = df.drop_duplicates(subset=[\"resultId\"])\n", - "\n", - "df.shape[0]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.apply(get_creche, axis=1, result_type=\"expand\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_json(\n", - " \"creches.json\",\n", - " orient=\"records\",\n", - " force_ascii=False,\n", - ")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/odspep/analyse-novembre.ipynb b/analyse/notebooks/odspep/analyse-novembre.ipynb deleted file mode 100644 index 38144ebb..00000000 --- a/analyse/notebooks/odspep/analyse-novembre.ipynb +++ /dev/null @@ -1,664 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import dotenv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None\n", - "plt.rc(\"figure\", figsize=[12, 8])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df1 = pd.read_excel(\"~/Downloads/exportDora0311-1.xlsx\", dtype=str).replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "set(df.columns) - set(df1.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df2 = pd.read_excel(\"~/Downloads/exportDora0311-2.xlsx\", dtype=str).replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df3 = pd.read_excel(\"~/Downloads/exportDora0311-3.xlsx\", dtype=str).replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df4 = pd.read_excel(\"~/Downloads/exportDora0311-4.xlsx\", dtype=str).replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.concat([df1, df2, df3, df4])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "compute_field_occupancy_rates(df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Structures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df = df.iloc[\n", - " :,\n", - " df.columns.str.contains(\"ID_RES|STRUCTURE|LIBELLE_SERVICE|DESCRIPTION_SERVICE\")\n", - " | df.columns.str.endswith(\"_ADR\")\n", - " | df.columns.str.endswith(\"_PHY\")\n", - " | df.columns.str.endswith(\"_CTC\"),\n", - "]\n", - "ressources_df = ressources_df.drop_duplicates(subset=\"ID_RES\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.STRUCTURE.duplicated(keep=False).sum()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### siret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### rna" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df[[\"STRUCTURE\", \"LIBELLE_SERVICE\"]].sample(20)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### commune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.LIBELLE_COMMUNE_ADR.isna().sum()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### code_postal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.CODE_POSTAL_ADR.isna().sum()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### code_insee" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.CODE_COMMUNE_ADR.sample(20).to_frame()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.L4_NUMERO_LIB_VOIE_ADR.isna().sum()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### complement_adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### longitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### latitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### telephone" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### courriel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### site_web" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### presentation_detail" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### date_maj" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### structure_parente" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### lien_source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### horaires_ouverture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### accessibilite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### labels_nationaux" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### labels_autres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Services" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### prise_rdv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### frais" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### frais_autres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/odspep/analyse.ipynb b/analyse/notebooks/odspep/analyse.ipynb deleted file mode 100644 index 7cc28f8f..00000000 --- a/analyse/notebooks/odspep/analyse.ipynb +++ /dev/null @@ -1,1053 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "d409edbf", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from datetime import datetime\n", - "\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6f66c3a", - "metadata": {}, - "outputs": [], - "source": [ - "import dotenv\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3ebbe8b", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_excel(os.environ.get(\"ODSPEP_FILE_URL\"), dtype=str).replace(\n", - " [np.nan, \"\"], None\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66852e0a", - "metadata": {}, - "outputs": [], - "source": [ - "df.sample(5)\n" - ] - }, - { - "cell_type": "markdown", - "id": "c8fb0c89", - "metadata": {}, - "source": [ - "### Description générale" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "139baacf", - "metadata": {}, - "outputs": [], - "source": [ - "df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "id": "26798578", - "metadata": {}, - "source": [ - "### analyse\n", - "\n", - "* orienté services\n", - "* 696 services dans le fichier en théorie\n", - "* dénormalisation des autres dimensions (mise à plat) -> `ID_RES` dupliqués\n", - "* jointure pour obtenir les \"libellés\" à partir des codes\n", - "* ok : lat/lon, adresse sous norme AFNOR, code insee\n", - "* ok : horaires d'ouverture au format maison\n", - "* ok : contacts de la structure (champs *_CTC), tel très remplis\n", - "* ko : pas de siret\n", - "* ko : pas de date de maj\n", - "* ko : pas de champs pour mapper la typologie de structure\n", - "\n", - "### documentation\n", - "\n", - "* les champs `*_ADR` correspondent à l'adresse de la structure (et non du service)\n", - "* pour obtenir la zone de diffusion du service, regrouper le champs `COM_SERVICE` pour un même `ID_RES`\n" - ] - }, - { - "cell_type": "markdown", - "id": "118e69ef", - "metadata": {}, - "source": [ - "### Répartition géographique ?\n", - "\n", - "* à prendre avec des pincettes car le fichier est dénormalisé\n", - "* d'autres départements que ceux initialement demandés sont présent car un service dans le cd35 par exemple peut avoir une zone de diffusion plus grande que le département." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3298550b", - "metadata": {}, - "outputs": [], - "source": [ - "df[[\"DEPT_SERVICE\", \"Departement Service\"]].value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "52167c09", - "metadata": {}, - "source": [ - "## Structures" - ] - }, - { - "cell_type": "markdown", - "id": "ce4a47cb", - "metadata": {}, - "source": [ - "### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e246d2a1", - "metadata": {}, - "outputs": [], - "source": [ - "df.shape[0]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4ffb934", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df = df.iloc[\n", - " :,\n", - " (df.columns == \"ID_RES\")\n", - " | (df.columns == \"STRUCTURE\")\n", - " | (df.columns == \"LIBELLE_SERVICE\")\n", - " | (df.columns == \"DESCRIPTION_SERVICE\")\n", - " | (df.columns == \"DATE DERNIERE MAJ\")\n", - " | df.columns.str.endswith(\"_ADR\")\n", - " | df.columns.str.endswith(\"_PHY\")\n", - " | df.columns.str.endswith(\"_CTC\"),\n", - "]\n", - "ressources_df = ressources_df.drop_duplicates(subset=\"ID_RES\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d6362b5", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.info()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1528a216", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.shape[0]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fc1b1b8", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.STRUCTURE.duplicated(keep=False).sum()\n" - ] - }, - { - "cell_type": "markdown", - "id": "b178987d", - "metadata": {}, - "source": [ - "### siret" - ] - }, - { - "cell_type": "markdown", - "id": "7efd71f8", - "metadata": {}, - "source": [ - "### rna" - ] - }, - { - "cell_type": "markdown", - "id": "958b9632", - "metadata": {}, - "source": [ - "### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90f19548", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df[[\"STRUCTURE\", \"LIBELLE_SERVICE\"]].sample(20)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cf2f66a1", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.STRUCTURE.isna().sum()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4082e7fd", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.LIBELLE_SERVICE.isna().sum()\n" - ] - }, - { - "cell_type": "markdown", - "id": "6d5bbe5b", - "metadata": {}, - "source": [ - "### commune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00132cd9", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.LIBELLE_COMMUNE_ADR.isna().sum()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee50f485", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.LIBELLE_COMMUNE_ADR.sample(20).to_frame()\n" - ] - }, - { - "cell_type": "markdown", - "id": "3ad7d271", - "metadata": {}, - "source": [ - "### code_postal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15d542df", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.CODE_POSTAL_ADR.isna().sum()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b54b9b8", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.CODE_POSTAL_ADR.sample(20).to_frame()\n" - ] - }, - { - "cell_type": "markdown", - "id": "c2485bc2", - "metadata": {}, - "source": [ - "### code_insee" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6ee1801c", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.CODE_COMMUNE_ADR.sample(20).to_frame()\n" - ] - }, - { - "cell_type": "markdown", - "id": "d35699ce", - "metadata": {}, - "source": [ - "### adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb13b8d6", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.L4_NUMERO_LIB_VOIE_ADR.isna().sum()\n" - ] - }, - { - "cell_type": "markdown", - "id": "d25d2c35", - "metadata": {}, - "source": [ - "### complement_adresse" - ] - }, - { - "cell_type": "markdown", - "id": "95a1440c", - "metadata": {}, - "source": [ - "### longitude" - ] - }, - { - "cell_type": "markdown", - "id": "201b22f9", - "metadata": {}, - "source": [ - "### latitude" - ] - }, - { - "cell_type": "markdown", - "id": "878bf34b", - "metadata": {}, - "source": [ - "### typologie" - ] - }, - { - "cell_type": "markdown", - "id": "66528ac1", - "metadata": {}, - "source": [ - "### telephone" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04f9a397", - "metadata": {}, - "outputs": [], - "source": [ - "contacts_df = df[\n", - " [\n", - " \"ID_RES\",\n", - " \"ID_CTC\",\n", - " \"TEL_1_CTC\",\n", - " \"TEL_2_CTC\",\n", - " \"FAX_CTC\",\n", - " \"SITE_INTERNET_CTC\",\n", - " \"MAIL_CTC\",\n", - " ]\n", - "]\n", - "contacts_df = contacts_df.drop_duplicates()" - ] - }, - { - "cell_type": "markdown", - "id": "4ba8c30f", - "metadata": {}, - "source": [ - "### courriel" - ] - }, - { - "cell_type": "markdown", - "id": "def1ef3a", - "metadata": {}, - "source": [ - "### site_web" - ] - }, - { - "cell_type": "markdown", - "id": "651ae1b1", - "metadata": {}, - "source": [ - "### presentation_resume" - ] - }, - { - "cell_type": "markdown", - "id": "7c0ff1eb", - "metadata": {}, - "source": [ - "### presentation_detail" - ] - }, - { - "cell_type": "markdown", - "id": "81052997", - "metadata": {}, - "source": [ - "### date_maj" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11996258", - "metadata": {}, - "outputs": [], - "source": [ - "df[\"DATE DERNIERE MAJ\"].value_counts()\n" - ] - }, - { - "cell_type": "markdown", - "id": "a850be11", - "metadata": {}, - "source": [ - "### structure_parente" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06339db8", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.STRUCTURE.duplicated(keep=False).sum()\n" - ] - }, - { - "cell_type": "markdown", - "id": "764d181d", - "metadata": {}, - "source": [ - "### lien_source" - ] - }, - { - "cell_type": "markdown", - "id": "142f0697", - "metadata": {}, - "source": [ - "### horaires_ouverture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "793ec749", - "metadata": {}, - "outputs": [], - "source": [ - "horaires_df = df.iloc[\n", - " :,\n", - " df.columns.str.contains(\"ID_RES|COMMENTAIRES_HORAIRE_RSP\")\n", - " | df.columns.str.endswith(\"_HOR\"),\n", - "]\n", - "horaires_df = horaires_df.drop_duplicates()\n", - "horaires_df = horaires_df.dropna(subset=[\"JOUR_HOR\"])\n", - "horaires_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b47b126", - "metadata": {}, - "outputs": [], - "source": [ - "horaires_df.sample(10)\n" - ] - }, - { - "cell_type": "markdown", - "id": "de2fc69e", - "metadata": {}, - "source": [ - "### accessibilite" - ] - }, - { - "cell_type": "markdown", - "id": "b802d8fc", - "metadata": {}, - "source": [ - "### labels_nationaux" - ] - }, - { - "cell_type": "markdown", - "id": "d23fd297", - "metadata": {}, - "source": [ - "### labels_autres" - ] - }, - { - "cell_type": "markdown", - "id": "ee71c21b", - "metadata": {}, - "source": [ - "### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9043e244", - "metadata": {}, - "outputs": [], - "source": [ - "familles_df = df[[\"ID_RES\", \"CODE_FAM\", \"FamilleBesoin\"]]\n", - "familles_df = familles_df.drop_duplicates()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8975d65", - "metadata": {}, - "outputs": [], - "source": [ - "familles_df.ID_RES.value_counts().to_frame().head(20)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "255cdeeb", - "metadata": {}, - "outputs": [], - "source": [ - "familles_df[[\"CODE_FAM\", \"FamilleBesoin\"]].value_counts().to_frame()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d282a2d", - "metadata": {}, - "outputs": [], - "source": [ - "categories_df = df[[\"ID_RES\", \"CODE_CAT\", \"Besoin\"]]\n", - "categories_df = categories_df.drop_duplicates()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "208c052f", - "metadata": {}, - "outputs": [], - "source": [ - "categories_df.ID_RES.value_counts().to_frame().head(20)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27e3e4a9", - "metadata": {}, - "outputs": [], - "source": [ - "categories_df[[\"CODE_CAT\", \"Besoin\"]].value_counts().to_frame()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62a844cd", - "metadata": {}, - "outputs": [], - "source": [ - "sous_categories_df = df[[\"ID_RES\", \"CODE_SSC\", \"Sous besoin\"]]\n", - "sous_categories_df = sous_categories_df.drop_duplicates()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e36b4bae", - "metadata": {}, - "outputs": [], - "source": [ - "sous_categories_df.ID_RES.value_counts().to_frame().head(20)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e711dfa5", - "metadata": {}, - "outputs": [], - "source": [ - "sous_categories_df[[\"CODE_SSC\", \"Sous besoin\"]].value_counts().to_frame()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d61c6db", - "metadata": {}, - "outputs": [], - "source": [ - "all_cat_df = df[\n", - " [\n", - " \"ID_RES\",\n", - " \"CODE_FAM\",\n", - " \"CODE_CAT\",\n", - " \"CODE_SSC\",\n", - " \"FamilleBesoin\",\n", - " \"Besoin\",\n", - " \"Sous besoin\",\n", - " ]\n", - "]\n", - "all_cat_df.drop_duplicates()\n", - "all_cat_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54db44ee", - "metadata": {}, - "outputs": [], - "source": [ - "all_cat_df[\"Besoin\"].drop_duplicates().to_list()\n" - ] - }, - { - "cell_type": "markdown", - "id": "377ea72f", - "metadata": {}, - "source": [ - "# Services" - ] - }, - { - "cell_type": "markdown", - "id": "2a9515c9", - "metadata": {}, - "source": [ - "### id" - ] - }, - { - "cell_type": "markdown", - "id": "9a49dc94", - "metadata": {}, - "source": [ - "### structure_id" - ] - }, - { - "cell_type": "markdown", - "id": "9e0733c9", - "metadata": {}, - "source": [ - "### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d96e5d7e", - "metadata": {}, - "outputs": [], - "source": [ - "ressources_df.DESCRIPTION_SERVICE.map(len)\n" - ] - }, - { - "cell_type": "markdown", - "id": "1aec52bf", - "metadata": {}, - "source": [ - "### Taux de remplissage des champs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9829238c", - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "658e6e06", - "metadata": {}, - "outputs": [], - "source": [ - "compute_field_occupancy_rates(df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "3d7558bc", - "metadata": {}, - "source": [ - "Typologie de structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dba27235", - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.schema import models\n", - "\n", - "categories_flags_services_df = (\n", - " services_df.STRUCTURE.str.lower()\n", - " .apply(\n", - " lambda s: {\n", - " models.Typologie.CAF.value: \"caf\" in s.split()\n", - " or (\"caisse\" in s and \"allocation\" in s and \"fami\" in s),\n", - " models.Typologie.CC.value: \"communaut\" in s\n", - " and \"commune\" in s\n", - " and \"maternelle\" not in s,\n", - " models.Typologie.ASSO.value: \"association\" in s.split(),\n", - " models.Typologie.CCAS.value: \"ccas\" in s.split()\n", - " or \"social\" in s\n", - " and \"action\" in s,\n", - " models.Typologie.CHRS.value: \"chrs\" in s.split()\n", - " or (\"bergement\" in s and \"insertion\" in s),\n", - " models.Typologie.RS_FJT.value: (\"sidence\" in s and \"social\" in s)\n", - " or \"fjt\" in s\n", - " or (\"foyer\" in s and \"jeune\" in s and \"travail\" in s),\n", - " models.Typologie.CS.value: \"centre social\" in s,\n", - " models.Typologie.MDS.value: \"maison\" in s and \"solidarit\" in s,\n", - " models.Typologie.ML.value: \"mission\" in s and \"local\" in s,\n", - " models.Typologie.MDPH.value: \"maison\" in s and \"handic\" in s,\n", - " }\n", - " )\n", - " .apply(pd.Series)\n", - " .assign(\n", - " na=lambda df: df.apply(\n", - " lambda row: ~row.any(), axis=\"columns\", result_type=\"expand\"\n", - " )\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b4d3e19", - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_services_df.iloc[:, categories_flags_services_df.columns != \"na\"].any(\n", - " axis=\"columns\"\n", - ").sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f99f834", - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_services_df.sum().sort_values(ascending=False).plot(\n", - " kind=\"bar\", grid=True, rot=35, figsize=(20, 8)\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59a08ad0", - "metadata": {}, - "outputs": [], - "source": [ - "df.iloc[:, df.columns.str.endswith(\"_CTC\")].sample(10)" - ] - }, - { - "cell_type": "markdown", - "id": "597d8cb5", - "metadata": {}, - "source": [ - "Champ `type service partenaire` pour extraire un type de structure ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc8d9c8a", - "metadata": {}, - "outputs": [], - "source": [ - "df.drop_duplicates(subset=\"ID_RES\")[\n", - " \"type service partenaire \"\n", - "].value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "14272e29", - "metadata": {}, - "source": [ - "### Nombre de structures sous-jacentes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f97b05c", - "metadata": {}, - "outputs": [], - "source": [ - "df.STRUCTURE.nunique()" - ] - }, - { - "cell_type": "markdown", - "id": "34ee74a0", - "metadata": {}, - "source": [ - "Aperçu de la distribution du nombres de services par structure\n", - "\n", - "Pour la majorité : un service par structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be5ec0c6", - "metadata": {}, - "outputs": [], - "source": [ - "df.drop_duplicates(\"ID_RES\").STRUCTURE.value_counts().head(10).to_frame()" - ] - }, - { - "cell_type": "markdown", - "id": "28132e5e", - "metadata": {}, - "source": [ - "### Siretisation automatique" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "344d0cb2", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import importlib\n", - "\n", - "import dotenv\n", - "from tqdm.auto import tqdm\n", - "import sqlalchemy as sqla\n", - "\n", - "from data_inclusion.tasks import siretisation\n", - "\n", - "tqdm.pandas()\n", - "\n", - "# reload siretisation without restarting the entire kernel\n", - "importlib.reload(siretisation)\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "\n", - "engine = sqla.create_engine(os.environ[\"SIRENE_DATABASE_URL\"])\n", - "\n", - "structures_df = df.drop_duplicates(\"ID_RES\")\n", - "\n", - "establishments_df = structures_df.progress_apply(\n", - " lambda row: siretisation.search_establishment(\n", - " nom=row.STRUCTURE,\n", - " adresse=row.L4_NUMERO_LIB_VOIE_ADR,\n", - " code_insee=row.CODE_COMMUNE_ADR,\n", - " latitude=row.LATITUDE_ADR,\n", - " longitude=row.LONGITUDE_ADR,\n", - " engine=engine,\n", - " )\n", - " or {},\n", - " axis=\"columns\",\n", - " result_type=\"expand\",\n", - ")\n", - "\n", - "structures_siretisees_df = pd.merge(\n", - " df,\n", - " establishments_df,\n", - " how=\"left\",\n", - " right_index=True,\n", - " left_index=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e1202152", - "metadata": {}, - "source": [ - "#### Résultats de la siretisation automatique" - ] - }, - { - "cell_type": "markdown", - "id": "64a34ead", - "metadata": {}, - "source": [ - "% de structures siretisées" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c509c5ea", - "metadata": {}, - "outputs": [], - "source": [ - "structures_siretisees_df.drop_duplicates(\n", - " \"STRUCTURE\"\n", - ").siret.notna().sum() * 100 / structures_siretisees_df.drop_duplicates(\n", - " \"STRUCTURE\"\n", - ").shape[\n", - " 0\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "7a881b52", - "metadata": {}, - "source": [ - "% de services avec structures siretisées\n", - "\n", - "Certaines structures ont un nombre important de services." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9baf9b8", - "metadata": {}, - "outputs": [], - "source": [ - "structures_siretisees_df.siret.notna().sum() * 100 / structures_siretisees_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "id": "f5163edb", - "metadata": {}, - "source": [ - "établissement vs structures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59fefa2f", - "metadata": {}, - "outputs": [], - "source": [ - "structures_siretisees_df[structures_siretisees_df.siret.notna()][\n", - " [\"ID_RES\", \"STRUCTURE\", \"siret\", \"name\"]\n", - "]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/odspep/normalize-2023.ipynb b/analyse/notebooks/odspep/normalize-2023.ipynb deleted file mode 100644 index cb025f7f..00000000 --- a/analyse/notebooks/odspep/normalize-2023.ipynb +++ /dev/null @@ -1,267 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "d409edbf", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import dotenv\n", - "from pathlib import Path\n", - "import os\n", - "\n", - "basepath=Path('/home/colin/data/gip/pole-emploi/')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "967fd250", - "metadata": {}, - "outputs": [], - "source": [ - "# pd.options.display.max_rows = None\n", - "# pd.options.display.max_columns = None\n", - "\n", - "# dotenv.load_dotenv(dotenv.find_dotenv())\n", - "# %load_ext dotenv\n", - "# %dotenv\n", - "\n", - "import env" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb0d7f1f", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['SCALEWAY_ACCESS']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6eaf148", - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36d92ef2", - "metadata": {}, - "outputs": [], - "source": [ - "# Testing S3 and Minio\n", - "\n", - "s3_client = Minio(os.environ[\"SCALEWAY_URL\"],\n", - " access_key=os.environ[\"SCALEWAY_ACCESS\"],\n", - " secret_key=os.environ[\"SCALEWAY_SECRET\"],\n", - " region=os.environ['MINIO_REGION']\n", - " )\n", - "\n", - "objects = s3_client.list_objects(os.environ['SCALEWAY_BUCKET'], prefix=\"sources/odspep/2023-01-23/denormalized/Exports/\")" - ] - }, - { - "cell_type": "markdown", - "id": "f8c61142", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42d0f2df", - "metadata": {}, - "outputs": [], - "source": [ - "from minio import Minio\n", - "\n", - "def get_from_s3(filepath: str) -> Path:\n", - " s3_client = Minio(os.environ[\"SCALEWAY_URL\"],\n", - " access_key=os.environ[\"SCALEWAY_ACCESS\"],\n", - " secret_key=os.environ[\"SCALEWAY_SECRET\"],\n", - " region=os.environ['MINIO_REGION']\n", - " )\n", - " try:\n", - " data = s3_client.get_object(environ['SCALEWAY_BUCKET'], filepath)\n", - " # Save the file locally with same name\n", - " local_filepath: Path = Path(os.getcwd()) / filepath.split(\"/\")[-1]\n", - " print(local_filepath)\n", - " with open(local_filepath, 'wb') as file_data:\n", - " for d in data.stream(32*1024):\n", - " file_data.write(d)\n", - " finally:\n", - " data.close()\n", - " data.release_conn()\n", - " return local_filepath\n", - "\n", - "\n", - "import warnings\n", - "def read_odspep_excel(name: str) -> pd.DataFrame:\n", - " with warnings.catch_warnings(record=True):\n", - " warnings.simplefilter(\"always\")\n", - " return pd.read_excel(\n", - " get_from_s3(f\"sources/odspep/2023-01-23/denormalized/Exports/DD009_{name}.xlsx\"), dtype=str)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c4e5f63d", - "metadata": {}, - "source": [ - "### RES_PARTENARIALE => ressources.csv\n", - "\n", - "Fetch raw ressources data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9511d550", - "metadata": {}, - "outputs": [], - "source": [ - "df_res_partenariale = read_odspep_excel(\"RES_PARTENARIALE\")\n", - "print(df_res_partenariale.columns)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de7bf1dc", - "metadata": {}, - "outputs": [], - "source": [ - "df_res_partenariale['']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fbdd79d", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "df_res_partenariale.rename(inplace=True,columns={\n", - " \"SERVICE_RSP\": \"LIBELLE_SERVICE\",\n", - " \"SERVICE_DESCRIPTION_RSP\": \"DESCRIPTION_SERVICE\",\n", - "})\n", - "\n", - "print(list(df_res_partenariale.columns))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "e903d0be", - "metadata": {}, - "source": [ - "Fetch raw adresse data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13559868", - "metadata": {}, - "outputs": [], - "source": [ - "df_adresse = pd.read_excel(\n", - " ,\n", - " dtype=str, \n", - " )\n", - " \n", - "print(list(df_res_partenariale.columns))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3104382", - "metadata": {}, - "outputs": [], - "source": [ - "resources_target_columns = pd.read_csv(basepath / \"2022-11-23/normalized/ressources.csv\", sep=\"|\").columns\n", - "resources_target_columns = list(resources_target_columns)\n", - "print(resources_target_columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea9cc01e", - "metadata": {}, - "outputs": [], - "source": [ - "df_ressources = pd.read_csv(basepath / \"2022-11-23/normalized/ressources.csv\", sep=\"|\")\n", - "df_ressources['SERVICE_RSP'].head(\n", - " \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85319497", - "metadata": {}, - "outputs": [], - "source": [ - "df_res_partenariale.to_csv(\n", - " \"ressources.csv\",\n", - " sep=\"|\",\n", - " index=False)\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "dd946d6ba280afe0edaacee9742095166062224f90f077bd96e57e8739365e9a" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/odspep/normalize.ipynb b/analyse/notebooks/odspep/normalize.ipynb deleted file mode 100644 index 47953984..00000000 --- a/analyse/notebooks/odspep/normalize.ipynb +++ /dev/null @@ -1,256 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import defaultdict\n", - "\n", - "import dotenv\n", - "import numpy as np\n", - "import pandas as pd\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "pd.options.display.max_columns = None\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ce notebook permet de re-normaliser les données extraites." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def normalize_dataframe(df: pd.DataFrame) -> dict[str, pd.DataFrame]:\n", - " df_by_tbl_name = {}\n", - "\n", - " df = df.replace([np.nan, \"\"], None)\n", - "\n", - " ressources_df = df.iloc[\n", - " :,\n", - " (df.columns == \"ID_RES\")\n", - " | (df.columns == \"STRUCTURE\")\n", - " | (df.columns == \"LIBELLE_SERVICE\")\n", - " | (df.columns == \"DESCRIPTION_SERVICE\")\n", - " | (df.columns == \"DATE DERNIERE MAJ\")\n", - " | (df.columns == \"SERVICE_RSP\")\n", - " | df.columns.str.endswith(\"_ADR\")\n", - " | df.columns.str.endswith(\"_PHY\")\n", - " ]\n", - " ressources_df = ressources_df.drop_duplicates(subset=\"ID_RES\")\n", - " df_by_tbl_name[\"ressources\"] = ressources_df\n", - "\n", - " contacts_df = df[\n", - " [\n", - " \"ID_RES\",\n", - " \"ID_CTC\",\n", - " \"TEL_1_CTC\",\n", - " \"TEL_2_CTC\",\n", - " \"FAX_CTC\",\n", - " \"SITE_INTERNET_CTC\",\n", - " \"MAIL_CTC\",\n", - " ]\n", - " ]\n", - " contacts_df = contacts_df.drop_duplicates()\n", - " df_by_tbl_name[\"contacts\"] = contacts_df\n", - "\n", - " horaires_df = df.iloc[\n", - " :,\n", - " (df.columns == \"ID_RES\")\n", - " | (df.columns == \"COMMENTAIRES_HORAIRE_RSP\")\n", - " | df.columns.str.endswith(\"_HOR\"),\n", - " ]\n", - " horaires_df = horaires_df.drop_duplicates()\n", - " horaires_df = horaires_df.dropna(subset=[\"JOUR_HOR\"])\n", - " df_by_tbl_name[\"horaires\"] = horaires_df\n", - "\n", - " familles_df = df[[\"ID_RES\", \"CODE_FAM\", \"FamilleBesoin\"]]\n", - " familles_df = familles_df.drop_duplicates()\n", - " df_by_tbl_name[\"familles\"] = familles_df\n", - "\n", - " categories_df = df[[\"ID_RES\", \"CODE_CAT\", \"Besoin\"]]\n", - " categories_df = categories_df.drop_duplicates()\n", - " df_by_tbl_name[\"categories\"] = categories_df\n", - "\n", - " sous_categories_df = df[[\"ID_RES\", \"CODE_SSC\", \"Sous besoin\"]]\n", - " sous_categories_df = sous_categories_df.drop_duplicates()\n", - " df_by_tbl_name[\"sous_categories\"] = sous_categories_df\n", - "\n", - " return df_by_tbl_name\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfs = []\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfs.append(\n", - " pd.read_excel(\n", - " \"https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/odspep/2022-11-23/denormalized/exportDORA14092022.xlsx\",\n", - " dtype=str,\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfs.append(\n", - " pd.read_excel(\n", - " \"https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/odspep/2022-11-23/denormalized/exportDora0311-1.xlsx\",\n", - " dtype=str,\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfs.append(\n", - " pd.read_excel(\n", - " \"https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/odspep/2022-11-23/denormalized/exportDora0311-2.xlsx\",\n", - " dtype=str,\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfs.append(\n", - " pd.read_excel(\n", - " \"https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/odspep/2022-11-23/denormalized/exportDora0311-3.xlsx\",\n", - " dtype=str,\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfs.append(\n", - " pd.read_excel(\n", - " \"https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/odspep/2022-11-23/denormalized/exportDora0311-4.xlsx\",\n", - " dtype=str,\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for df in dfs[1:]:\n", - " df[\"DATE DERNIERE MAJ\"] = pd.to_datetime(\n", - " df[\"MAX_DATE\"].astype(float), unit=\"D\", origin=\"1899-12-30\"\n", - " ).map(lambda dt: dt.isoformat(sep=\" \", timespec=\"seconds\"))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_by_tbl_name = defaultdict(pd.DataFrame)\n", - "\n", - "for df in dfs:\n", - " local_df_by_tbl_name = normalize_dataframe(df)\n", - "\n", - " for tbl_name, df in local_df_by_tbl_name.items():\n", - " df_by_tbl_name[tbl_name] = pd.concat([df_by_tbl_name[tbl_name], df])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for tbl_name, df in df_by_tbl_name.items():\n", - " print(tbl_name, df.duplicated(subset=[\"ID_RES\"], keep=False).sum())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for tbl_name, df in df_by_tbl_name.items():\n", - " df.to_csv(\n", - " f\"{tbl_name}.csv\",\n", - " index=False,\n", - " sep=\"|\",\n", - " )\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/reseau-alpha/.gitignore b/analyse/notebooks/reseau-alpha/.gitignore deleted file mode 100644 index d1452f15..00000000 --- a/analyse/notebooks/reseau-alpha/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -structures -services -*.html \ No newline at end of file diff --git a/analyse/notebooks/reseau-alpha/extract.ipynb b/analyse/notebooks/reseau-alpha/extract.ipynb deleted file mode 100644 index 07722a2c..00000000 --- a/analyse/notebooks/reseau-alpha/extract.ipynb +++ /dev/null @@ -1,331 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Scraping des structures et services publiés sur le site Web de Réseau alpha\n", - "\n", - "Le scraping commence sur cette page pour l'Essonne : https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D=\n", - "\n", - "Cette page est générée dynamiquement et Scrapy ne peut donc pas en extraire le contenu. Le HTML doit donc être extrait à la main et sauvegardé dans le même dossier que ce notebook sous le nom `structure_list.html`.\n", - "\n", - "Le script permet de scraper une copie locale du HTML pour les formations et les structures. C'est utile pour tester le script sans envoyer de requêtes au site Web original. Pour ce faire :\n", - "\n", - "1. Faire tourner au moins une fois le scrap avec RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 pour télécharger le HTML depuis le site live sur l'ordinateur dans les dossiers `./structures` et `./services`\n", - "2. Set RESEAU_ALPHA_TEST_W_LOCAL_FILES=1\n", - "\n", - "### Structure du script\n", - "\n", - "1. `start_requests` démarre le scraping à partir de la page de résultats de rechercher\n", - "2. `parse` parse cette page pour extraire la liste des formations (pas encore les permanences)\n", - "3. `parse_formation` scrape le contenu de la page de chaque formation et passe le dictionnaire item à la fonction suivante\n", - "4. `parse_structure` scrape la page de la structure liée à la formation en enrichissant le dictionnaire item. Cette fonction est appelée autant de fois qu'il y a de lieux pour la formation\n", - "5. à la fin de `parse_structure`, le dictionnaire item est \"yield\" pour former une ligne du CSV (ou un objet dans l'array JSON)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import scrapy\n", - "from scrapy.crawler import CrawlerProcess\n", - "from pathlib import Path\n", - "from urllib.parse import urlparse \n", - "import re\n", - "import dateparser\n", - "import os\n", - "import dotenv\n", - "import trafilatura\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "TESTING_WITH_LOCAL_FILES = os.getenv(\"ENV_VAR\", 'False').lower() in ('true', '1', 't')\n", - "\n", - "# Local HTML\n", - "base_path = 'file://' + os.path.abspath('')\n", - "structure_base_path = base_path + '/structures'\n", - "formation_base_path = base_path + '/services'\n", - "\n", - "\n", - "\n", - "URL = f\"{base_path}/structure_list.html\"\n", - "if TESTING_WITH_LOCAL_FILES is False:\n", - " os.makedirs(structure_base_path, exist_ok=True)\n", - " os.makedirs(formation_base_path, exist_ok=True)\n", - "\n", - "# Live HTML (don't use too much to avoid being banned!)\n", - "# structure_base_url = 'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'\n", - "\n", - "\n", - "# Structure avec antennes et formations : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries\n", - "# Structure sans antenne et sans formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/acafi\n", - "# Formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries/formation/francais-a-visee-professionnelle/b8a73-francais-a-visee-sociale-et-ou-professionnelle\n", - "\n", - "def html_to_markdown(s: str):\n", - " if s is None or s == \"\" :\n", - " return s\n", - " if type(s) == list:\n", - " s = \"
\".join(s)\n", - " return trafilatura.extract(trafilatura.load_html(\"\" + s + \"\"), no_fallback=True, max_tree_size=1000)\n", - "\n", - "def clean_adresse(adresses: list or scrapy.Selector) -> {} or []:\n", - " lieux = []\n", - " for adresse in adresses:\n", - " adresse_text_chunks = adresse.xpath('text()').getall()\n", - " clean_lieu = {\n", - " \"structure_service_adresse_entiere\": \"\",\n", - " \"structure_service_adresse\": \"\",\n", - " \"structure_service_code_postal\": \"\",\n", - " \"structure_service_commune\": \"\"\n", - " }\n", - " for part in adresse_text_chunks:\n", - " part = part.strip()\n", - " if re.match(r'^\\d', part):\n", - " if re.match(r'^\\d{5}', part):\n", - " split_address = part.split(\" - \")\n", - " clean_lieu[\"structure_service_code_postal\"] = split_address[0]\n", - " clean_lieu[\"structure_service_commune\"] = split_address[1]\n", - " else:\n", - " clean_lieu[\"structure_service_adresse\"] = part\n", - " clean_lieu[\"structure_service_adresse_entiere\"] += part + \", \"\n", - " lieux.append(clean_lieu)\n", - " return lieux\n", - "\n", - "def strip(maybe_string):\n", - " if type(maybe_string) == str:\n", - " return maybe_string.strip()\n", - " if maybe_string == None:\n", - " return \"\"\n", - " else:\n", - " return maybe_string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class AlphaSpider(scrapy.Spider):\n", - " name = \"alpha\"\n", - " custom_settings = {\n", - " \"DOWNLOAD_DELAY\": 0 if TESTING_WITH_LOCAL_FILES else 0.5\n", - " }\n", - "\n", - " def start_requests(self):\n", - " urls = [\n", - " URL\n", - " ]\n", - " for url in urls:\n", - " yield scrapy.Request(url=url, callback=self.parse)\n", - "\n", - " def parse(self, response):\n", - " \n", - " formations_links = response.css('div#div-accordion-formation > div.contact-content a.readon')\n", - " \n", - " if TESTING_WITH_LOCAL_FILES:\n", - " for slug in formations_links.xpath('@href').getall():\n", - " next_page = f\"{formation_base_path}/{slug.split('/')[-1]}\"\n", - " yield scrapy.Request(next_page, callback=self.parse_formation)\n", - " else:\n", - " for a in formations_links:\n", - " yield response.follow(a, callback=self.parse_formation)\n", - "\n", - "\n", - " def parse_formation(self, response):\n", - "\n", - " if TESTING_WITH_LOCAL_FILES is False:\n", - " # Downloading HTML content\n", - " page = response.url.split(\"/\")[-1]\n", - " # Path doesn't deal with file:// URIs\n", - " filepath = Path(formation_base_path[7:]) / page\n", - " filepath.write_bytes(response.body)\n", - "\n", - " formation_entete = response.css('div.entete')\n", - " formation_contenu = response.css('div.entete + div')\n", - " formation_contenu_col1 = response.css('div.entete + div > div:nth-child(1)')\n", - " formation_contenu_col2 = response.css('div.entete + div > div:nth-child(2)')\n", - " formation_inscription_info = formation_contenu_col2.css('div:nth-of-type(1)')\n", - " formation_inscription_contact = formation_contenu_col2.css('div:nth-of-type(2)')\n", - " formation_informations_pratiques = formation_contenu_col2.css('div:nth-of-type(3)')\n", - " formation_lieux_horaires = response.css('div#lieux-formation')\n", - "\n", - "\n", - " # SERVICE\n", - " item = {}\n", - "\n", - " # Nom\n", - " service_nom_1 = strip(response.css(\"div.titre-element > strong::text\").get())\n", - " service_nom_2 = strip(response.css(\"a.underline.red-alpha + div::text\").get())\n", - " item[\"nom\"] = f\"{service_nom_1} ({service_nom_2})\"\n", - "\n", - " # Date de màj\n", - " date_maj_fr = strip(response.css(\"a.underline.red-alpha + div + div::text\").get().split(\":\")[-1])\n", - " item[\"date_maj\"] = dateparser.parse(date_maj_fr).isoformat()\n", - " \n", - " # Description\n", - " contenu_objectif_public = formation_contenu_col1.css(\".row div\").getall()\n", - " contenu_objectif_public += formation_informations_pratiques.get()\n", - " # les descriptions sont très longues et rendent difficiles le test des autres champs\n", - " # item[\"presentation_detail\"] = html_to_markdown(contenu_objectif_public)\n", - "\n", - " # Lien vers la source\n", - " item[\"lien_source\"] = response.url\n", - "\n", - " # Courriel\n", - " item[\"courriel\"] = strip(formation_inscription_contact.css('div.email.red-alpha > a::attr(href)').get()).split(\":\")[-1]\n", - "\n", - " # Adresse\n", - " clean_lieux = clean_adresse(formation_lieux_horaires.css(\"div.adresse\"))\n", - "\n", - " # Téléphone\n", - " item[\"telephone\"] = \"\"\n", - " \n", - " # Contact nom prénom\n", - " item[\"contact_nom_prenom\"] = \"\"\n", - "\n", - " # Thématiques\n", - " item[\"thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n", - " if service_nom_2 == \"Français à visée professionnelle\":\n", - " item[\"thematiques\"].append(\"apprendre-francais--accompagnement-insertion-pro\")\n", - " if service_nom_2 == \"Français à visée sociale et communicative\":\n", - " item[\"thematiques\"].append(\"apprendre-francais--communiquer-vie-tous-les-jours\")\n", - "\n", - " # Hard coded fields\n", - " item[\"zone_diffusion_type\"] = \"departement\"\n", - " item[\"zone_diffusion_code\"] = \"91\"\n", - " item[\"zone_diffusion_nom\"] = \"Essonne\"\n", - " item[\"types\"] = [\"formation\"]\n", - " item[\"cumulable\"] = True\n", - " item[\"contact_public\"] = True\n", - " item[\"modes_accueil\"] = [\"en-presentiel\"]\n", - "\n", - " \n", - " # STRUCTURE\n", - " # ID de la structure\n", - " structure_link_element = formation_entete.css(\"div.titre-element ~ a.underline.red-alpha\")\n", - " item[\"structure_id\"] = structure_link_element.xpath(\"@href\").get().split(\"/\")[-1]\n", - " if TESTING_WITH_LOCAL_FILES:\n", - " structure_link = f\"{structure_base_path}/{item['structure_id']}\"\n", - " else:\n", - " structure_link = structure_link_element.xpath(\"@href\").get()\n", - " \n", - " \n", - "\n", - " # Une ligne/record de service et une structure par lieu\n", - " service_id_suffix = 1\n", - " for lieu in clean_lieux:\n", - " # Id\n", - " item[\"id\"] = f\"{response.url.split('/')[-1]}_{str(service_id_suffix)}\"\n", - " service_id_suffix += 1\n", - " print(lieu)\n", - " item = item | lieu\n", - " yield scrapy.Request(structure_link, callback=self.parse_structure, meta={\"item\": item}, dont_filter=True)\n", - " \n", - " def parse_structure(self, response):\n", - " if TESTING_WITH_LOCAL_FILES is False:\n", - " # Downloading HTML content\n", - " page = response.url.split(\"/\")[-1]\n", - " # Path doesn't deal with file:// URIs\n", - " filepath = Path(structure_base_path[7:]) / page\n", - " filepath.write_bytes(response.body)\n", - "\n", - " item = response.meta.get(\"item\")\n", - " \n", - "\n", - " # Nom\n", - " item[\"structure_nom\"] = strip(response.css('div#structure > strong::text').get())\n", - "\n", - " # Data màj\n", - " item[\"structure_date_maj\"] = strip(response.css('div.structures-dates > div:nth-child(2)').xpath('text()').get())\n", - " item[\"structure_date_maj\"] = item[\"structure_date_maj\"].split(\" : \")[-1]\n", - " item[\"structure_date_maj\"] = dateparser.parse(item[\"structure_date_maj\"]).isoformat()\n", - "\n", - " # Adresse\n", - " # Sur le site Web, une structure a autant d'adresses qu'elle a de lieux pour ses services\n", - " # Certains services sont proposés sur toutes les adresses de la structure, certains non.\n", - "\n", - " # Téléphone\n", - " telephone = response.css('div.lieu div.telephone > a::attr(href)').get()\n", - " if type(telephone) == str:\n", - " # Les numéro de téléphone sont préfixés par tel:\n", - " telephone = telephone.strip()[4:]\n", - " else:\n", - " telephone = \"\"\n", - " item[\"structure_telephone\"] = telephone\n", - " \n", - " # Site Web\n", - " item[\"structure_site_web\"] = strip(response.css('div.lieu div.facebook::text').get())\n", - "\n", - " # Lien source\n", - " item[\"structure_lien_source\"] = response.url\n", - "\n", - " # Labels\n", - " item[\"structure_labels_autres\"] = [\"reseau-alpha\"]\n", - "\n", - " # Thématiques\n", - " item[\"structure_thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n", - "\n", - "\n", - " yield item\n", - "\n", - " \n", - "process = CrawlerProcess(settings={\n", - " \"FEEDS\": {\n", - " # Seul le JSON est utilisable dans le pipeline car le CSV imprime les listes sans square brackets ([])\n", - " # Le CSV est pratique pour tester\n", - " \"alpha.json\": {\n", - " \"format\": \"json\",\n", - " \"overwrite\": True,\n", - " \"ensure_ascii\": False,\n", - " 'encoding': 'utf8',\n", - " 'store_empty': False,\n", - " },\n", - " \"alpha.csv\": {\n", - " \"format\": \"csv\",\n", - " \"overwrite\": True,\n", - " 'encoding': 'utf8',\n", - " },\n", - " },\n", - "})\n", - "process.crawl(AlphaSpider)\n", - "process.start()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv('./alpha.csv', dtype = str, index_col=None)\n", - "df.info()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv-analyse", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/siao/analyse.ipynb b/analyse/notebooks/siao/analyse.ipynb deleted file mode 100644 index f28738b5..00000000 --- a/analyse/notebooks/siao/analyse.ipynb +++ /dev/null @@ -1,379 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "pd.options.display.max_rows = None" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import dotenv\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_excel(os.environ.get(\"SIAO_FILE_URL\"), dtype=str)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nettoyage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.replace([np.nan, \"\"], None)\n", - "df[\"Code SIRET\"] = df[\"Code SIRET\"].replace(r\"\\.0\", \"\", regex=True).replace(r\"\\D\", \"\", regex=True).replace(\"\", None).apply(lambda s: s and f\"{s:0>14}\").replace(r\"0{14}\", None, regex=True)\n", - "df[\"Code postal\"] = df[\"Code postal\"].replace(r\"\\.0\", \"\", regex=True).replace(r\"\\D\", \"\", regex=True).replace(\"\", None).apply(lambda s: s and f\"{s:0>5}\")\n", - "df[\"FINESS\"] = df[\"FINESS\"].replace(r\"\\.0\", \"\", regex=True).replace(r\"\\D\", \"\", regex=True).replace(\"\", None).apply(lambda s: s and f\"{s:0>9}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Description générale\n", - "\n", - "* orienté services\n", - "* pas de code insee --> à géocoder ou via base sirene géolocalisée\n", - "* pas mal de cleanup à faire --> est-ce qu'il est possible d'obtenir le fichier dans un autre format qu'excel ?\n", - "* pas de champ date de maj\n", - "* pas de champ pour identifier de manière unique et globale les structure sous-jacentes.\n", - "* le champ \"Nom de la structure\" est en fait à mi-chemin entre le nom du service et de la structure.\n", - "* pb: lorsque le SIRET est dupliqué, il n'y a pas de moyen fiable pour distinguer les lignes qui sont communes à une structure de celles qui sont des antennes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de lignes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de lignes avec un numéro FINESS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"FINESS\"].notna()].shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de lignes avec un numéro FINESS ou un siret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"FINESS\"].notna() | df[\"Code SIRET\"].notna()].shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de lignes avec un siret sans FINESS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[~df[\"FINESS\"].notna() & df[\"Code SIRET\"].notna()].shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de lignes avec FINESS sans siret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"FINESS\"].notna() & ~df[\"Code SIRET\"].notna()].shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Code SIRET\"].value_counts().apply(lambda c: c == 1).sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "taux de remplissage des champs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)\n", - "\n", - "compute_field_occupancy_rates(df).to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Champs pour extraire un type de structure ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Type\"].value_counts().to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"Catégorie de structure\"].value_counts().to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "structures_df = df[df[\"Code SIRET\"].isna()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df[\"Code SIRET\"].notna()][\"Code SIRET\"].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df = (\n", - " structures_df[\"Nom de la structure\"]\n", - " .str.lower()\n", - " .apply(\n", - " lambda s: {\n", - " \"asso\": \"association\" in s or \"asso\" in s.split() or \"ass\" in s.split() or \"association\" in s.split(), # ASSO\n", - " \"ccas\": \"ccas\" in s, # CCAS\n", - " \"mission_locale\": \"mission\" in s and \"locale\" in s, # ML\n", - " \"cada\": \"cada\" in s.split(), # CADA\n", - " \"chrs\": \"chrs\" in s or \"c.h.r.s\" in s, # CHRS\n", - " \"chu\": \"chu\" in s.split(), # CHU\n", - " \"cph\": \"cph\" in s.split() or \"c.p.h\" in s, # CPH\n", - " \"huda\": \"huda\" in s or \"h.u.d.a\" in s, # HUDA\n", - " \"spip\": \"spip\" in s or \"s.p.i.p\" in s or \"Pénitentiaire\" in s, # SPIP\n", - " \"pjj\": \"pjj\" in s or \"p.j.j\" in s or \"protection judiciaire\" in s, # PJJ\n", - " \"fjt\": \"fjt\" in s or \"f.j.t\" in s or (\"sidence\" in s and \"sociale\" in s) or s.startswith(\"rs \"), # RS_FJT\n", - " \"udaf\": \"udaf\" in s, # UDAF\n", - " \"plie\": \"plie\" in s.split(), # PLIE\n", - " \"centre_social\": \"centre\" in s and \"social\" in s, # CS\n", - " \"cias\": \"cias\" in s.split(), # CIAS\n", - " \"cava\": \"cava\" in s.split(), # CAVA\n", - " \"muni\": \"mairie\" in s and \"ccas\" not in s, # MUNI\n", - " \"caarud\": \"caarud\" in s or \"c.a.a.r.u.d\" in s, # CAARUD\n", - " \"pe\": s.startswith(\"pole emploi\"), # PE\n", - " },\n", - " )\n", - " .apply(pd.Series)\n", - " .assign(\n", - " na=lambda df: df.apply(\n", - " lambda row: ~row.any(), axis=\"columns\", result_type=\"expand\"\n", - " )\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df.sum().sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df.iloc[:, categories_flags_structures_df.columns != \"na\"].any(axis=\"columns\").sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df.sum().sort_values(ascending=False).plot(\n", - " kind=\"bar\", grid=True, rot=35, figsize=(20, 8)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Lien avec la base FINESS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "finess_df = pd.read_csv(\"https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c894fcfdfd45\", dtype=str)\n", - "finess_df = finess_df.replace([\"\", np.nan], None)\n", - "finess_df = finess_df[finess_df.siret.notna()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "merged_df = pd.merge(df, finess_df[[\"nofinesset\", \"siret\"]], how=\"left\", left_on=\"FINESS\", right_on=\"nofinesset\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# lignes avec un siret retrouvé grâce à la base FINESS\n", - "merged_df[merged_df.siret.notna() & merged_df[\"Code SIRET\"].isna()].shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# structures avec un siret retrouvé grâce à la base FINESS\n", - "merged_df[merged_df.siret.notna() & merged_df[\"Code SIRET\"].isna()].siret.nunique()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/soliguide/.gitignore b/analyse/notebooks/soliguide/.gitignore deleted file mode 100644 index 5d315d58..00000000 --- a/analyse/notebooks/soliguide/.gitignore +++ /dev/null @@ -1 +0,0 @@ -soliguide-places-*.json \ No newline at end of file diff --git a/analyse/notebooks/soliguide/analyse_2022.ipynb b/analyse/notebooks/soliguide/analyse_2022.ipynb deleted file mode 100644 index 8f5a4534..00000000 --- a/analyse/notebooks/soliguide/analyse_2022.ipynb +++ /dev/null @@ -1,495 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import date\n", - "from dateutil.parser import parse as dateutil_parse\n", - "import os\n", - "from pathlib import Path\n", - "import importlib\n", - "\n", - "import dotenv\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import soliguide\n", - "\n", - "importlib.reload(soliguide)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "plt.rc(\"figure\", figsize=[12, 8])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load token from dot `.env` in this notebook directory\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "\n", - "soliguide_api_client = soliguide.APIClient(\n", - " base_url=\"https://api.soliguide.fr/\",\n", - " token=os.environ[\"SOLIGUIDE_API_TOKEN\"],\n", - " user_agent=\"betaTest\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_data = soliguide_api_client.search(\n", - " location_geo_type=\"pays\", location_geo_value=\"france\"\n", - ")\n", - "\n", - "all_places_df = pd.DataFrame.from_records(data=all_places_data)\n", - "all_places_df.to_json(f\"./soliguide-places-{date.today().strftime('%Y%m%d')}.json\", orient=\"records\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nettoyage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the latest downloaded data\n", - "data_file_path = sorted(Path(\".\").glob(\"soliguide-places-*.json\"))[-1]\n", - "\n", - "all_places_df = pd.read_json(data_file_path)\n", - "all_places_df = pd.json_normalize(all_places_df.to_dict(orient=\"records\"))\n", - "all_places_df = all_places_df.set_index(\"lieu_id\")\n", - "all_places_df.createdAt = all_places_df.createdAt.apply(lambda s: dateutil_parse(s))\n", - "all_places_df.updatedAt = all_places_df.updatedAt.apply(lambda s: dateutil_parse(s))\n", - "all_places_df = all_places_df.replace([np.nan, \"\"], None)\n", - "all_places_df.sample(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nombres de lieux uniques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_df.reset_index().lieu_id.nunique()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nombre de lieux en France" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Taux de remplissage des champs de structures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_field_occupancy_rates(df):\n", - " return ((1 - df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "for _, df in compute_field_occupancy_rates(all_places_df).groupby(lambda c: c.split(\".\")[0]):\n", - " display(df.to_frame())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Distribution de la date de 1er référencement des lieux" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_df.createdAt.hist(bins=100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Distribution de la date de dernière mise-à-jour" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_df.updatedAt.hist(bins=100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Typologie\n", - "\n", - "* Pas de champs typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_places_df = (\n", - " all_places_df.name.str.lower()\n", - " .apply(\n", - " lambda s: {\n", - " \"restos_du_c\": \"restos du c\" in s,\n", - " \"epicerie\": \"epicerie\" in s,\n", - " \"caf\": \"caf\" in s,\n", - " \"ccas\": \"ccas\" in s,\n", - " \"pole_emploi\": \"pôle emploi\" in s,\n", - " \"secours_populaire\": \"secours populaire\" in s,\n", - " \"secours_catholique\": \"secours catholique\" in s,\n", - " \"mairie\": \"mairie\" in s,\n", - " \"commune\": \"commune\" in s,\n", - " \"association\": \"association\" in s,\n", - " \"mission_locale\": \"mission locale\" in s,\n", - " \"action_emploi\": \"action emploi\" in s,\n", - " \"caarud\": \"caarud\" in s,\n", - " \"croix_rouge\": \"croix\" in s and \"rouge\" in s,\n", - " \"pmi\": \"pmi\" in s,\n", - " \"*thèque\": \"médiathèque\" in s or \"bibliothèque\" in s,\n", - " \"mjd\": \"mjd\" in s,\n", - " \"france_services\": \"espace france services\" in s,\n", - " \"cidff\": \"cidff\" in s,\n", - " \"médiavipp\": \"médiavipp\" in s,\n", - " \"nouvelles_voies\": \"nouvelles voies\" in s,\n", - " \"adil\": \"adil\" in s,\n", - " \"maison_emploi\": \"maison\" in s and \"emploi\" in s,\n", - " },\n", - " )\n", - " .apply(pd.Series)\n", - " .assign(\n", - " na=lambda df: df.apply(\n", - " lambda row: ~row.any(), axis=\"columns\", result_type=\"expand\"\n", - " )\n", - " )\n", - ")\n", - "\n", - "categories_flags_places_df.sum().sort_values(ascending=False).plot(kind=\"bar\", grid=True, rot=35, figsize=(20, 8))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_places_df.drop(columns=[\"na\"]).sum().sort_values(ascending=False).plot(\n", - " kind=\"bar\", grid=True, rot=35, figsize=(20, 8)\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Répartition géographique" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_df[\"position.departement\"].value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nombre de lieux par status sur soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_places_df.statut.value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Services" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_services_df = pd.json_normalize(\n", - " all_places_df.reset_index().to_dict(orient=\"records\"),\n", - " record_path=\"services_all\",\n", - " meta=[\"lieu_id\"],\n", - ")\n", - "all_services_df = all_services_df.replace([np.nan, \"\"], None)\n", - "all_services_df.sample(5)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nombre de services" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# nombre de données services\n", - "all_services_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# nombre de services uniques\n", - "all_services_df.serviceObjectId.nunique()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nombre de services fermés" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(all_services_df[\"close.actif\"] == True).sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Nombre de services par typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# par catégories\n", - "all_services_df.categorie.apply(lambda code: soliguide.categories_by_subcategories[code]).value_counts().to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# par sous-catégories\n", - "all_services_df.categorie.apply(lambda code: soliguide.categories[code]).value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Taux de remplissage des champs services" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "for _, df in compute_field_occupancy_rates(all_services_df).groupby(lambda c: c.split(\".\")[0]):\n", - " display(df.to_frame())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Typologies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "structures_df = all_places_df[all_places_df[\"position.departement\"] == \"Essonne\"]\n", - "\n", - "categories_flags_structures_df = (\n", - " structures_df.name.str.lower()\n", - " .apply(\n", - " lambda s: {\n", - " \"restos_du_c\": \"restos du c\" in s,\n", - " \"epicerie\": \"epicerie\" in s,\n", - " \"caf\": \"caf\" in s,\n", - " \"ccas\": \"ccas\" in s,\n", - " \"pole_emploi\": \"pôle emploi\" in s,\n", - " \"secours_populaire\": \"secours populaire\" in s,\n", - " \"secours_catholique\": \"secours catholique\" in s,\n", - " \"mairie\": \"mairie\" in s,\n", - " \"commune\": \"commune\" in s,\n", - " \"association\": \"association\" in s,\n", - " \"mission_locale\": \"mission locale\" in s,\n", - " \"action_emploi\": \"action emploi\" in s,\n", - " \"caarud\": \"caarud\" in s,\n", - " \"croix_rouge\": \"croix\" in s and \"rouge\" in s,\n", - " \"pmi\": \"pmi\" in s,\n", - " \"*thèque\": \"médiathèque\" in s or \"bibliothèque\" in s,\n", - " \"mjd\": \"mjd\" in s,\n", - " \"france_services\": \"espace france services\" in s,\n", - " \"cidff\": \"cidff\" in s,\n", - " \"médiavipp\": \"médiavipp\" in s,\n", - " \"nouvelles_voies\": \"nouvelles voies\" in s,\n", - " \"adil\": \"adil\" in s,\n", - " \"maison_emploi\": \"maison\" in s and \"emploi\" in s,\n", - " },\n", - " )\n", - " .apply(pd.Series)\n", - " .assign(\n", - " na=lambda df: df.apply(\n", - " lambda row: ~row.any(), axis=\"columns\", result_type=\"expand\"\n", - " )\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "categories_flags_structures_df.sum().sort_values(ascending=False).plot(\n", - " kind=\"bar\", grid=True, rot=45, figsize=(20, 8)\n", - ")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4 (main, Apr 24 2022, 15:44:04) [GCC 11.2.0]" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/soliguide/analyse_2023.ipynb b/analyse/notebooks/soliguide/analyse_2023.ipynb deleted file mode 100644 index 40995118..00000000 --- a/analyse/notebooks/soliguide/analyse_2023.ipynb +++ /dev/null @@ -1,1148 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "349cd1a6", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import dotenv\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0da5e5c5", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -e ../../../pipeline\n", - "\n", - "from data_inclusion.scripts.tasks import soliguide\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8837967c", - "metadata": {}, - "outputs": [], - "source": [ - "client = soliguide.APIClient(\n", - " base_url=\"https://api.soliguide.fr/\",\n", - " token=os.environ[\"SOLIGUIDE_API_TOKEN\"],\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dbfef573", - "metadata": {}, - "outputs": [], - "source": [ - "data = client.search(location_geo_type=\"pays\", location_geo_value=\"france\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4658d84", - "metadata": {}, - "outputs": [], - "source": [ - "places_df = pd.json_normalize(data)\n", - "\n", - "# services_df référence places_df via `lieu_id`\n", - "services_df = pd.json_normalize(\n", - " data,\n", - " record_path=\"services_all\",\n", - " meta=[\"lieu_id\"],\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1231acc9", - "metadata": {}, - "source": [ - "# Structures" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ad676b57", - "metadata": {}, - "source": [ - "### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29663447", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4d82ed7c", - "metadata": {}, - "source": [ - "### siret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0078db9", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "17e719f7", - "metadata": {}, - "source": [ - "### rna" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad0d9fb0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "45027815", - "metadata": {}, - "source": [ - "### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "879b8001", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b3afc242", - "metadata": {}, - "source": [ - "### commune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78954f23", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4487efb5", - "metadata": {}, - "source": [ - "### code_postal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e8ef05e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7d891bc9", - "metadata": {}, - "source": [ - "### code_insee" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ec21785", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "e059679e", - "metadata": {}, - "source": [ - "### adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b7f9116", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b1485e71", - "metadata": {}, - "source": [ - "### complement_adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "281ba836", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "053df703", - "metadata": {}, - "source": [ - "### longitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01b6a293", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f60c633d", - "metadata": {}, - "source": [ - "### latitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88874b83", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "93224075", - "metadata": {}, - "source": [ - "### typologie" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd08ae88", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ab100d55", - "metadata": {}, - "source": [ - "### telephone" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e6e7a7b", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "dbd0baf6", - "metadata": {}, - "source": [ - "### courriel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "292ecc06", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "357b274a", - "metadata": {}, - "source": [ - "### site_web" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7eec32c", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5f622a80", - "metadata": {}, - "source": [ - "### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5954a51", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6ba291fc", - "metadata": {}, - "source": [ - "### presentation_detail" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d08d5dbc", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "a17c551a", - "metadata": {}, - "source": [ - "### source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59ee9e7c", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "167fb885", - "metadata": {}, - "source": [ - "### date_maj" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39aa34b5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b6dbdd42", - "metadata": {}, - "source": [ - "### antenne" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5255608", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "efc9232c", - "metadata": {}, - "source": [ - "### lien_source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16422b67", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "0758e60e", - "metadata": {}, - "source": [ - "### horaires_ouverture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1b42eb5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "14c3c508", - "metadata": {}, - "source": [ - "### accessibilite" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4f8d8f7", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5484ca3f", - "metadata": {}, - "source": [ - "### labels_nationaux" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf995c20", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "bdf4b51b", - "metadata": {}, - "source": [ - "### labels_autres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31588016", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7a93c501", - "metadata": {}, - "source": [ - "### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b0f57cd", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c6912fec", - "metadata": {}, - "source": [ - "# Services" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4d203fab", - "metadata": {}, - "source": [ - "### id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c7ee1cf", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "83294e17", - "metadata": {}, - "source": [ - "### structure_id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24e651da", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "84c93acc", - "metadata": {}, - "source": [ - "### source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45a564e4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3b790afb", - "metadata": {}, - "source": [ - "### nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e9e621e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1c01f09a", - "metadata": {}, - "source": [ - "### presentation_resume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93d1c7d8", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6146afe6", - "metadata": {}, - "source": [ - "### presentation_detail" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "806dde2e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ce514ab4", - "metadata": {}, - "source": [ - "### types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e5ec115", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "9eb0a331", - "metadata": {}, - "source": [ - "### thematiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "161f5c20", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8c249d2f", - "metadata": {}, - "source": [ - "### prise_rdv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7aaa2394", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8e27b79e", - "metadata": {}, - "source": [ - "### frais" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8bf47c3d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ddd1fb66", - "metadata": {}, - "source": [ - "### frais_autres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "795d6b37", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f4cf1345", - "metadata": {}, - "source": [ - "### profils" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dc49371", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1255dd9d", - "metadata": {}, - "source": [ - "### pre_requis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab8d182a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "05d23459", - "metadata": {}, - "source": [ - "### cumulable" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "345ff262", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8a85c67f", - "metadata": {}, - "source": [ - "### justificatifs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00227de5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "0a574738", - "metadata": {}, - "source": [ - "### formulaire_en_ligne" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9b7f206", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b1794b72", - "metadata": {}, - "source": [ - "### commune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15c12f01", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "dd0a49a2", - "metadata": {}, - "source": [ - "### code_postal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4907513c", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "57abdce1", - "metadata": {}, - "source": [ - "### code_insee" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aecc9537", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b0fcdb2b", - "metadata": {}, - "source": [ - "### adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6723594", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "32a35bc2", - "metadata": {}, - "source": [ - "### complement_adresse" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02026e94", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ff3de81c", - "metadata": {}, - "source": [ - "### longitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bc258fa", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1dee2805", - "metadata": {}, - "source": [ - "### latitude" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f230af1", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "99368eba", - "metadata": {}, - "source": [ - "### recurrence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3111b7b0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ac3f1dba", - "metadata": {}, - "source": [ - "### date_creation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2604ce3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "bfe85521", - "metadata": {}, - "source": [ - "### date_suspension" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae0ddac2", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "380b6ed9", - "metadata": {}, - "source": [ - "### lien_source" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4646843", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3e6b2a75", - "metadata": {}, - "source": [ - "### telephone" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f4cde1e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "56572d7d", - "metadata": {}, - "source": [ - "### courriel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "479e9c47", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "167fe689", - "metadata": {}, - "source": [ - "### contact_public" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ae52297", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3e2d4c55", - "metadata": {}, - "source": [ - "### date_maj" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "577ee35a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1b4feeb8", - "metadata": {}, - "source": [ - "### modes_accueil" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d7649c1", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6bcfa2c7", - "metadata": {}, - "source": [ - "### zone_diffusion_type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa43b661", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "bd815c1f", - "metadata": {}, - "source": [ - "### zone_diffusion_code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16e8bbf5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "76d72149", - "metadata": {}, - "source": [ - "### zone_diffusion_nom" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4f8de89", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyse/notebooks/soliguide/croisement_siae_emploi.ipynb b/analyse/notebooks/soliguide/croisement_siae_emploi.ipynb deleted file mode 100644 index 1af73fb6..00000000 --- a/analyse/notebooks/soliguide/croisement_siae_emploi.ipynb +++ /dev/null @@ -1,370 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import dotenv\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import os\n", - "import pandas as pd\n", - "import dotenv\n", - "from tqdm.auto import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "plt.rc(\"figure\", figsize=[12, 8])\n", - "tqdm.pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[lien](https://apisolidarite.soliguide.fr/Documentation-technique-de-l-API-Solidarit-ecaf8198f0e9400d93140b8043c9f2ce) vers la documentation api soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_raw_df = pd.read_json(\"soliguide-places-20220721.json\")\n", - "soliguide_raw_df = soliguide_raw_df.replace([\"\", np.nan], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_services_df = pd.json_normalize(\n", - " soliguide_raw_df.to_dict(orient=\"records\"),\n", - " record_path=\"services_all\",\n", - " meta=\"lieu_id\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Récupération des siaes en passant les services catégorisés avec le code 205 (\"Insertion par l'activité économique\")\n", - "soliguide_siaes_df = soliguide_raw_df[\n", - " soliguide_raw_df.lieu_id.isin(\n", - " soliguide_services_df[soliguide_services_df.categorie == 205].lieu_id.to_list()\n", - " )\n", - "]\n", - "\n", - "soliguide_siaes_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.sources import soliguide\n", - "\n", - "soliguide_siaes_df = soliguide.transform_dataframe(soliguide_siaes_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks import geocoding\n", - "\n", - "soliguide_siaes_df = geocoding.geocode_normalized_dataframe(\n", - " soliguide_siaes_df,\n", - " geocoding.BaseAdresseNationaleBackend(base_url=\"https://api-adresse.data.gouv.fr/\"),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Siretisation automatique des données SIAEs Soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.siretisation import siretisation\n", - "\n", - "soliguide_siaes_df = siretisation.siretize_normalized_dataframe(\n", - " soliguide_siaes_df,\n", - " os.environ[\"SIRENE_DATABASE_URL\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_siaes_df.siret.notna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_siaes_df= soliguide_siaes_df[\n", - " ~soliguide_siaes_df.siret.isin(\n", - " [\n", - " \"49929393400018\",\n", - " \"79493735900014\",\n", - " \"49825896100024\",\n", - " \"82068992500015\",\n", - " \"39900733500017\",\n", - " \"33958232200014\",\n", - " \"85154273800014\",\n", - " \"90840665500013\",\n", - " ]\n", - " )\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### chargement des données siaes des emplois de l'inclusion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.sources import itou\n", - "\n", - "client = itou.ItouClient(\n", - " url=\"https://emplois.inclusion.beta.gouv.fr/api/v1/structures/?type=siae\",\n", - " token=os.environ[\"ITOU_API_TOKEN\"],\n", - ")\n", - "\n", - "itou_siaes_df = pd.DataFrame(client.list_structures()).replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks import geocoding\n", - "\n", - "itou_siaes_df = geocoding.geocode_normalized_dataframe(\n", - " itou_siaes_df,\n", - " geocoding.BaseAdresseNationaleBackend(base_url=\"https://api-adresse.data.gouv.fr/\"),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itou_siaes_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### croisement partiel\n", - "\n", - "* sur la base des données siretisées automatiquement" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df = pd.merge(\n", - " soliguide_siaes_df.add_prefix(\"soliguide_\"),\n", - " itou_siaes_df.drop_duplicates([\"siret\"]).add_prefix(\"itou_\"),\n", - " left_on=\"soliguide_siret\",\n", - " right_on=\"itou_siret\",\n", - ")\n", - "\n", - "shared_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df[[\"soliguide_siret\", \"soliguide_id\", \"itou_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df[[\"soliguide_siret\", \"soliguide_date_maj\", \"itou_date_maj\"]].assign(\n", - " latest=shared_df.apply(\n", - " lambda row: \"soliguide\"\n", - " if row[\"soliguide_date_maj\"] > row[\"itou_date_maj\"]\n", - " else \"itou\",\n", - " axis=1,\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Departements communs ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df.soliguide_code_insee.str[:2].value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### les SIAEs dans les emplois sur les territoires de Soliguide\n", - "\n", - "* Territoires de Soliguide : départements où soliguide a au moins 50 structures\n", - "* on exclut les SIAEs déjà dans les données de Soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.sources import soliguide\n", - "from data_inclusion.scripts.tasks import geocoding\n", - "\n", - "soliguide_structures_df = soliguide.transform_dataframe(soliguide_raw_df)\n", - "soliguide_structures_df = geocoding.geocode_normalized_dataframe(\n", - " soliguide_structures_df,\n", - " geocoding.BaseAdresseNationaleBackend(base_url=\"https://api-adresse.data.gouv.fr/\"),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "count_by_cog_dept = soliguide_structures_df.code_insee.str[:2].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de siaes des emplois de l'inclusion sur les territoires de soliguide ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itou_siaes_of_interest_df = itou_siaes_df[itou_siaes_df.code_insee.str[:2].isin(count_by_cog_dept[count_by_cog_dept > 50].index.to_list())]\n", - "itou_siaes_of_interest_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de siaes des emplois de l'inclusion inconnues par soliguide sur ses territoires ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itou_siaes_of_interest_df = itou_siaes_of_interest_df[~itou_siaes_of_interest_df.siret.isin(soliguide_siaes_df.siret)]\n", - "itou_siaes_of_interest_df.shape[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4 (main, Apr 24 2022, 15:44:04) [GCC 11.2.0]" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/soliguide/siae.ipynb b/analyse/notebooks/soliguide/siae.ipynb deleted file mode 100644 index 73933f4f..00000000 --- a/analyse/notebooks/soliguide/siae.ipynb +++ /dev/null @@ -1,403 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import dotenv\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import os\n", - "import pandas as pd\n", - "import dotenv\n", - "from tqdm.auto import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dotenv.load_dotenv(dotenv.find_dotenv())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[lien](https://apisolidarite.soliguide.fr/Documentation-technique-de-l-API-Solidarit-ecaf8198f0e9400d93140b8043c9f2ce) vers la documentation api soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_rows = None\n", - "plt.rc(\"figure\", figsize=[12, 8])\n", - "tqdm.pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### chargement des données soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_raw_df = pd.read_json(\"soliguide-places-20220721.json\")\n", - "soliguide_raw_df = soliguide_raw_df.replace([\"\", np.nan], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_services_df = pd.json_normalize(\n", - " soliguide_raw_df.to_dict(orient=\"records\"),\n", - " record_path=\"services_all\",\n", - " meta=\"lieu_id\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Trouver les SIAE correspondants à ces SIRET" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Récupération des siaes en passant les services catégorisés avec le code 205 (\"Insertion par l'activité économique\")\n", - "soliguide_siaes_df = soliguide_raw_df[\n", - " soliguide_raw_df.lieu_id.isin(\n", - " soliguide_services_df[soliguide_services_df.categorie == 205].lieu_id.to_list()\n", - " )\n", - "]\n", - "\n", - "soliguide_siaes_df.shape[0]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.sources import soliguide\n", - "\n", - "soliguide_siaes_df = soliguide.transform_dataframe(soliguide_siaes_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks import geocoding\n", - "\n", - "soliguide_siaes_df = geocoding.geocode_normalized_dataframe(\n", - " soliguide_siaes_df,\n", - " geocoding.BaseAdresseNationaleBackend(base_url=\"https://api-adresse.data.gouv.fr/\"),\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Trouver les SIRET automatique pour les données Soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.siretisation import siretisation\n", - "\n", - "soliguide_siaes_df = siretisation.siretize_normalized_dataframe(\n", - " soliguide_siaes_df,\n", - " os.environ[\"SIRENE_DATABASE_URL\"],\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_siaes_df.siret.notna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "8 erreurs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# exclude errors\n", - "soliguide_siaes_df= soliguide_siaes_df[\n", - " ~soliguide_siaes_df.siret.isin(\n", - " [\n", - " \"49929393400018\",\n", - " \"79493735900014\",\n", - " \"49825896100024\",\n", - " \"82068992500015\",\n", - " \"39900733500017\",\n", - " \"33958232200014\",\n", - " \"85154273800014\",\n", - " \"90840665500013\",\n", - " ]\n", - " )\n", - "]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### chargement des données siaes des emplois de l'inclusion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.sources import itou\n", - "\n", - "client = itou.ItouClient(\n", - " url=\"https://emplois.inclusion.beta.gouv.fr/api/v1/structures/?type=siae\",\n", - " token=os.environ[\"ITOU_API_TOKEN\"],\n", - ")\n", - "\n", - "itou_siaes_df = pd.DataFrame(client.list_structures()).replace([np.nan, \"\"], None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks import geocoding\n", - "\n", - "itou_siaes_df = geocoding.geocode_normalized_dataframe(\n", - " itou_siaes_df,\n", - " geocoding.BaseAdresseNationaleBackend(base_url=\"https://api-adresse.data.gouv.fr/\"),\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itou_siaes_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. croisement emplois de l'inclusion / soliguide sur la base des sirets retrouvés automatiquement" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df = pd.merge(\n", - " soliguide_siaes_df.add_prefix(\"soliguide_\"),\n", - " itou_siaes_df.drop_duplicates([\"siret\"]).add_prefix(\"itou_\"),\n", - " left_on=\"soliguide_siret\",\n", - " right_on=\"itou_siret\",\n", - ")\n", - "\n", - "shared_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df[[\"soliguide_siret\", \"soliguide_id\", \"itou_id\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df[[\"soliguide_siret\", \"soliguide_date_maj\", \"itou_date_maj\"]].assign(\n", - " latest=shared_df.apply(\n", - " lambda row: \"soliguide\"\n", - " if row[\"soliguide_date_maj\"] > row[\"itou_date_maj\"]\n", - " else \"itou\",\n", - " axis=1,\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Departements communs ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_df.soliguide_code_insee.str[:2].value_counts().to_frame()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Trouver tous les SIAE dans les emplois sur les territoires de Soliguide" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Territoires de Soliguide : départements où soliguide a au moins 50 structures\n", - "* on exclut les SIAEs déjà dans les données de Soliguide" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from data_inclusion.scripts.tasks.sources import soliguide\n", - "from data_inclusion.scripts.tasks import geocoding\n", - "\n", - "soliguide_structures_df = soliguide.transform_dataframe(soliguide_raw_df)\n", - "soliguide_structures_df = geocoding.geocode_normalized_dataframe(\n", - " soliguide_structures_df,\n", - " geocoding.BaseAdresseNationaleBackend(base_url=\"https://api-adresse.data.gouv.fr/\"),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "count_by_cog_dept = soliguide_structures_df.code_insee.str[:2].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de siaes des emplois de l'inclusion sur les territoires de soliguide ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itou_siaes_of_interest_df = itou_siaes_df[itou_siaes_df.code_insee.str[:2].isin(count_by_cog_dept[count_by_cog_dept > 50].index.to_list())]\n", - "itou_siaes_of_interest_df.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "nb de siaes des emplois de l'inclusion inconnues par soliguide sur ses territoires ?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itou_siaes_of_interest_df = itou_siaes_of_interest_df[~itou_siaes_of_interest_df.siret.isin(soliguide_siaes_df.siret)]\n", - "itou_siaes_of_interest_df.shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "soliguide_siaes_df.shape[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/notebooks/template.ipynb b/analyse/notebooks/template.ipynb deleted file mode 100644 index fa75db42..00000000 --- a/analyse/notebooks/template.ipynb +++ /dev/null @@ -1,136 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook can be used to generate a notebook for data analysis." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "import nbformat" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "notebook = nbformat.v4.new_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -e ../../../data-inclusion-schema\n", - "\n", - "from data_inclusion.schema.models import Structure, Service\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TARGET_PATH = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "notebook[\"cells\"] = []\n", - "\n", - "notebook[\"cells\"] += [\n", - " nbformat.v4.new_code_cell(\n", - " \"\"\"\\\n", - "import os\n", - "\n", - "import dotenv\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "dotenv.load_dotenv(dotenv.find_dotenv())\n", - "\"\"\"\n", - " ),\n", - " nbformat.v4.new_code_cell(\n", - " \"\"\"\\\n", - "# raw_df = ...\n", - " \"\"\"\n", - " ),\n", - "]\n", - "notebook[\"cells\"] += [nbformat.v4.new_markdown_cell(\"# Structures\")]\n", - "notebook[\"cells\"] += [\n", - " cell\n", - " for field_name in Structure.__fields__\n", - " for cell in [\n", - " nbformat.v4.new_markdown_cell(f\"### {field_name}\"),\n", - " nbformat.v4.new_code_cell(),\n", - " ]\n", - "]\n", - "notebook[\"cells\"] += [nbformat.v4.new_markdown_cell(\"# Services\")]\n", - "notebook[\"cells\"] += [\n", - " cell\n", - " for field_name in Service.__fields__\n", - " for cell in [\n", - " nbformat.v4.new_markdown_cell(f\"### {field_name}\"),\n", - " nbformat.v4.new_code_cell(),\n", - " ]\n", - "]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target_path = Path(TARGET_PATH)\n", - "target_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - "nbformat.write(notebook, target_path)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.4 ('.venv': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - }, - "vscode": { - "interpreter": { - "hash": "5c59c3774541e2228ee548c093b471ded1573b3beb617fa2a9d607b090635324" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyse/pyproject.toml b/analyse/pyproject.toml index ffb6f1e6..08f3bf7f 100644 --- a/analyse/pyproject.toml +++ b/analyse/pyproject.toml @@ -2,20 +2,23 @@ build-backend = "setuptools.build_meta" requires = ["setuptools", "wheel"] -[tool.isort] -atomic = true -combine_star = true -ensure_newline_before_comments = true -force_grid_wrap = 0 -include_trailing_comma = true -indent = 4 -known_first_party = "data_inclusion" -line_length = 88 -multi_line_output = 3 -profile = "black" -sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" -skip_gitignore = true -use_parentheses = true +[project] +name = "data-inclusion-analyse" +dependencies = [ + "notebook~=7.2", +] -[tool.black] -target-version = ["py311"] +[tool.ruff.lint] +# see prefixes in https://beta.ruff.rs/docs/rules/ +select = [ + "F", # pyflakes + "E", # pycodestyle errors + "W", # pycodestyle warnings + "I", # isort + "UP", # pyupgrade +] + +[tool.ruff.lint.isort] +section-order = ["future","standard-library","third-party","first-party","local-folder"] +combine-as-imports = true +known-first-party = ["data_inclusion"] diff --git a/analyse/requirements/requirements.txt b/analyse/requirements/requirements.txt new file mode 100644 index 00000000..a88164f2 --- /dev/null +++ b/analyse/requirements/requirements.txt @@ -0,0 +1,279 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --output-file=requirements/requirements.txt +anyio==4.4.0 + # via + # httpx + # jupyter-server +argon2-cffi==23.1.0 + # via jupyter-server +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +arrow==1.3.0 + # via isoduration +asttokens==2.4.1 + # via stack-data +async-lru==2.0.4 + # via jupyterlab +attrs==24.2.0 + # via + # jsonschema + # referencing +babel==2.16.0 + # via jupyterlab-server +beautifulsoup4==4.12.3 + # via nbconvert +bleach==6.1.0 + # via nbconvert +certifi==2024.7.4 + # via + # httpcore + # httpx + # requests +cffi==1.17.0 + # via argon2-cffi-bindings +charset-normalizer==3.3.2 + # via requests +comm==0.2.2 + # via ipykernel +debugpy==1.8.5 + # via ipykernel +decorator==5.1.1 + # via ipython +defusedxml==0.7.1 + # via nbconvert +executing==2.0.1 + # via stack-data +fastjsonschema==2.20.0 + # via nbformat +fqdn==1.5.1 + # via jsonschema +h11==0.14.0 + # via httpcore +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via jupyterlab +idna==3.7 + # via + # anyio + # httpx + # jsonschema + # requests +ipykernel==6.29.5 + # via jupyterlab +ipython==8.26.0 + # via ipykernel +isoduration==20.11.0 + # via jsonschema +jedi==0.19.1 + # via ipython +jinja2==3.1.4 + # via + # jupyter-server + # jupyterlab + # jupyterlab-server + # nbconvert +json5==0.9.25 + # via jupyterlab-server +jsonpointer==3.0.0 + # via jsonschema +jsonschema==4.23.0 + # via + # jupyter-events + # jupyterlab-server + # nbformat +jsonschema-specifications==2023.12.1 + # via jsonschema +jupyter-client==8.6.2 + # via + # ipykernel + # jupyter-server + # nbclient +jupyter-core==5.7.2 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # nbclient + # nbconvert + # nbformat +jupyter-events==0.10.0 + # via jupyter-server +jupyter-lsp==2.2.5 + # via jupyterlab +jupyter-server==2.14.2 + # via + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # notebook + # notebook-shim +jupyter-server-terminals==0.5.3 + # via jupyter-server +jupyterlab==4.2.4 + # via notebook +jupyterlab-pygments==0.3.0 + # via nbconvert +jupyterlab-server==2.27.3 + # via + # jupyterlab + # notebook +markupsafe==2.1.5 + # via + # jinja2 + # nbconvert +matplotlib-inline==0.1.7 + # via + # ipykernel + # ipython +mistune==3.0.2 + # via nbconvert +nbclient==0.10.0 + # via nbconvert +nbconvert==7.16.4 + # via jupyter-server +nbformat==5.10.4 + # via + # jupyter-server + # nbclient + # nbconvert +nest-asyncio==1.6.0 + # via ipykernel +notebook==7.2.1 + # via data-inclusion-analyse (pyproject.toml) +notebook-shim==0.2.4 + # via + # jupyterlab + # notebook +overrides==7.7.0 + # via jupyter-server +packaging==24.1 + # via + # ipykernel + # jupyter-server + # jupyterlab + # jupyterlab-server + # nbconvert +pandocfilters==1.5.1 + # via nbconvert +parso==0.8.4 + # via jedi +pexpect==4.9.0 + # via ipython +platformdirs==4.2.2 + # via jupyter-core +prometheus-client==0.20.0 + # via jupyter-server +prompt-toolkit==3.0.47 + # via ipython +psutil==6.0.0 + # via ipykernel +ptyprocess==0.7.0 + # via + # pexpect + # terminado +pure-eval==0.2.3 + # via stack-data +pycparser==2.22 + # via cffi +pygments==2.18.0 + # via + # ipython + # nbconvert +python-dateutil==2.9.0.post0 + # via + # arrow + # jupyter-client +python-json-logger==2.0.7 + # via jupyter-events +pyyaml==6.0.2 + # via jupyter-events +pyzmq==26.1.1 + # via + # ipykernel + # jupyter-client + # jupyter-server +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +requests==2.32.3 + # via jupyterlab-server +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rpds-py==0.20.0 + # via + # jsonschema + # referencing +send2trash==1.8.3 + # via jupyter-server +setuptools==72.2.0 + # via jupyterlab +six==1.16.0 + # via + # asttokens + # bleach + # python-dateutil + # rfc3339-validator +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +stack-data==0.6.3 + # via ipython +terminado==0.18.1 + # via + # jupyter-server + # jupyter-server-terminals +tinycss2==1.3.0 + # via nbconvert +tornado==6.4.1 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # notebook + # terminado +traitlets==5.14.3 + # via + # comm + # ipykernel + # ipython + # jupyter-client + # jupyter-core + # jupyter-events + # jupyter-server + # jupyterlab + # matplotlib-inline + # nbclient + # nbconvert + # nbformat +types-python-dateutil==2.9.0.20240316 + # via arrow +typing-extensions==4.12.2 + # via ipython +uri-template==1.3.0 + # via jsonschema +urllib3==2.2.2 + # via requests +wcwidth==0.2.13 + # via prompt-toolkit +webcolors==24.8.0 + # via jsonschema +webencodings==0.5.1 + # via + # bleach + # tinycss2 +websocket-client==1.8.0 + # via jupyter-server