diff --git a/deduplication/main.ipynb b/deduplication/main.ipynb index 6b655352..b8334565 100644 --- a/deduplication/main.ipynb +++ b/deduplication/main.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,8 @@ "from dedupe._typing import TrainingData\n", "from sklearn import metrics\n", "from sklearn.model_selection import train_test_split\n", - "from unidecode import unidecode" + "from unidecode import unidecode\n", + "from IPython.display import clear_output" ] }, { @@ -361,718 +362,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : norauto lievin\n", - "commune : Liévin\n", - "adresse : Centre commercial Carrefour, rue Bernard Chochoy, ZAL N°1 \n", - "location : [50.4228499, 2.773164]\n", - "code_postal : 62800\n", - "code_insee : 62510\n", - "siren : 480470152\n", - "siret : 48047015202984\n", - "telephone : None\n", - "courriel : contact@manaara.fr\n", - "source : mes-aides\n", - "date_maj : 02/14/2023\n", - "\n", - "nom : norauto lens 2\n", - "commune : Vendin-le-Vieil\n", - "adresse : Centre Commercial Lens 2\n", - "location : [50.459469, 2.828761]\n", - "code_postal : 62880\n", - "code_insee : 62842\n", - "siren : 480470152\n", - "siret : 48047015202984\n", - "telephone : None\n", - "courriel : contact@manaara.fr\n", - "source : mes-aides\n", - "date_maj : 02/14/2023\n", - "\n", - "70/10 positive, 117/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished\n", - "nom : france services herault mediterranee - agde qpv\n", - "commune : Agde\n", - "adresse : 36 Rue Jean Jacques Rousseau\n", - "location : [43.311225, 3.471006]\n", - "code_postal : 34300\n", - "code_insee : 34003\n", - "siren : None\n", - "siret : None\n", - "telephone : 04 67 01 07 93\n", - "courriel : fs.itinerante@agglohm.net\n", - "source : mediation-numerique\n", - "date_maj : 07/01/2022\n", - "\n", - "nom : france services herault mediterranee itinerante - agde centre\n", - "commune : Agde\n", - "adresse : 36 Rue Jean Jacques Rousseau\n", - "location : [43.311225, 3.471006]\n", - "code_postal : 34300\n", - "code_insee : 34003\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : fs.itinerante@agglohm.net\n", - "source : soliguide\n", - "date_maj : 07/08/2024\n", - "\n", - "70/10 positive, 118/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : ccas saint-jean-de-luz\n", - "commune : Saint-Jean-de-Luz\n", - "adresse : 1 Rue Augustin Chaho\n", - "location : [43.38716, -1.660637]\n", - "code_postal : 64500\n", - "code_insee : 64483\n", - "siren : 266404623\n", - "siret : 26640462300032\n", - "telephone : None\n", - "courriel : None\n", - "source : mediation-numerique\n", - "date_maj : 01/01/1970\n", - "\n", - "nom : centre communal d'action sociale\n", - "commune : Saint-Jean-de-Luz\n", - "adresse : 1 Rue Augustin Chaho\n", - "location : [43.38716, -1.660637]\n", - "code_postal : 64500\n", - "code_insee : 64483\n", - "siren : 266404623\n", - "siret : 26640462300032\n", - "telephone : None\n", - "courriel : None\n", - "source : emplois-de-linclusion\n", - "date_maj : 08/18/2021\n", - "\n", - "71/10 positive, 118/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : mosaica centre social\n", - "commune : Saint-Michel-de-Maurienne\n", - "adresse : 36 Rue du général ferrié\n", - "location : [45.217163, 6.472516]\n", - "code_postal : 73140\n", - "code_insee : 73261\n", - "siren : 325931640\n", - "siret : 32593164000065\n", - "telephone : 04 79 56 66 09\n", - "courriel : contact@centresocialmosaica.fr\n", - "source : mediation-numerique\n", - "date_maj : 09/24/2024\n", - "\n", - "nom : france services saint michel de maurienne\n", - "commune : Saint-Michel-de-Maurienne\n", - "adresse : 36 Rue du général ferrié\n", - "location : [45.217163, 6.472516]\n", - "code_postal : 73140\n", - "code_insee : 73261\n", - "siren : None\n", - "siret : None\n", - "telephone : 04 79 56 66 09\n", - "courriel : contact@centresocialmosaica.fr\n", - "source : mediation-numerique\n", - "date_maj : 01/01/2020\n", - "\n", - "72/10 positive, 118/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : pie de villepinte\n", - "commune : Villepinte\n", - "adresse : 5 Rue Pierre Audat\n", - "location : [48.960173, 2.54637]\n", - "code_postal : 93420\n", - "code_insee : 93078\n", - "siren : 219300787\n", - "siret : 21930078700013\n", - "telephone : 01 41 52 53 38\n", - "courriel : service-emploi@ville-villepinte.fr\n", - "source : emplois-de-linclusion\n", - "date_maj : 08/18/2021\n", - "\n", - "nom : commune de sevran\n", - "commune : Sevran\n", - "adresse : None\n", - "location : [48.93797, 2.531288]\n", - "code_postal : 93270\n", - "code_insee : 93071\n", - "siren : 219300712\n", - "siret : 21930071200524\n", - "telephone : 01 41 52 44 71\n", - "courriel : pij@ville-sevran.fr\n", - "source : mes-aides\n", - "date_maj : 08/03/2024\n", - "\n", - "73/10 positive, 118/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : la sasson\n", - "commune : Chambéry\n", - "adresse : None\n", - "location : [45.583223, 5.909299]\n", - "code_postal : 73000\n", - "code_insee : 73065\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : direction@la-sasson.com\n", - "source : soliguide\n", - "date_maj : 11/19/2024\n", - "\n", - "nom : udaf union dep assoc familiale\n", - "commune : Chambéry\n", - "adresse : 28 Place du forum\n", - "location : [45.592493, 5.91937]\n", - "code_postal : 73000\n", - "code_insee : 73065\n", - "siren : 776467086\n", - "siret : 77646708600042\n", - "telephone : None\n", - "courriel : None\n", - "source : emplois-de-linclusion\n", - "date_maj : 04/27/2023\n", - "\n", - "73/10 positive, 119/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : communaute de communes val'aigo\n", - "commune : Villemur-sur-Tarn\n", - "adresse : 2 Avenue Saint-exupéry\n", - "location : [43.860995, 1.500364]\n", - "code_postal : 31340\n", - "code_insee : 31584\n", - "siren : 243100773\n", - "siret : 24310077300064\n", - "telephone : None\n", - "courriel : None\n", - "source : emplois-de-linclusion\n", - "date_maj : 08/18/2021\n", - "\n", - "nom : france services val'aigo - villemur-sur-tarn\n", - "commune : Villemur-sur-Tarn\n", - "adresse : 2 Avenue Saint-exupéry\n", - "location : [43.860995, 1.500364]\n", - "code_postal : 31340\n", - "code_insee : 31584\n", - "siren : None\n", - "siret : None\n", - "telephone : 05 61 09 91 38\n", - "courriel : contact@valaigo.fr\n", - "source : mediation-numerique\n", - "date_maj : 01/01/2020\n", - "\n", - "73/10 positive, 120/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : cdafal 68\n", - "commune : Mulhouse\n", - "adresse : 3 Rue Georges Risler\n", - "location : [47.761128, 7.33988]\n", - "code_postal : 68100\n", - "code_insee : 68224\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : cdafal68@hotmail.com\n", - "source : mediation-numerique\n", - "date_maj : 08/20/2024\n", - "\n", - "nom : france travail (ex-pole emploi) - agence mulhouse drouot\n", - "commune : Mulhouse\n", - "adresse : 40 Rue du 57ème Régiment de Transmissions\n", - "location : [47.762153, 7.358047]\n", - "code_postal : 68100\n", - "code_insee : 68224\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 07/16/2024\n", - "\n", - "73/10 positive, 120/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : centre de soins, d'accompagnement et de prevention en addictologie (csapa) - le sept - aubagne\n", - "commune : Aubagne\n", - "adresse : 7 Avenue Joseph Fallen\n", - "location : [43.293355, 5.565435]\n", - "code_postal : 13400\n", - "code_insee : 13005\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : lesept@ad-med.fr\n", - "source : soliguide\n", - "date_maj : 07/15/2024\n", - "\n", - "nom : csapa le sept\n", - "commune : Aubagne\n", - "adresse : 7 Avenue Joseph Fallen\n", - "location : [43.293355, 5.565435]\n", - "code_postal : 13400\n", - "code_insee : 13005\n", - "siren : 331365239\n", - "siret : 33136523900093\n", - "telephone : None\n", - "courriel : None\n", - "source : mediation-numerique\n", - "date_maj : 01/01/1970\n", - "\n", - "73/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : permanence cidff - massy - maison de l'emploi et de la formation (meif)\n", - "commune : Massy\n", - "adresse : 10 Avenue du Noyer Lambert\n", - "location : [48.732167, 2.292395]\n", - "code_postal : 91300\n", - "code_insee : 91377\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 07/03/2024\n", - "\n", - "nom : point accueil ecoute jeunes (paej) - massy\n", - "commune : Massy\n", - "adresse : 10 Avenue du Noyer Lambert\n", - "location : [48.732167, 2.292395]\n", - "code_postal : 91300\n", - "code_insee : 91377\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : paej@apaso.fr\n", - "source : soliguide\n", - "date_maj : 10/22/2024\n", - "\n", - "74/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : norauto lens ii\n", - "commune : Vendin-le-Vieil\n", - "adresse : Centre Commercial Lens 2\n", - "location : [50.459469, 2.828761]\n", - "code_postal : 62880\n", - "code_insee : 62842\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 07/09/2024\n", - "\n", - "nom : norauto lens 2\n", - "commune : Vendin-le-Vieil\n", - "adresse : Centre Commercial Lens 2\n", - "location : [50.459469, 2.828761]\n", - "code_postal : 62880\n", - "code_insee : 62842\n", - "siren : 480470152\n", - "siret : 48047015202984\n", - "telephone : None\n", - "courriel : contact@manaara.fr\n", - "source : mes-aides\n", - "date_maj : 02/14/2023\n", - "\n", - "74/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : e2c charente et poitou - chatellerault\n", - "commune : Châtellerault\n", - "adresse : 209 Grand Rue de Chateauneuf\n", - "location : [46.812846, 0.531864]\n", - "code_postal : 86100\n", - "code_insee : 86066\n", - "siren : 483888988\n", - "siret : 48388898800027\n", - "telephone : 05 49 93 87 79\n", - "courriel : contact@e2c-charentepoitou.fr\n", - "source : dora\n", - "date_maj : 07/01/2024\n", - "\n", - "nom : ecole de la 2e chance - site de chatellerault\n", - "commune : Châtellerault\n", - "adresse : 209 Grand Rue de Chateauneuf\n", - "location : [46.812846, 0.531864]\n", - "code_postal : 86100\n", - "code_insee : 86066\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : fzeni@e2c-charentepoitou.fr\n", - "source : soliguide\n", - "date_maj : 11/08/2024\n", - "\n", - "75/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : france travail - rosny-sous-bois\n", - "commune : Rosny-sous-Bois\n", - "adresse : 6 Rue de Rome\n", - "location : [48.875649, 2.475459]\n", - "code_postal : 93110\n", - "code_insee : 93064\n", - "siren : 130005481\n", - "siret : 13000548121057\n", - "telephone : 3949\n", - "courriel : ape.93104@francetravail.fr\n", - "source : emplois-de-linclusion\n", - "date_maj : 06/25/2020\n", - "\n", - "nom : france travail (ex-pole emploi) - rosny-sous-bois\n", - "commune : Rosny-sous-Bois\n", - "adresse : 6 Rue de Rome\n", - "location : [48.875649, 2.475459]\n", - "code_postal : 93110\n", - "code_insee : 93064\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 11/15/2024\n", - "\n", - "76/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : commune de louvres\n", - "commune : Louvres\n", - "adresse : 84 Rue de Paris\n", - "location : [49.045179, 2.509429]\n", - "code_postal : 95380\n", - "code_insee : 95351\n", - "siren : 219503513\n", - "siret : 21950351300018\n", - "telephone : 01 34 31 31 30\n", - "courriel : pime@ville-louvres.fr\n", - "source : mediation-numerique\n", - "date_maj : 03/09/2023\n", - "\n", - "nom : mairie de louvres\n", - "commune : Louvres\n", - "adresse : None\n", - "location : [49.047348, 2.503928]\n", - "code_postal : 95380\n", - "code_insee : 95351\n", - "siren : 219503513\n", - "siret : 21950351300018\n", - "telephone : None\n", - "courriel : None\n", - "source : mes-aides\n", - "date_maj : 08/11/2024\n", - "\n", - "77/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : caritas rouffach\n", - "commune : Rouffach\n", - "adresse : 9 Place de la République\n", - "location : [47.956034, 7.300301]\n", - "code_postal : 68250\n", - "code_insee : 68287\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 07/03/2024\n", - "\n", - "nom : restos du coeur - annexe du centre d'activites de soultz\n", - "commune : Rouffach\n", - "adresse : 9 Place de la République\n", - "location : [47.956034, 7.300301]\n", - "code_postal : 68250\n", - "code_insee : 68287\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : ad68.soultz@restosducoeur.org\n", - "source : soliguide\n", - "date_maj : 06/18/2024\n", - "\n", - "78/10 positive, 121/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : france services la poste de puyloubier\n", - "commune : Puyloubier\n", - "adresse : 1 Square Jean Casanova\n", - "location : [43.525255, 5.672983]\n", - "code_postal : 13114\n", - "code_insee : 13079\n", - "siren : None\n", - "siret : None\n", - "telephone : 04 42 16 23 60\n", - "courriel : puyloubier@france-services.gouv.fr\n", - "source : mediation-numerique\n", - "date_maj : 09/01/2020\n", - "\n", - "nom : france services - puyloubier\n", - "commune : Puyloubier\n", - "adresse : 1 Square Jean Casanova\n", - "location : [43.525255, 5.672983]\n", - "code_postal : 13114\n", - "code_insee : 13079\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : puyloubier@france-services.gouv.fr\n", - "source : soliguide\n", - "date_maj : 07/11/2024\n", - "\n", - "78/10 positive, 122/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : france travail (ex-pole emploi) brunoy\n", - "commune : Brunoy\n", - "adresse : 13 Rue de Cerçay\n", - "location : [48.701802, 2.50686]\n", - "code_postal : 91800\n", - "code_insee : 91114\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 11/13/2024\n", - "\n", - "nom : pmi brunoy\n", - "commune : Brunoy\n", - "adresse : 130 Rue de Cerçay\n", - "location : [48.708204, 2.524756]\n", - "code_postal : 91800\n", - "code_insee : 91114\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : soliguide\n", - "date_maj : 11/13/2024\n", - "\n", - "79/10 positive, 122/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : ccas de morsang sur orge\n", - "commune : Morsang-sur-Orge\n", - "adresse : Square Alexandre Christophe\n", - "location : [48.661764, 2.346635]\n", - "code_postal : 91390\n", - "code_insee : 91434\n", - "siren : 269100830\n", - "siret : 26910083000018\n", - "telephone : None\n", - "courriel : None\n", - "source : mediation-numerique\n", - "date_maj : 01/01/1970\n", - "\n", - "nom : ccas morsang-sur-orge\n", - "commune : Morsang-sur-Orge\n", - "adresse : Square Alexandre Christophe\n", - "location : [48.661764, 2.346635]\n", - "code_postal : 91390\n", - "code_insee : 91434\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : ccas@ville-morsang.fr\n", - "source : soliguide\n", - "date_maj : 09/04/2024\n", - "\n", - "79/10 positive, 123/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", - "nom : france travail - point relais alfortville\n", - "commune : Maisons-Alfort\n", - "adresse : 259 Avenue du Général Leclerc\n", - "location : [48.804171, 2.444209]\n", - "code_postal : 94700\n", - "code_insee : 94046\n", - "siren : None\n", - "siret : None\n", - "telephone : None\n", - "courriel : None\n", - "source : emplois-de-linclusion\n", - "date_maj : 04/13/2022\n", - "\n", - "nom : agence france travail maisons-alfort\n", - "commune : Maisons-Alfort\n", - "adresse : 259 Avenue du Général Leclerc\n", - "location : [48.804171, 2.444209]\n", - "code_postal : 94700\n", - "code_insee : 94046\n", - "siren : 130005481\n", - "siret : 13000548126478\n", - "telephone : 3949\n", - "courriel : None\n", - "source : france-travail\n", - "date_maj : 11/19/2024\n", - "\n", - "80/10 positive, 123/10 negative\n", - "Do these records refer to the same thing?\n", - "(y)es / (n)o / (u)nsure / (f)inished / (p)revious\n", - "Finished labeling\n", - "/home/vmttn/src/github.com/betagouv/data-inclusion/deduplication/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n" - ] - } - ], + "outputs": [], "source": [ "label_deduper = dedupe.Dedupe(\n", " variable_definition=fields,\n", @@ -1111,7 +401,10 @@ " orient=\"records\",\n", " force_ascii=False,\n", " indent=4,\n", - ")\n" + ")\n", + "\n", + "# remove cell output that can contain private data\n", + "clear_output()" ] }, {