Commit

wip: Make the geocoding much more accurate
Before this pass, we had very few reliable high scores (above 0.8),
and even then, many of those were questionable. For instance:

- we could get a 0.97 score for an address without any city info
- we could get a 0.95 score for just a city
- only 411 (bad) results were found when the city or zipcode was missing
- 90741 out of 136246 scores (66%) were above 0.8, but they were mostly
  unreliable

Now, we have:
- 38637 rows found with a score > 0.8 when at least city/zipcode/insee is
  present, instead of 411
- not a single row resolving to a municipality
- the accuracy of the scores above 0.8 has been manually checked
- still 66% of the results above the 0.8 score, although we "lost"
  20_000 records that could be considered garbage

For instance, some addresses containing a number now resolve to a street.
After checking, it appears those are addresses (mainly DOM/TOM) whose
street numbers are unknown to the BAN, and usually to Google Maps as well.
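
As a rough illustration of the acceptance rules described above (this helper is
not part of the commit, its name is made up, and result_score/result_type are
the columns the BAN batch CSV geocoding adds to the echoed input), a downstream
consumer could filter the results along these lines:

import pandas as pd


def keep_reliable_results(results_df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical post-filter mirroring the rules above: require at least one
    # locality hint, drop municipality-level matches, keep confident scores only.
    has_locality = (
        results_df[["commune", "code_postal", "code_insee"]].notna().any(axis=1)
    )
    street_level = results_df["result_type"] != "municipality"
    confident = results_df["result_score"] > 0.8
    return results_df[has_locality & street_level & confident]
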
vperron committed Jul 16, 2024
1 parent 3def016 commit 68c08be
Showing 2 changed files with 23 additions and 5 deletions.
pipeline/dags/dag_utils/geocoding.py (22 additions, 5 deletions)
@@ -31,7 +31,7 @@ def _geocode(self, df: pd.DataFrame) -> pd.DataFrame:
             files={"data": ("data.csv", buf.getvalue(), "text/csv")},
             data={
                 "columns": ["adresse", "code_postal", "commune"],
-                "postcode": "code_postal",
+                "citycode": "code_insee",
             },
             timeout=180,  # we upload 2MB of data, so we need a high timeout
         )
@@ -51,6 +51,10 @@ def _geocode(self, df: pd.DataFrame) -> pd.DataFrame:
             sep="|",
         )
         results_df = results_df.replace({np.nan: None})
+        # In some cases (e.g. address='20' and city='Paris'), the BAN API will
+        # return a municipality as a result with a very high score. This is
+        # discarded, as it is not valuable information for locating a structure.
+        results_df = results_df[results_df.result_type != "municipality"]

         logger.info("Got result for address batch, dimensions=%s", results_df.shape)
         return results_df
@@ -61,11 +65,24 @@ def geocode(self, df: pd.DataFrame) -> pd.DataFrame:
         # since we also want to avoid upload timeouts.
         BATCH_SIZE = 20_000

-        # drop rows with missing input values
-        # if not done, the BAN api will fail the entire batch
-        df = df.dropna(subset=["adresse", "code_postal", "commune"], how="all")
+        # drop rows with missing addresses: no need to even try.
+        df = df.dropna(subset=["adresse"])
+        # also drop rows that do not have at least one of commune, code_insee
+        # or code_postal, as the result would not make sense.
+        df = df.dropna(subset=["code_postal", "code_insee", "commune"], thresh=2)
         df = df.sort_values(by="_di_surrogate_id")
-        df = df.assign(adresse=df.adresse.str.replace("-", " "))
+        # Clean up the values a bit to help the BAN's scoring: looking for
+        # "Ville-Nouvelle" returns worse results than "Ville Nouvelle", and
+        # "U.R.S.S." returns worse results than "URSS".
+        df = df.assign(
+            adresse=(
+                df.adresse.str.strip()
+                .str.replace("\r", " ", regex=False)
+                .str.replace("-", " ", regex=False)
+                .str.replace(".", "", regex=False)
+            ),
+            commune=df.commune.str.strip(),
+        )

         logger.info(f"Only {len(df)} rows can be geocoded.")

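For reference, a minimal standalone sketch of the cleanup introduced above; the
sample values are invented, only "Ville-Nouvelle" and "U.R.S.S." come from the
comment. The substring replacements go through Series.str.replace, since a plain
Series.replace with a string only matches whole values:

import pandas as pd

adresses = pd.Series(["  12 rue de l'U.R.S.S. ", "3 av. de Ville-Nouvelle\r"])
cleaned = (
    adresses.str.strip()
    .str.replace("\r", " ", regex=False)
    .str.replace("-", " ", regex=False)
    .str.replace(".", "", regex=False)
)
print(cleaned.tolist())  # ["12 rue de l'URSS", '3 av de Ville Nouvelle']
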
pipeline/dags/main.py (1 addition, 0 deletions)
@@ -42,6 +42,7 @@ def _geocode():
             _di_surrogate_id,
             adresse,
             code_postal,
+            code_insee,
             commune
         FROM public_intermediate.int__union_adresses;
         """