diff --git a/datawarehouse/.dockerignore b/datawarehouse/.dockerignore
index 7a4893146..4dee7d3a5 100644
--- a/datawarehouse/.dockerignore
+++ b/datawarehouse/.dockerignore
@@ -1,4 +1,15 @@
 *
 !/docker-entrypoint-initdb.d
-!/requirements
\ No newline at end of file
+!/processings
+
+# Python
+**/*.py[cod]
+**/__pycache__/
+**/.venv/
+**/build/
+**/dist/
+**/.tox/
+**/.pytest_cache/
+**/*.egg-info/
+**/db.sqlite3
diff --git a/datawarehouse/Dockerfile b/datawarehouse/Dockerfile
index 348d60818..2a53b5ad9 100644
--- a/datawarehouse/Dockerfile
+++ b/datawarehouse/Dockerfile
@@ -17,10 +17,13 @@ RUN apt-get update \
     && apt-get clean -y \
     && rm -rf /var/lib/apt/lists/*
 
+RUN python3.11 -m venv ${VIRTUAL_ENV}
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+
 COPY ./docker-entrypoint-initdb.d /docker-entrypoint-initdb.d
 
-RUN python3.11 -m venv ${VIRTUAL_ENV}
+COPY processings/requirements processings/requirements
+RUN pip install --no-cache-dir -r processings/requirements/requirements.txt
 
-COPY requirements requirements
-RUN pip install --no-cache-dir --upgrade pip setuptools wheel
-RUN pip install --no-cache-dir -r requirements/requirements.txt
+COPY processings processings
+RUN pip install --no-cache-dir -e processings
diff --git a/datawarehouse/processings/Makefile b/datawarehouse/processings/Makefile
new file mode 100644
index 000000000..e12d4bdc3
--- /dev/null
+++ b/datawarehouse/processings/Makefile
@@ -0,0 +1,18 @@
+PIP_COMPILE := pipx run uv pip compile pyproject.toml --quiet
+
+ifeq ($(filter upgrade,$(MAKECMDGOALS)),upgrade)
+PIP_COMPILE += --upgrade
+endif
+
+.PHONY: all dev base test uv upgrade
+
+all: base dev test
+
+base:
+	$(PIP_COMPILE) --output-file=requirements/requirements.txt
+
+dev:
+	$(PIP_COMPILE) --extra=dev --output-file=requirements/dev-requirements.txt
+
+test:
+	$(PIP_COMPILE) --extra=test --output-file=requirements/test-requirements.txt
diff --git a/datawarehouse/processings/pyproject.toml b/datawarehouse/processings/pyproject.toml
new file mode 100644
index 000000000..98c5ca38f
--- /dev/null
+++ b/datawarehouse/processings/pyproject.toml
@@ -0,0 +1,46 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools", "wheel"]
+
+[project]
+name = "data-inclusion-processings"
+version = "0.1.0"
+dependencies = [
+    "numpy~=2.0",
+    "pandas~=2.2",
+    "requests~=2.31",
+    "tenacity",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pre-commit",
+    "ruff",
+]
+test = [
+    "pytest",
+]
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.ruff.lint]
+# see prefixes in https://beta.ruff.rs/docs/rules/
+select = [
+    "F",  # pyflakes
+    "E",  # pycodestyle errors
+    "W",  # pycodestyle warnings
+    "I",  # isort
+    "UP",  # pyupgrade
+    "S",  # bandit
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["data_inclusion"]
+
+[tool.pytest.ini_options]
+testpaths = "tests"
+markers = '''
+    ban_api: mark test as requiring the base adresse nationale api
+'''
diff --git a/datawarehouse/processings/requirements/dev-requirements.txt b/datawarehouse/processings/requirements/dev-requirements.txt
new file mode 100644
index 000000000..45d049a96
--- /dev/null
+++ b/datawarehouse/processings/requirements/dev-requirements.txt
@@ -0,0 +1,48 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra=dev --output-file=requirements/dev-requirements.txt
+certifi==2024.8.30
+    # via requests
+cfgv==3.4.0
+    # via pre-commit
+charset-normalizer==3.3.2
+    # via requests
+distlib==0.3.8
+    # via virtualenv
+filelock==3.16.0
+    # via virtualenv
+identify==2.6.0
+    # via pre-commit
+idna==3.8
+    # via requests
+nodeenv==1.9.1
+    # via pre-commit
+numpy==2.1.1
+    # via
+    #   data-inclusion-processings (pyproject.toml)
+    #   pandas
+pandas==2.2.2
+    # via data-inclusion-processings (pyproject.toml)
+platformdirs==4.3.2
+    # via virtualenv
+pre-commit==3.8.0
+    # via data-inclusion-processings (pyproject.toml)
+python-dateutil==2.9.0.post0
+    # via pandas
+pytz==2024.2
+    # via pandas
+pyyaml==6.0.2
+    # via pre-commit
+requests==2.32.3
+    # via data-inclusion-processings (pyproject.toml)
+ruff==0.6.4
+    # via data-inclusion-processings (pyproject.toml)
+six==1.16.0
+    # via python-dateutil
+tenacity==9.0.0
+    # via data-inclusion-processings (pyproject.toml)
+tzdata==2024.1
+    # via pandas
+urllib3==2.2.2
+    # via requests
+virtualenv==20.26.4
+    # via pre-commit
diff --git a/datawarehouse/processings/requirements/requirements.txt b/datawarehouse/processings/requirements/requirements.txt
new file mode 100644
index 000000000..c4e17e6b9
--- /dev/null
+++ b/datawarehouse/processings/requirements/requirements.txt
@@ -0,0 +1,28 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --output-file=requirements/requirements.txt
+certifi==2024.8.30
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+idna==3.8
+    # via requests
+numpy==2.1.1
+    # via
+    #   data-inclusion-processings (pyproject.toml)
+    #   pandas
+pandas==2.2.2
+    # via data-inclusion-processings (pyproject.toml)
+python-dateutil==2.9.0.post0
+    # via pandas
+pytz==2024.2
+    # via pandas
+requests==2.32.3
+    # via data-inclusion-processings (pyproject.toml)
+six==1.16.0
+    # via python-dateutil
+tenacity==9.0.0
+    # via data-inclusion-processings (pyproject.toml)
+tzdata==2024.1
+    # via pandas
+urllib3==2.2.2
+    # via requests
diff --git a/datawarehouse/processings/requirements/test-requirements.txt b/datawarehouse/processings/requirements/test-requirements.txt
new file mode 100644
index 000000000..24517c072
--- /dev/null
+++ b/datawarehouse/processings/requirements/test-requirements.txt
@@ -0,0 +1,36 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra=test --output-file=requirements/test-requirements.txt
+certifi==2024.8.30
+    # via requests
+charset-normalizer==3.3.2
+    # via requests
+idna==3.8
+    # via requests
+iniconfig==2.0.0
+    # via pytest
+numpy==2.1.1
+    # via
+    #   data-inclusion-processings (pyproject.toml)
+    #   pandas
+packaging==24.1
+    # via pytest
+pandas==2.2.2
+    # via data-inclusion-processings (pyproject.toml)
+pluggy==1.5.0
+    # via pytest
+pytest==8.3.3
+    # via data-inclusion-processings (pyproject.toml)
+python-dateutil==2.9.0.post0
+    # via pandas
+pytz==2024.2
+    # via pandas
+requests==2.32.3
+    # via data-inclusion-processings (pyproject.toml)
+six==1.16.0
+    # via python-dateutil
+tenacity==9.0.0
+    # via data-inclusion-processings (pyproject.toml)
+tzdata==2024.1
+    # via pandas
+urllib3==2.2.2
+    # via requests
diff --git a/datawarehouse/processings/src/data_inclusion/processings/__init__.py b/datawarehouse/processings/src/data_inclusion/processings/__init__.py
new file mode 100644
index 000000000..92b402191
--- /dev/null
+++ b/datawarehouse/processings/src/data_inclusion/processings/__init__.py
@@ -0,0 +1,5 @@
+from data_inclusion.processings.geocode import geocode
+
+__all__ = [
+    "geocode",
+]
diff --git a/datawarehouse/processings/src/data_inclusion/processings/geocode.py b/datawarehouse/processings/src/data_inclusion/processings/geocode.py
new file mode 100644
index 000000000..4faa31e99
--- /dev/null
+++ b/datawarehouse/processings/src/data_inclusion/processings/geocode.py
@@ -0,0 +1,102 @@
+import csv
+import io
+import logging
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+def _geocode(df: pd.DataFrame) -> pd.DataFrame:
+    logger.info("Will send address batch, dimensions=%s", df.shape)
+    with io.BytesIO() as buf:
+        df.to_csv(buf, index=False, quoting=csv.QUOTE_ALL, sep="|")
+
+        try:
+            response = requests.post(
+                "https://api-adresse.data.gouv.fr/search/csv/",
+                files={"data": ("data.csv", buf.getvalue(), "text/csv")},
+                data={
+                    "columns": ["adresse", "code_postal", "commune"],
+                    # Post-filter on the INSEE code and not the zipcode.
+                    # Explanations from the BAN API creators:
+                    # The postcode is problematic for cities with multiple
+                    # zipcodes, if the supplied zipcode is wrong or the one in
+                    # the BAN is. The INSEE code is more stable, unique and
+                    # reliable.
+                    # Also this post-filter does not return "possible" results,
+                    # it blindly filters out.
+                    "postcode": "code_postal",
+                },
+                timeout=180,  # we upload 2MB of data, so we need a high timeout
+            )
+            response.raise_for_status()
+        except requests.RequestException as e:
+            logger.info("Error while fetching `%s`: %s", e.request.url, e)
+            return pd.DataFrame()
+
+    with io.StringIO() as f:
+        f.write(response.text)
+        f.seek(0)
+        results_df = pd.read_csv(
+            f,
+            encoding_errors="replace",
+            on_bad_lines="warn",
+            dtype=str,
+            sep="|",
+        )
+        results_df = results_df.replace({np.nan: None})
+
+    logger.info("Got result for address batch, dimensions=%s", results_df.shape)
+    return results_df
+
+
+@dataclass
+class GeocodeInput:
+    id: str
+    adresse: str
+    code_insee: str
+    code_postal: str
+    commune: str
+
+
+def geocode(
+    data: GeocodeInput | list[GeocodeInput],
+    batch_size: int = 20_000,
+) -> list[dict]:
+    # BAN api limits the batch geocoding to 50MB of data.
+    # In our tests, 10_000 rows is about 1MB; but we'll be conservative
+    # since we also want to avoid upload timeouts.
+
+    data = data if isinstance(data, list) else [data]
+    df = pd.DataFrame.from_records(data)
+
+    # drop rows that do not have at least two of commune, code_insee and
+    # code_postal, as the result can't make sense.
+    # Note that we keep the rows without an address, as they can be used to
+    # at least resolve the city.
+    df = df.dropna(subset=["code_postal", "code_insee", "commune"], thresh=2)
+
+    # Cleanup the values a bit to help the BAN's scoring. After experimentation,
+    # looking for "Ville-Nouvelle" returns worse results than "Ville Nouvelle",
+    # probably due to a tokenization in the BAN that favors spaces.
+    # In the same fashion, looking for "U.R.S.S." returns worse results than using
+    # "URSS" for "Avenue de l'U.R.S.S.".
+    # With the dots, it does not find the street at all ¯\_(ツ)_/¯
+    df = df.assign(
+        adresse=df.adresse.str.strip()
+        .str.replace("-", " ", regex=False)
+        .str.replace(".", "", regex=False),
+        commune=df.commune.str.strip(),
+    )
+
+    logger.info(f"Only {len(df)} rows can be geocoded.")
+
+    return (
+        df.groupby(
+            np.arange(len(df)) // batch_size,
+            group_keys=False,
+        )
+        .apply(_geocode)
+        .to_dict(orient="records")
+    )
diff --git a/datawarehouse/processings/tests/integration/__init__.py b/datawarehouse/processings/tests/integration/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/datawarehouse/processings/tests/integration/test_geocoding.py b/datawarehouse/processings/tests/integration/test_geocoding.py
new file mode 100644
index 000000000..da17ac140
--- /dev/null
+++ b/datawarehouse/processings/tests/integration/test_geocoding.py
@@ -0,0 +1,62 @@
+from unittest.mock import ANY
+
+import pytest
+
+from data_inclusion.processings import geocode
+
+pytestmark = pytest.mark.ban_api
+
+
+@pytest.mark.parametrize(
+    ("adresse", "expected"),
+    [
+        (
+            {
+                "id": "1",
+                "adresse": "17 rue Malus",
+                "code_postal": "59000",
+                "code_insee": "59350",
+                "commune": "Lille",
+            },
+            {
+                "id": "1",
+                "adresse": "17 rue Malus",
+                "code_postal": "59000",
+                "code_insee": "59350",
+                "commune": "Lille",
+                "latitude": "50.627078",
+                "longitude": "3.067372",
+                "result_label": "17 Rue Malus 59000 Lille",
+                "result_score": ANY,
+                "result_score_next": None,
+                "result_type": "housenumber",
+                "result_id": "59350_5835_00017",
+                "result_housenumber": "17",
+                "result_name": "17 Rue Malus",
+                "result_street": "Rue Malus",
+                "result_postcode": "59000",
+                "result_city": "Lille",
+                "result_context": "59, Nord, Hauts-de-France",
+                "result_citycode": "59350",
+                "result_oldcitycode": "59350",
+                "result_oldcity": "Lille",
+                "result_district": None,
+                "result_status": "ok",
+            },
+        ),
+        (
+            {
+                "id": "2",
+                "adresse": None,
+                "code_postal": None,
+                "code_insee": None,
+                "commune": None,
+            },
+            None,
+        ),
+    ],
+)
+def test_ban_geocode(adresse: dict, expected: dict):
+    result = geocode(data=adresse)
+
+    assert result == ([expected] if expected is not None else [])
diff --git a/datawarehouse/requirements/requirements.in b/datawarehouse/requirements/requirements.in
deleted file mode 100644
index 3d535537b..000000000
--- a/datawarehouse/requirements/requirements.in
+++ /dev/null
@@ -1,2 +0,0 @@
-requests~=2.31
-trafilatura~=1.6
\ No newline at end of file
diff --git a/datawarehouse/requirements/requirements.txt b/datawarehouse/requirements/requirements.txt
deleted file mode 100644
index f8a7bf25f..000000000
--- a/datawarehouse/requirements/requirements.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-#
-# This file is autogenerated by pip-compile with Python 3.10
-# by the following command:
-#
-#    pip-compile requirements/requirements.in
-#
-certifi==2023.7.22
-    # via
-    #   requests
-    #   trafilatura
-charset-normalizer==3.2.0
-    # via
-    #   htmldate
-    #   requests
-    #   trafilatura
-courlan==0.9.4
-    # via trafilatura
-dateparser==1.1.8
-    # via htmldate
-htmldate==1.5.1
-    # via trafilatura
-idna==3.4
-    # via requests
-justext==3.0.0
-    # via trafilatura
-langcodes==3.3.0
-    # via courlan
-lxml==4.9.3
-    # via
-    #   htmldate
-    #   justext
-    #   trafilatura
-python-dateutil==2.8.2
-    # via
-    #   dateparser
-    #   htmldate
-pytz==2023.3.post1
-    # via dateparser
-regex==2023.8.8
-    # via dateparser
-requests==2.31.0
-    # via -r requirements/requirements.in
-six==1.16.0
-    # via python-dateutil
-tld==0.13
-    # via courlan
-trafilatura==1.6.2
-    # via -r requirements/requirements.in
-tzlocal==5.0.1
-    # via dateparser
-urllib3==2.0.4
-    # via
-    #   courlan
-    #   htmldate
-    #   requests
-    #   trafilatura
diff --git a/pipeline/dbt/macros/create_udfs.sql b/pipeline/dbt/macros/create_udfs.sql
index 4e875d1ce..4e45efba0 100644
--- a/pipeline/dbt/macros/create_udfs.sql
+++ b/pipeline/dbt/macros/create_udfs.sql
@@ -9,6 +9,10 @@ Another way would be to use the `on-run-start` hook, but it does not play nicely
 
 {% set sql %}
 
+CREATE SCHEMA IF NOT EXISTS processings;
+
+{{ udf_geocode() }}
+
 {{ create_udf_soliguide__new_hours_to_osm_opening_hours() }}
 {{ create_udf__common_checks() }}
 {{ create_udf__service_checks() }}
diff --git a/pipeline/dbt/macros/udfs/udf__geocode.sql b/pipeline/dbt/macros/udfs/udf__geocode.sql
new file mode 100644
index 000000000..722ba9101
--- /dev/null
+++ b/pipeline/dbt/macros/udfs/udf__geocode.sql
@@ -0,0 +1,33 @@
+{% macro udf_geocode() %}
+
+DROP FUNCTION IF EXISTS processings.geocode;
+
+CREATE OR REPLACE FUNCTION processings.geocode(data JSONB)
+RETURNS
+    TABLE(
+        id TEXT,
+        result_score FLOAT,
+        result_label TEXT,
+        result_city TEXT,
+        result_type TEXT,
+        result_citycode TEXT,
+        result_postcode TEXT,
+        result_name TEXT,
+        longitude FLOAT,
+        latitude FLOAT
+    )
+AS $$
+
+import json
+
+from data_inclusion import processings
+
+return (
+    processings.geocode(data=json.loads(data))
+    if data is not None
+    else []
+)
+
+$$ LANGUAGE plpython3u;
+
+{% endmacro %}
diff --git a/pipeline/dbt/models/intermediate/_models.yml b/pipeline/dbt/models/intermediate/_models.yml
index 6675631fd..4e8b7d404 100644
--- a/pipeline/dbt/models/intermediate/_models.yml
+++ b/pipeline/dbt/models/intermediate/_models.yml
@@ -69,3 +69,105 @@ models:
         * geocoded addresses
         * email with pii flag
+
+  - name: int__geocodages
+    description: |
+      Geocoding results for all sources.
+
+      This model is incremental: it only geocodes new or changed addresses.
+      It stores raw geocoding results, without filtering.
+
+      Geocoding is done by calling the BAN api in PL/Python.
+    columns:
+      - name: geocoded_at
+        data_tests:
+          - not_null
+      - name: adresse_id
+        data_tests:
+          - unique
+          - not_null
+          - relationships:
+              to: ref('int__union_adresses')
+              field: _di_surrogate_id
+      - name: input_adresse
+      - name: input_code_insee
+      - name: input_code_postal
+      - name: input_commune
+      - name: commune
+        data_tests:
+          - dbt_utils.not_empty_string
+      - name: adresse
+        data_tests:
+          - dbt_utils.not_empty_string
+      - name: code_postal
+        data_tests:
+          - dbt_utils.not_empty_string
+      - name: code_insee
+        data_tests:
+          - dbt_utils.not_empty_string
+      - name: latitude
+      - name: longitude
+      - name: score
+        data_tests:
+          - not_null:
+              config:
+                severity: warn
+      - name: type
+        data_tests:
+          - not_null:
+              config:
+                severity: warn
+          - accepted_values:
+              values:
+                - housenumber
+                - street
+                - locality
+                - municipality
+
+unit_tests:
+  - name: test_geocodages_full_refresh_mode
+    model: int__geocodages
+    overrides:
+      macros:
+        is_incremental: false
+    given:
+      - input: ref('int__union_adresses')
+        rows:
+          - {_di_surrogate_id: foo, adresse: 17 rue Malus, commune: Lille, code_postal: 59000, code_insee: 59350}
+    expect:
+      rows:
+        - {adresse_id: foo}
+
+  - name: test_geocodages_incremental_mode
+    # - row `unchanged` was previously geocoded and has not changed. It must not be re-geocoded.
+    # - rows `changed-*` were previously geocoded, but their input has changed. They must be re-geocoded.
+    # - row `new` was not previously geocoded. It must be geocoded.
+    model: int__geocodages
+    description: |
+      Test that geocoding is incremental
+    overrides:
+      macros:
+        is_incremental: true
+    given:
+      - input: ref('int__union_adresses')
+        rows:
+          - {_di_surrogate_id: unchanged, adresse: 17 rue Malus, code_postal: 59000, code_insee: 59350, commune: Lille}
+          - {_di_surrogate_id: changed-adresse, adresse: changed, code_postal: 59000, code_insee: 59350, commune: Lille}
+          - {_di_surrogate_id: changed-code-postal, adresse: 17 rue Malus, code_postal: changed, code_insee: 59350, commune: Lille}
+          - {_di_surrogate_id: changed-code-insee, adresse: 17 rue Malus, code_postal: 59000, code_insee: changed, commune: Lille}
+          - {_di_surrogate_id: changed-commune, adresse: 17 rue Malus, code_postal: 59000, code_insee: 59350, commune: changed}
+          - {_di_surrogate_id: new, adresse: 17 rue Malus, code_postal: 59000, code_insee: 59350, commune: Lille}
+      - input: this
+        rows:
+          - {adresse_id: unchanged, input_adresse: 17 rue Malus, input_code_postal: 59000, input_code_insee: 59350, input_commune: Lille}
+          - {adresse_id: changed-adresse, input_adresse: 17 rue Malus, input_code_postal: 59000, input_code_insee: 59350, input_commune: Lille}
+          - {adresse_id: changed-code-postal, input_adresse: 17 rue Malus, input_code_postal: 59000, input_code_insee: 59350, input_commune: Lille}
+          - {adresse_id: changed-code-insee, input_adresse: 17 rue Malus, input_code_postal: 59000, input_code_insee: 59350, input_commune: Lille}
+          - {adresse_id: changed-commune, input_adresse: 17 rue Malus, input_code_postal: 59000, input_code_insee: 59350, input_commune: Lille}
+    expect:
+      rows:
+        - {adresse_id: changed-adresse}
+        - {adresse_id: changed-code-postal}
+        - {adresse_id: changed-code-insee}
+        - {adresse_id: changed-commune}
+        - {adresse_id: new}
diff --git a/pipeline/dbt/models/intermediate/int__geocodages.sql b/pipeline/dbt/models/intermediate/int__geocodages.sql
new file mode 100644
index 000000000..bee2ba357
--- /dev/null
+++ b/pipeline/dbt/models/intermediate/int__geocodages.sql
@@ -0,0 +1,59 @@
+{{
+    config(
+        materialized="incremental",
+        unique_key="adresse_id",
+    )
+}}
+
+WITH adresses AS (
+    SELECT * FROM {{ ref('int__union_adresses') }}
+),
+
+final AS (
+    SELECT
+        CAST('{{ run_started_at }}' AS TIMESTAMP) AS "geocoded_at",
+        adresses._di_surrogate_id AS "adresse_id",
+        adresses.adresse AS "input_adresse",
+        adresses.code_postal AS "input_code_postal",
+        adresses.code_insee AS "input_code_insee",
+        adresses.commune AS "input_commune",
+        geocodings.result_city AS "commune",
+        geocodings.result_name AS "adresse",
+        geocodings.result_postcode AS "code_postal",
+        geocodings.result_citycode AS "code_insee",
+        geocodings.result_score AS "score",
+        geocodings.result_type AS "type",
+        geocodings.longitude AS "longitude",
+        geocodings.latitude AS "latitude"
+    FROM
+        adresses
+    INNER JOIN processings.geocode(
+        (
+            SELECT
+                JSONB_AGG(
+                    JSONB_OBJECT(
+                        ARRAY[
+                            'id', adresses._di_surrogate_id,
+                            'adresse', adresses.adresse,
+                            'code_postal', adresses.code_postal,
+                            'code_insee', adresses.code_insee,
+                            'commune', adresses.commune
+                        ]
+                    )
+                )
+            FROM adresses
+            {% if is_incremental() %}
+            -- then only geocode new or changed rows
+            LEFT JOIN {{ this }} ON adresses._di_surrogate_id = {{ this }}.adresse_id
+            WHERE
+                {{ this }}.adresse_id IS NULL
+                OR {{ this }}.input_adresse != adresses.adresse
+                OR {{ this }}.input_code_postal != adresses.code_postal
+                OR {{ this }}.input_code_insee != adresses.code_insee
+                OR {{ this }}.input_commune != adresses.commune
+            {% endif %}
+        )
+    ) AS geocodings ON adresses._di_surrogate_id = geocodings.id
+)
+
+SELECT * FROM final
diff --git a/pipeline/dbt/models/intermediate/int__union_adresses__enhanced.sql b/pipeline/dbt/models/intermediate/int__union_adresses__enhanced.sql
index cb534f270..11a14d77f 100644
--- a/pipeline/dbt/models/intermediate/int__union_adresses__enhanced.sql
+++ b/pipeline/dbt/models/intermediate/int__union_adresses__enhanced.sql
@@ -2,51 +2,32 @@ WITH adresses AS (
     SELECT * FROM {{ ref('int__union_adresses') }}
 ),
 
-valid_adresses AS (
-    SELECT adresses.*
-    FROM adresses
-    LEFT JOIN
-        LATERAL
-        LIST_ADRESSE_ERRORS(
-            adresses.adresse,
-            adresses.code_insee,
-            adresses.code_postal,
-            adresses.commune,
-            adresses.complement_adresse,
-            adresses.id,
-            adresses.latitude,
-            adresses.longitude,
-            adresses.source
-        ) AS errors ON TRUE
-    WHERE errors.field IS NULL
-),
-
-geocoded_results AS (
-    SELECT * FROM {{ ref('int_extra__geocoded_results') }}
+geocodages AS (
+    SELECT * FROM {{ ref('int__geocodages') }}
 ),
 
 final AS (
     SELECT
-        valid_adresses._di_surrogate_id AS "_di_surrogate_id",
-        valid_adresses.id AS "id",
-        valid_adresses.source AS "source",
-        valid_adresses.complement_adresse AS "complement_adresse",
+        adresses._di_surrogate_id AS "_di_surrogate_id",
+        adresses.id AS "id",
+        adresses.source AS "source",
+        adresses.complement_adresse AS "complement_adresse",
         CASE
-            WHEN geocoded_results.result_type = 'municipality'
-                THEN valid_adresses.adresse
-            ELSE COALESCE(geocoded_results.result_name, valid_adresses.adresse)
-        END AS "adresse",
-        COALESCE(geocoded_results.longitude, valid_adresses.longitude) AS "longitude",
-        COALESCE(geocoded_results.latitude, valid_adresses.latitude) AS "latitude",
-        COALESCE(geocoded_results.result_city, valid_adresses.commune) AS "commune",
-        COALESCE(geocoded_results.result_postcode, valid_adresses.code_postal) AS "code_postal",
-        COALESCE(geocoded_results.result_citycode, valid_adresses.code_insee) AS "code_insee",
-        geocoded_results.result_score AS "result_score"
-    FROM valid_adresses
-    LEFT JOIN geocoded_results
+            WHEN geocodages.type = 'municipality'
+                THEN adresses.adresse
+            ELSE COALESCE(geocodages.adresse, adresses.adresse)
+        END AS "adresse",
+        COALESCE(geocodages.longitude, adresses.longitude) AS "longitude",
+        COALESCE(geocodages.latitude, adresses.latitude) AS "latitude",
+        COALESCE(geocodages.commune, adresses.commune) AS "commune",
+        COALESCE(geocodages.code_postal, adresses.code_postal) AS "code_postal",
+        COALESCE(geocodages.code_insee, adresses.code_insee) AS "code_insee",
+        geocodages.score AS "score_geocodage"
+    FROM adresses
+    LEFT JOIN geocodages
         ON
-            valid_adresses._di_surrogate_id = geocoded_results._di_surrogate_id
-            AND geocoded_results.result_score >= 0.8
+            adresses._di_surrogate_id = geocodages.adresse_id
+            AND geocodages.score >= 0.8
 )
 
 SELECT * FROM final
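
For reference, a minimal sketch of how the new PL/Python UDF can be exercised ad hoc once the `create_udfs` macro has run. The payload mirrors the JSONB_AGG construction in `int__geocodages`; the literal id and address values are illustrative only, borrowed from the unit test fixtures above.

-- Ad-hoc call to the UDF; assumes the `processings` schema and function
-- created by `create_udfs` / `udf_geocode()` exist in the target database.
SELECT *
FROM processings.geocode(
    '[{
        "id": "foo",
        "adresse": "17 rue Malus",
        "code_postal": "59000",
        "code_insee": "59350",
        "commune": "Lille"
    }]'::JSONB
) AS geocoding;
-- Returns one row per geocoded input, with the columns declared in
-- udf__geocode.sql (id, result_score, result_label, ..., longitude, latitude),
-- as resolved by the BAN batch CSV endpoint.

Because the whole batch travels as a single JSONB argument, the dbt model only has to aggregate the new or changed addresses with JSONB_AGG and join the results back on `id`, which is what the incremental strategy in `int__geocodages` relies on.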