From 80b8a5a28371feb6df31d55bcc2617948a5f9b1a Mon Sep 17 00:00:00 2001 From: Lino Galiana Date: Wed, 6 Sep 2023 10:32:20 +0200 Subject: [PATCH] =?UTF-8?q?Restructuration=20pour=20isoler=20la=20producti?= =?UTF-8?q?on=20des=20chemins=20standardis=C3=A9s=20(#74)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * commence restructuration * simplifie briques * test * import dans le module * modification dans s3 aussi * black --- cartiflette/s3/s3.py | 182 +++++++++--------------- cartiflette/utils/__init__.py | 2 + cartiflette/utils/create_path_bucket.py | 48 +++++++ tests/mockups.py | 19 ++- tests/test_create_path_bucket.py | 23 +++ 5 files changed, 159 insertions(+), 115 deletions(-) create mode 100644 cartiflette/utils/create_path_bucket.py create mode 100644 tests/test_create_path_bucket.py diff --git a/cartiflette/s3/s3.py b/cartiflette/s3/s3.py index e83ddb04..7be531d5 100644 --- a/cartiflette/s3/s3.py +++ b/cartiflette/s3/s3.py @@ -17,6 +17,7 @@ create_format_standardized, create_format_driver, download_pb, + create_path_bucket, official_epsg_codes, ) @@ -111,16 +112,18 @@ def create_url_s3( """ path_within = create_path_bucket( - bucket=bucket, - path_within_bucket=path_within_bucket, - provider=provider, - source=source, - vectorfile_format=vectorfile_format, - borders=borders, - filter_by=filter_by, - year=year, - crs=crs, - value=value, + { + "bucket": bucket, + "path_within_bucket": path_within_bucket, + "provider": provider, + "source": source, + "vectorfile_format": vectorfile_format, + "borders": borders, + "filter_by": filter_by, + "year": year, + "crs": crs, + "value": value, + } ) url = f"{ENDPOINT_URL}/{path_within}" @@ -130,57 +133,6 @@ def create_url_s3( return url -def create_path_bucket( - bucket: str = BUCKET, - path_within_bucket: str = PATH_WITHIN_BUCKET, - provider: str = "IGN", - source: str = "EXPRESS-COG-TERRITOIRE", - vectorfile_format: str = "geojson", - borders: str = "COMMUNE", - filter_by: 
str = "region", - year: typing.Union[str, int, float] = "2022", - value: typing.Union[str, int, float] = "28", - crs: typing.Union[str, int, float] = 2154, -) -> str: - """ - This function creates a file path for a vector file within a specified bucket. - - Parameters: - bucket (str): The name of the bucket where the file will be stored. - path_within_bucket (str): The path within the bucket where the file will be stored. - vectorfile_format (str): The format of the vector file, - can be "geojson", "topojson", "gpkg" or "shp". Default is "geojson". - borders (str): The administrative borders of the tiles within the vector file. - Can be any administrative - borders provided by IGN, e.g. "COMMUNE", "DEPARTEMENT" or "REGION". Default is "COMMUNE". - filter_by (str): The administrative borders (supra to 'borders') that will be - used to cut the vector file in pieces when writing to S3. For instance, if - borders is "DEPARTEMENT", filter_by can be "REGION" or "FRANCE_ENTIERE". - Default is "region". - year (str): The year of the vector file. Default is "2022". - value (str): The value of the vector file. Default is "28". - crs (int): The coordinate reference system of the vector file. Default is 2154. 
- - Returns: - str: The complete file path for the vector file that will be used to read - or write when interacting with S3 storage - """ - - write_path = f"{bucket}/{path_within_bucket}" - write_path = f"{write_path}/{year=}" - write_path = f"{write_path}/administrative_level={borders}" - write_path = f"{write_path}/{crs=}" - write_path = f"{write_path}/{filter_by}={value}/{vectorfile_format=}" - write_path = f"{write_path}/{provider=}/{source=}" - write_path = f"{write_path}/raw.{vectorfile_format}" - write_path = write_path.replace("'", "") - - if vectorfile_format == "shp": - write_path = write_path.rsplit("/", maxsplit=1)[0] + "/" - - return write_path - - # DOWNLOAD FROM S3 -------------------------- @@ -299,16 +251,18 @@ def download_vectorfile_s3_single( ) read_path = create_path_bucket( - bucket=bucket, - path_within_bucket=path_within_bucket, - vectorfile_format=format_read, - borders=borders, - filter_by=filter_by, - year=year, - value=value, - crs=crs, - provider=provider, - source=source, + { + "bucket": bucket, + "path_within_bucket": path_within_bucket, + "vectorfile_format": format_read, + "borders": borders, + "filter_by": filter_by, + "year": year, + "value": value, + "crs": crs, + "provider": provider, + "source": source, + } ) try: @@ -321,9 +275,7 @@ def download_vectorfile_s3_single( print("When using shp format, we first need to store a local version") tdir = tempfile.TemporaryDirectory() for remote_file in fs.ls(dir_s3): - fs.download( - remote_file, - f"{tdir.name}/{remote_file.replace(dir_s3, '')}") + fs.download(remote_file, f"{tdir.name}/{remote_file.replace(dir_s3, '')}") object = gpd.read_file(f"{tdir.name}/raw.shp", driver=None) elif format_read == "parquet": with fs.open(read_path, "rb") as f: @@ -408,16 +360,18 @@ def write_cog_s3(year: int = 2022, vectorfile_format="json"): dict_path_data = { create_path_bucket( - bucket=BUCKET, - path_within_bucket=PATH_WITHIN_BUCKET, - provider="INSEE", - source="COG", - 
vectorfile_format=vectorfile_format, - borders=level, - filter_by="france_entiere", - year=year, - value="raw", - crs=None, + { + "bucket": BUCKET, + "path_within_bucket": PATH_WITHIN_BUCKET, + "provider": "INSEE", + "source": "COG", + "vectorfile_format": vectorfile_format, + "borders": level, + "filter_by": "france_entiere", + "year": year, + "value": "raw", + "crs": None, + } ): value for level, value in list_cog.items() } @@ -482,16 +436,18 @@ def write_vectorfile_subset( ) write_path = create_path_bucket( - bucket=bucket, - path_within_bucket=path_within_bucket, - provider=provider, - source=source, - vectorfile_format=format_write, - borders=borders, - filter_by=filter_by, - year=year, - value=value, - crs=crs, + { + "bucket": bucket, + "path_within_bucket": path_within_bucket, + "provider": provider, + "source": source, + "vectorfile_format": format_write, + "borders": borders, + "filter_by": filter_by, + "year": year, + "value": value, + "crs": crs, + } ) print(f"Writing file at {write_path} location") @@ -565,12 +521,7 @@ def duplicate_vectorfile_ign( - None: The function does not raise any exceptions explicitly. 
""" - combinations = list( - itertools.product( - sources, - territories, - years, - providers)) + combinations = list(itertools.product(sources, territories, years, providers)) paths = dict(ChainMap(*[structure_path_raw_ign(c) for c in combinations])) @@ -751,11 +702,13 @@ def write_vectorfile_s3_all( def open_vectorfile_from_s3(vectorfile_format, filter_by, year, value, crs): read_path = create_path_bucket( - vectorfile_format=vectorfile_format, - filter_by=filter_by, - year=year, - value=value, - crs=crs, + { + "vectorfile_format": vectorfile_format, + "filter_by": filter_by, + "year": year, + "value": value, + "crs": crs, + } ) return fs.open(read_path, mode="r") @@ -781,13 +734,15 @@ def write_vectorfile_from_s3( """ read_path = create_path_bucket( - vectorfile_format=vectorfile_format, - filter_by=filter_by, - year=year, - value=value, - crs=crs, - provider=provider, - source=source, + { + "vectorfile_format": vectorfile_format, + "filter_by": filter_by, + "year": year, + "value": value, + "crs": crs, + "provider": provider, + "source": source, + } ) fs.download(read_path, filename) @@ -939,8 +894,7 @@ def create_nested_topojson(path): for couple in croisement_filter_by_borders_flat: borders = couple[0] filter_by = couple[1] - list_output[borders] = create_territories( - borders=borders, filter_by=filter_by) + list_output[borders] = create_territories(borders=borders, filter_by=filter_by) topo = Topology( data=[ diff --git a/cartiflette/utils/__init__.py b/cartiflette/utils/__init__.py index 9dc0adc2..b03a30e4 100644 --- a/cartiflette/utils/__init__.py +++ b/cartiflette/utils/__init__.py @@ -9,6 +9,7 @@ official_epsg_codes, ) from .keep_subset_geopandas import keep_subset_geopandas +from .create_path_bucket import create_path_bucket from .hash import hash_file from .dict_update import deep_dict_update @@ -22,4 +23,5 @@ "keep_subset_geopandas", "hash_file", "deep_dict_update", + "create_path_bucket" ] diff --git a/cartiflette/utils/create_path_bucket.py 
b/cartiflette/utils/create_path_bucket.py new file mode 100644 index 00000000..72eb7790 --- /dev/null +++ b/cartiflette/utils/create_path_bucket.py @@ -0,0 +1,48 @@ +"""Module for communication with Minio S3 Storage +""" + +from typing import Dict, Union + +BUCKET = "projet-cartiflette" +PATH_WITHIN_BUCKET = "diffusion/shapefiles-test1" +ENDPOINT_URL = "https://minio.lab.sspcloud.fr" + +# CREATE STANDARDIZED PATHS ------------------------ + + +def create_path_bucket(config: Dict[str, Union[str, int, float]]) -> str: + """ + This function creates a file path for a vector file within a specified bucket. + + Parameters: + config (Dict[str, Union[str, int, float]]): A dictionary containing vector file parameters. + + Returns: + str: The complete file path for the vector file that will be used to read + or write when interacting with S3 storage. + """ + + bucket = config.get("bucket", BUCKET) + path_within_bucket = config.get("path_within_bucket", PATH_WITHIN_BUCKET) + provider = config.get("provider", "IGN") + source = config.get("source", "EXPRESS-COG-TERRITOIRE") + vectorfile_format = config.get("vectorfile_format", "geojson") + borders = config.get("borders", "COMMUNE") + filter_by = config.get("filter_by", "region") + year = config.get("year", "2022") + value = config.get("value", "28") + crs = config.get("crs", 2154) + + write_path = f"{bucket}/{path_within_bucket}" + write_path = f"{write_path}/{year=}" + write_path = f"{write_path}/administrative_level={borders}" + write_path = f"{write_path}/{crs=}" + write_path = f"{write_path}/{filter_by}={value}/{vectorfile_format=}" + write_path = f"{write_path}/{provider=}/{source=}" + write_path = f"{write_path}/raw.{vectorfile_format}" + write_path = write_path.replace("'", "") + + if vectorfile_format == "shp": + write_path = write_path.rsplit("/", maxsplit=1)[0] + "/" + + return write_path diff --git a/tests/mockups.py b/tests/mockups.py index 34f3b3c8..bd60f92d 100644 --- a/tests/mockups.py +++ b/tests/mockups.py @@ 
-13,8 +13,11 @@ FILESIZE_DUMMY, CONTENT_DUMMY, ) +from cartiflette.utils import ( + create_path_bucket +) from cartiflette.download import ( - Dataset, + Dataset ) logging.basicConfig(level=logging.INFO) @@ -116,3 +119,17 @@ def mock_get(self, url, *args, **kwargs): monkeypatch.setattr(requests.Session, "head", mock_head) monkeypatch.setattr(requests.Session, "get", mock_get) + + +@pytest.mark.parametrize( + "config, expected_path", + [ + ({"bucket": "my_bucket"}, "my_bucket/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson"), + ({"vectorfile_format": "shp"}, "projet-cartiflette/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=shp/provider=IGN/source=EXPRESS-COG-TERRITOIRE/"), + ({"borders": "DEPARTEMENT", "filter_by": "REGION", "year": "2023", "value": "42", "crs": 4326}, "projet-cartiflette/diffusion/shapefiles-test1/year=2023/administrative_level=DEPARTEMENT/crs=4326/REGION=42/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson"), + ({"path_within_bucket": "data", "vectorfile_format": "gpkg"}, "projet-cartiflette/data/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=gpkg/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.gpkg"), + ], +) +def test_create_path_bucket(config, expected_path): + result = create_path_bucket(config) + assert result == expected_path \ No newline at end of file diff --git a/tests/test_create_path_bucket.py b/tests/test_create_path_bucket.py new file mode 100644 index 00000000..817e9487 --- /dev/null +++ b/tests/test_create_path_bucket.py @@ -0,0 +1,23 @@ +import pytest +from cartiflette.utils import ( + create_path_bucket +) +# Import create_path_bucket function and VectorFileConfig here (if not already imported) + +# Define some test cases with different configurations +@pytest.mark.parametrize( + "config, expected_path", + [ + ({"bucket": "my_bucket"}, 
'my_bucket/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson'), + ({"vectorfile_format": "shp"}, 'projet-cartiflette/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=shp/provider=IGN/source=EXPRESS-COG-TERRITOIRE/'), + ({"borders": "DEPARTEMENT", "filter_by": "REGION", "year": "2023", "value": "42", "crs": 4326}, 'projet-cartiflette/diffusion/shapefiles-test1/year=2023/administrative_level=DEPARTEMENT/crs=4326/REGION=42/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson'), + ({"path_within_bucket": "data", "vectorfile_format": "gpkg"}, 'projet-cartiflette/data/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=gpkg/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.gpkg'), + ], +) +def test_create_path_bucket(config, expected_path): + result = create_path_bucket(config) + assert result == expected_path + +# Run the tests +if __name__ == "__main__": + pytest.main()