Restructuring to isolate the generation of standardized paths (#74)
* start restructuring

* simplify building blocks

* test

* import in the module

* change in s3 as well

* black
linogaliana authored Sep 6, 2023
1 parent a5a0886 commit 80b8a5a
Showing 5 changed files with 159 additions and 115 deletions.
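
In substance, this commit moves create_path_bucket out of cartiflette/s3/s3.py into a dedicated module, cartiflette/utils/create_path_bucket.py, and changes its interface from keyword arguments to a single config dictionary whose missing keys fall back to defaults. A minimal sketch of the call-site change, using only values that appear as defaults in the diff below:

# Before this commit (old helper in cartiflette/s3/s3.py): one keyword argument per parameter
path = create_path_bucket(
    provider="IGN",
    source="EXPRESS-COG-TERRITOIRE",
    vectorfile_format="geojson",
    borders="COMMUNE",
    filter_by="region",
    year="2022",
    value="28",
    crs=2154,
)

# After this commit (new helper in cartiflette/utils/create_path_bucket.py): a single
# config dict; any key left out falls back to the same defaults
path = create_path_bucket(
    {
        "provider": "IGN",
        "source": "EXPRESS-COG-TERRITOIRE",
        "vectorfile_format": "geojson",
        "borders": "COMMUNE",
        "filter_by": "region",
        "year": "2022",
        "value": "28",
        "crs": 2154,
    }
)
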
182 changes: 68 additions & 114 deletions cartiflette/s3/s3.py
@@ -17,6 +17,7 @@
create_format_standardized,
create_format_driver,
download_pb,
create_path_bucket,
official_epsg_codes,
)

@@ -111,16 +112,18 @@ def create_url_s3(
"""

path_within = create_path_bucket(
bucket=bucket,
path_within_bucket=path_within_bucket,
provider=provider,
source=source,
vectorfile_format=vectorfile_format,
borders=borders,
filter_by=filter_by,
year=year,
crs=crs,
value=value,
{
"bucket": bucket,
"path_within_bucket": path_within_bucket,
"provider": provider,
"source": source,
"vectorfile_format": vectorfile_format,
"borders": borders,
"filter_by": filter_by,
"year": year,
"crs": crs,
"value": value,
}
)

url = f"{ENDPOINT_URL}/{path_within}"
@@ -130,57 +133,6 @@ def create_url_s3(
return url


def create_path_bucket(
bucket: str = BUCKET,
path_within_bucket: str = PATH_WITHIN_BUCKET,
provider: str = "IGN",
source: str = "EXPRESS-COG-TERRITOIRE",
vectorfile_format: str = "geojson",
borders: str = "COMMUNE",
filter_by: str = "region",
year: typing.Union[str, int, float] = "2022",
value: typing.Union[str, int, float] = "28",
crs: typing.Union[str, int, float] = 2154,
) -> str:
"""
This function creates a file path for a vector file within a specified bucket.
Parameters:
bucket (str): The name of the bucket where the file will be stored.
path_within_bucket (str): The path within the bucket where the file will be stored.
vectorfile_format (str): The format of the vector file,
can be "geojson", "topojson", "gpkg" or "shp". Default is "geojson".
borders (str): The administrative borders of the tiles within the vector file.
Can be any administrative
borders provided by IGN, e.g. "COMMUNE", "DEPARTEMENT" or "REGION". Default is "COMMUNE".
filter_by (str): The administrative borders (supra to 'borders') that will be
used to cut the vector file in pieces when writing to S3. For instance, if
borders is "DEPARTEMENT", filter_by can be "REGION" or "FRANCE_ENTIERE".
Default is "region".
year (str): The year of the vector file. Default is "2022".
value (str): The value of the vector file. Default is "28".
crs (int): The coordinate reference system of the vector file. Default is 2154.
Returns:
str: The complete file path for the vector file that will be used to read
or write when interacting with S3 storage
"""

write_path = f"{bucket}/{path_within_bucket}"
write_path = f"{write_path}/{year=}"
write_path = f"{write_path}/administrative_level={borders}"
write_path = f"{write_path}/{crs=}"
write_path = f"{write_path}/{filter_by}={value}/{vectorfile_format=}"
write_path = f"{write_path}/{provider=}/{source=}"
write_path = f"{write_path}/raw.{vectorfile_format}"
write_path = write_path.replace("'", "")

if vectorfile_format == "shp":
write_path = write_path.rsplit("/", maxsplit=1)[0] + "/"

return write_path


# DOWNLOAD FROM S3 --------------------------


@@ -299,16 +251,18 @@ def download_vectorfile_s3_single(
)

read_path = create_path_bucket(
bucket=bucket,
path_within_bucket=path_within_bucket,
vectorfile_format=format_read,
borders=borders,
filter_by=filter_by,
year=year,
value=value,
crs=crs,
provider=provider,
source=source,
{
"bucket": bucket,
"path_within_bucket": path_within_bucket,
"vectorfile_format": format_read,
"borders": borders,
"filter_by": filter_by,
"year": year,
"value": value,
"crs": crs,
"provider": provider,
"source": source,
}
)

try:
@@ -321,9 +275,7 @@
print("When using shp format, we first need to store a local version")
tdir = tempfile.TemporaryDirectory()
for remote_file in fs.ls(dir_s3):
fs.download(
remote_file,
f"{tdir.name}/{remote_file.replace(dir_s3, '')}")
fs.download(remote_file, f"{tdir.name}/{remote_file.replace(dir_s3, '')}")
object = gpd.read_file(f"{tdir.name}/raw.shp", driver=None)
elif format_read == "parquet":
with fs.open(read_path, "rb") as f:
@@ -408,16 +360,18 @@ def write_cog_s3(year: int = 2022, vectorfile_format="json"):

dict_path_data = {
create_path_bucket(
bucket=BUCKET,
path_within_bucket=PATH_WITHIN_BUCKET,
provider="INSEE",
source="COG",
vectorfile_format=vectorfile_format,
borders=level,
filter_by="france_entiere",
year=year,
value="raw",
crs=None,
{
"bucket": BUCKET,
"path_within_bucket": PATH_WITHIN_BUCKET,
"provider": "INSEE",
"source": "COG",
"vectorfile_format": vectorfile_format,
"borders": level,
"filter_by": "france_entiere",
"year": year,
"value": "raw",
"crs": None,
}
): value
for level, value in list_cog.items()
}
@@ -482,16 +436,18 @@ def write_vectorfile_subset(
)

write_path = create_path_bucket(
bucket=bucket,
path_within_bucket=path_within_bucket,
provider=provider,
source=source,
vectorfile_format=format_write,
borders=borders,
filter_by=filter_by,
year=year,
value=value,
crs=crs,
{
"bucket": bucket,
"path_within_bucket": path_within_bucket,
"provider": provider,
"source": source,
"vectorfile_format": format_write,
"borders": borders,
"filter_by": filter_by,
"year": year,
"value": value,
"crs": crs,
}
)

print(f"Writing file at {write_path} location")
@@ -565,12 +521,7 @@ def duplicate_vectorfile_ign(
- None: The function does not raise any exceptions explicitly.
"""

combinations = list(
itertools.product(
sources,
territories,
years,
providers))
combinations = list(itertools.product(sources, territories, years, providers))

paths = dict(ChainMap(*[structure_path_raw_ign(c) for c in combinations]))

Expand Down Expand Up @@ -751,11 +702,13 @@ def write_vectorfile_s3_all(

def open_vectorfile_from_s3(vectorfile_format, filter_by, year, value, crs):
read_path = create_path_bucket(
vectorfile_format=vectorfile_format,
filter_by=filter_by,
year=year,
value=value,
crs=crs,
{
"vectorfile_format": vectorfile_format,
"filter_by": filter_by,
"year": year,
"value": value,
"crs": crs,
}
)
return fs.open(read_path, mode="r")

@@ -781,13 +734,15 @@ def write_vectorfile_from_s3(
"""

read_path = create_path_bucket(
vectorfile_format=vectorfile_format,
filter_by=filter_by,
year=year,
value=value,
crs=crs,
provider=provider,
source=source,
{
"vectorfile_format": vectorfile_format,
"filter_by": filter_by,
"year": year,
"value": value,
"crs": crs,
"provider": provider,
"source": source,
}
)

fs.download(read_path, filename)
@@ -939,8 +894,7 @@ def create_nested_topojson(path):
for couple in croisement_filter_by_borders_flat:
borders = couple[0]
filter_by = couple[1]
list_output[borders] = create_territories(
borders=borders, filter_by=filter_by)
list_output[borders] = create_territories(borders=borders, filter_by=filter_by)

topo = Topology(
data=[
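
One effect visible in the call sites above (for example open_vectorfile_from_s3 and write_vectorfile_from_s3): callers now pass only the keys they care about, and the new create_path_bucket fills in the rest through dict.get defaults. A small sketch of that behaviour, assuming the defaults defined in cartiflette/utils/create_path_bucket.py shown below; the partial dict mirrors the open_vectorfile_from_s3 call above:

from cartiflette.utils import create_path_bucket

# Partial config: only the keys that open_vectorfile_from_s3 actually supplies
config = {
    "vectorfile_format": "geojson",
    "filter_by": "region",
    "year": 2022,
    "value": "28",
    "crs": 2154,
}

# bucket, path_within_bucket, provider, source and borders are not supplied, so
# create_path_bucket falls back to BUCKET, PATH_WITHIN_BUCKET, "IGN",
# "EXPRESS-COG-TERRITOIRE" and "COMMUNE" respectively.
read_path = create_path_bucket(config)
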
2 changes: 2 additions & 0 deletions cartiflette/utils/__init__.py
@@ -9,6 +9,7 @@
official_epsg_codes,
)
from .keep_subset_geopandas import keep_subset_geopandas
from .create_path_bucket import create_path_bucket
from .hash import hash_file
from .dict_update import deep_dict_update

@@ -22,4 +23,5 @@
"keep_subset_geopandas",
"hash_file",
"deep_dict_update",
"create_path_bucket"
]
48 changes: 48 additions & 0 deletions cartiflette/utils/create_path_bucket.py
@@ -0,0 +1,48 @@
"""Module for communication with Minio S3 Storage
"""

from typing import Dict, Union

BUCKET = "projet-cartiflette"
PATH_WITHIN_BUCKET = "diffusion/shapefiles-test1"
ENDPOINT_URL = "https://minio.lab.sspcloud.fr"

# CREATE STANDARDIZED PATHS ------------------------


def create_path_bucket(config: Dict[str, Union[str, int, float]]) -> str:
"""
This function creates a file path for a vector file within a specified bucket.
Parameters:
config (Dict[str, Union[str, int, float]]): A dictionary containing vector file parameters.
Returns:
str: The complete file path for the vector file that will be used to read
or write when interacting with S3 storage.
"""

bucket = config.get("bucket", BUCKET)
path_within_bucket = config.get("path_within_bucket", PATH_WITHIN_BUCKET)
provider = config.get("provider", "IGN")
source = config.get("source", "EXPRESS-COG-TERRITOIRE")
vectorfile_format = config.get("vectorfile_format", "geojson")
borders = config.get("borders", "COMMUNE")
filter_by = config.get("filter_by", "region")
year = config.get("year", "2022")
value = config.get("value", "28")
crs = config.get("crs", 2154)

write_path = f"{bucket}/{path_within_bucket}"
write_path = f"{write_path}/{year=}"
write_path = f"{write_path}/administrative_level={borders}"
write_path = f"{write_path}/{crs=}"
write_path = f"{write_path}/{filter_by}={value}/{vectorfile_format=}"
write_path = f"{write_path}/{provider=}/{source=}"
write_path = f"{write_path}/raw.{vectorfile_format}"
write_path = write_path.replace("'", "")

if vectorfile_format == "shp":
write_path = write_path.rsplit("/", maxsplit=1)[0] + "/"

return write_path
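
For a quick sanity check of the new helper, the calls below reproduce two of the expected values from tests/test_create_path_bucket.py further down, so the printed outputs come from the diff itself rather than being invented:

from cartiflette.utils import create_path_bucket

# Override only the bucket; every other key keeps its default
print(create_path_bucket({"bucket": "my_bucket"}))
# my_bucket/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson

# Shapefile output is a directory path (trailing slash), since a .shp comes with sidecar files
print(create_path_bucket({"vectorfile_format": "shp"}))
# projet-cartiflette/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=shp/provider=IGN/source=EXPRESS-COG-TERRITOIRE/
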
19 changes: 18 additions & 1 deletion tests/mockups.py
@@ -13,8 +13,11 @@
FILESIZE_DUMMY,
CONTENT_DUMMY,
)
from cartiflette.utils import (
create_path_bucket
)
from cartiflette.download import (
Dataset,
Dataset
)

logging.basicConfig(level=logging.INFO)
@@ -116,3 +119,17 @@ def mock_get(self, url, *args, **kwargs):

monkeypatch.setattr(requests.Session, "head", mock_head)
monkeypatch.setattr(requests.Session, "get", mock_get)


@pytest.mark.parametrize(
"config, expected_path",
[
({"bucket": "my_bucket"}, "my_bucket/PATH_WITHIN_BUCKET/2022/administrative_level=COMMUNE/2154/region=28/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson"),
({"vectorfile_format": "shp"}, "BUCKET/PATH_WITHIN_BUCKET/2022/administrative_level=COMMUNE/2154/region=28/vectorfile_format=shp/provider=IGN/source=EXPRESS-COG-TERRITOIRE/"),
({"borders": "DEPARTEMENT", "filter_by": "REGION", "year": "2023", "value": "42", "crs": 4326}, "BUCKET/PATH_WITHIN_BUCKET/2023/administrative_level=DEPARTEMENT/4326/REGION=42/geojson/IGN/EXPRESS-COG-TERRITOIRE/raw.geojson"),
({"path_within_bucket": "data", "vectorfile_format": "gpkg"}, "BUCKET/data/2022/administrative_level=COMMUNE/2154/region=28/gpkg/IGN/EXPRESS-COG-TERRITOIRE/raw.gpkg"),
],
)
def test_create_path_bucket(config, expected_path):
result = create_path_bucket(config)
assert result == expected_path
23 changes: 23 additions & 0 deletions tests/test_create_path_bucket.py
@@ -0,0 +1,23 @@
import pytest
from cartiflette.utils import (
create_path_bucket
)
# Import create_path_bucket function and VectorFileConfig here (if not already imported)

# Define some test cases with different configurations
@pytest.mark.parametrize(
"config, expected_path",
[
({"bucket": "my_bucket"}, 'my_bucket/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson'),
({"vectorfile_format": "shp"}, 'projet-cartiflette/diffusion/shapefiles-test1/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=shp/provider=IGN/source=EXPRESS-COG-TERRITOIRE/'),
({"borders": "DEPARTEMENT", "filter_by": "REGION", "year": "2023", "value": "42", "crs": 4326}, 'projet-cartiflette/diffusion/shapefiles-test1/year=2023/administrative_level=DEPARTEMENT/crs=4326/REGION=42/vectorfile_format=geojson/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.geojson'),
({"path_within_bucket": "data", "vectorfile_format": "gpkg"}, 'projet-cartiflette/data/year=2022/administrative_level=COMMUNE/crs=2154/region=28/vectorfile_format=gpkg/provider=IGN/source=EXPRESS-COG-TERRITOIRE/raw.gpkg'),
],
)
def test_create_path_bucket(config, expected_path):
result = create_path_bucket(config)
assert result == expected_path

# Run the tests
if __name__ == "__main__":
pytest.main()
