Skip to content

Commit

Permalink
Refacto s3 (#75)
Browse files Browse the repository at this point in the history
* Add path_within_bucket

* Utilise MasterScrapper pour duplicate_vectorfile_ign

* Ajoute une fonction pour écrire avec S3 le md5

* Add support for s3fs access

* possibility to load a .env file with python-dotenv (keys = token, key, secret)
* black formatting

* fix logger call

* fix undefined name 'logger'
* add black formatting to utils.dict_update.py

* Small refacto Dataset

* reset update_json_md5 as a Dataset method;
* add fs argument for instantiation of Dataset;
* fixed Dataset docstring;
* fix bug on Dataset if downloaded prevented because of md5 match
* temporary fix in s3/s3.py of multiple s3fs creation;
* fix duplicate_vectorfile_ign when file already uptodate on s3

* Move constants creation to package init

* Update download.py

* Update s3.py

* Update __init__.py

* Update misc.write_s3

* Update docstrings + notes

* Notes/TODO sur s3

* Fix exception on missing file in json

* Update write_s3.py

* add logging configuration

* Update write_s3.py

* reset os.chdir('cartiflette') just in case

* Move utils from s3

refactorize functions to get path (both from web access or from inside s3)

* Move public functions into ad hoc subpackage

* Fix typo

* Start refacto of s3

* Update dev.py

Black formatting

* Fix typo

* Default year in download.dev

* Update download.py

Fix default year in download.download.py

* Set current year as default everywhere

* Cleanup corrupt files after download

* Unfinished refactorization

* Remove geometry sanitations

* Remove unnecessary functions in s3

* Fix mockups _get_last_md5

* Add magic file detection and CachedSession

* Update .gitignore

* Add CSV support (COG Insee)

* Update sources.yaml

* Update download.py

* Spec custom filetype for output

* Create csv_magic.py

utility for unknown csv reading

* RecRefacto download

Use requests-cache
Refacto yaml
Rename "field" argument in yaml to "territory"
Handle zip
Handle nested zip/7zip
Handle CSV/DBF pattern (not only shapefiles)
Refacto tests with CachedSession patching
Split download on multiple files (download, scraper, dataset)

* Add poetry and pytest to CI

* Set os-specific dependency

* Fix check test

* Add incomplete s3 refacto for building purpose

* Add feedback to test

* Fix proxy error on github tests

* Jobs' names differentiation

* Cleanup unused files since poetry's usage

* Fix copy/paste duplicates

* Merge / upgrade standard patches on bucket

Set a config file which centralizes all constants related to s3fs

* Move create_path_bucket test to separate test

* Full download pipeline

* Fix bug on pipeline with year as int

* add configuration option for tqdm

* Update config.py

* Recreate base geodataframes directly from s3

* Remove dev

* Refactorize s3 (for a start...)

* Fix _download_sources import in tests

---------

Co-authored-by: linogaliana <[email protected]>
Co-authored-by: thomas.grandjean <[email protected]>
  • Loading branch information
3 people authored Sep 29, 2023
1 parent 80b8a5a commit 1c90dd3
Show file tree
Hide file tree
Showing 34 changed files with 3,614 additions and 2,468 deletions.
20 changes: 12 additions & 8 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Test Python package
on: [push]

jobs:
build-linux:
testing:
runs-on: ubuntu-latest
strategy:
max-parallel: 5
Expand All @@ -14,16 +14,20 @@ jobs:
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Add libmagic for python-magic on linux
run: sudo apt-get install libmagic1
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: |
pip install -r requirements.txt
pip install .
poetry install --without dev
poetry add pytest
- name: Test import
run: |
export AWS_ACCESS_KEY_ID=${{ secrets.S3_ACCESS_KEY }}
export AWS_SECRET_ACCESS_KEY=${{ secrets.S3_SECRET_KEY }}
python example/download.py
# - name: Test with pytest
# run: |
# conda install pytest
# pytest
# python example/download.py
- name: Test with pytest
run: |
poetry run pytest
9 changes: 6 additions & 3 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Lint Python package
on: [push]

jobs:
build-linux:
lint-checking:
runs-on: ubuntu-latest
strategy:
max-parallel: 5
Expand All @@ -14,10 +14,13 @@ jobs:
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Add libmagic for python-magic on linux
run: sudo apt-get install libmagic1
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: |
pip install -r requirements.txt
pip install .
poetry install --without dev
- name: Lint with flake8
run: |
cd cartiflette
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,6 @@ dmypy.json
# Setuptools vs. poetry
*.lock
.toml

*.sqlite
*.sqlite*
15 changes: 12 additions & 3 deletions cartiflette/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
from .utils import *
from .download import *
from .s3 import *
from cartiflette.config import (
BUCKET,
PATH_WITHIN_BUCKET,
ENDPOINT_URL,
FS,
THREADS_DOWNLOAD,
LEAVE_TQDM,
)
from cartiflette.constants import REFERENCES, DOWNLOAD_PIPELINE_ARGS
from cartiflette.utils import *
from cartiflette.download import *
from cartiflette.s3 import *
24 changes: 24 additions & 0 deletions cartiflette/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
"""Central configuration constants for cartiflette's S3 (s3fs) access."""
from dotenv import load_dotenv
import os
import s3fs

# Allow credentials to be supplied through a local .env file
# (expected keys: token, key, secret).
load_dotenv()

BUCKET = "projet-cartiflette"
PATH_WITHIN_BUCKET = "diffusion/shapefiles-test4"
ENDPOINT_URL = "https://minio.lab.sspcloud.fr"

# Forward only the credentials actually present in the environment;
# missing ones are simply omitted so s3fs falls back to its defaults.
kwargs = {
    credential: os.environ[credential]
    for credential in ("token", "secret", "key")
    if credential in os.environ
}
FS = s3fs.S3FileSystem(client_kwargs={"endpoint_url": ENDPOINT_URL}, **kwargs)

THREADS_DOWNLOAD = 5
# Nota : each thread may also span the same number of children threads;
# set to 1 for debugging purposes (will deactivate multithreading)

LEAVE_TQDM = False
74 changes: 74 additions & 0 deletions cartiflette/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""Geographic reference frames and preset arguments for the download pipeline."""

import geopandas as gpd
import logging
from shapely.geometry import box


logger = logging.getLogger(__name__)

# Bounding box (lon_min, lat_min, lon_max, lat_max) per territory, EPSG:4326.
# Boxes were generated with https://boundingbox.klokantech.com/
_BOUNDING_BOXES = {
    "metropole": (-5.45, 41.26, 9.83, 51.31),
    "guyane": (-54.6, 2.11, -51.5, 5.98),
    "martinique": (-61.4355, 14.2217, -60.6023, 15.0795),
    "guadeloupe": (-62.018, 15.6444, -60.792, 16.714),
    "reunion": (55.0033, -21.5904, 56.0508, -20.6728),
    "mayotte": (44.7437, -13.2733, 45.507, -12.379),
    "saint_pierre_et_miquelon": (-56.6975, 46.5488, -55.9066, 47.3416),
}

# One row per territory, same insertion order as the mapping above.
REFERENCES = gpd.GeoDataFrame(
    [
        {"location": location, "geometry": box(*bounds)}
        for location, bounds in _BOUNDING_BOXES.items()
    ],
    crs=4326,
)

# Preset (provider, source, family, territories) argument tuples consumed by
# the download pipeline, keyed by a human-readable dataset label.
DOWNLOAD_PIPELINE_ARGS = {
    "ADMIN-EXPRESS": [
        "IGN",
        "ADMINEXPRESS",
        "EXPRESS-COG-TERRITOIRE",
        [
            "guadeloupe",
            "martinique",
            "guyane",
            "reunion",
            "mayotte",
            "metropole",
        ],
    ],
    "BDTOPO": ["IGN", "BDTOPO", "ROOT", "france_entiere"],
    "IRIS": ["IGN", "CONTOUR-IRIS", "ROOT", None],
    "COG": [
        "Insee",
        "COG",
        [
            "COMMUNE",
            "CANTON",
            "ARRONDISSEMENT",
            "DEPARTEMENT",
            "REGION",
            "COLLECTIVITE",
            "PAYS",
        ],
        "france_entiere",
    ],
    "BV 2022": ["Insee", "BV", "FondsDeCarte_BV_2022", "france_entiere"],
    "BV 2012": ["Insee", "BV", "FondsDeCarte_BV_2012", "france_entiere"],
}

# EXPRESS-COG ?
# EXPRESS-COG-CARTO-TERRITOIRE ?
# EXPRESS-COG-CARTO ?
38 changes: 9 additions & 29 deletions cartiflette/download/__init__.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,14 @@
from .dev import (
# create_url_adminexpress,
get_vectorfile_ign,
# get_administrative_level_available_ign,
store_vectorfile_ign,
get_vectorfile_communes_arrondissement,
# get_BV,
get_cog_year,
)
# from cartiflette.download.dev import (
# get_vectorfile_communes_arrondissement,
# # get_BV,
# )


from .download import (
Dataset,
BaseScraper,
HttpScraper,
FtpScraper,
MasterScraper,
download_sources,
from cartiflette.download.pipeline import (
download_all,
)


__all__ = [
# "create_url_adminexpress",
"get_vectorfile_ign",
# "get_administrative_level_available_ign",
"store_vectorfile_ign",
"get_vectorfile_communes_arrondissement",
# "get_BV",
"get_cog_year",
"Dataset",
"BaseScraper",
"HttpScraper",
"FtpScraper",
"MasterScraper",
"download_sources",
"download_all",
]
Loading

0 comments on commit 1c90dd3

Please sign in to comment.