Skip to content

Commit

Permalink
Test new httpfs-sync and concurrent download (#170)
Browse files: browse the repository at this point in the history
* Test new httpfs-sync and concurrent download

* Update requirements.txt

* Update iids_pr.yaml

* Update recipe.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update config-pgf-runner-leap-dataflow.json

* Update config-pgf-runner-leap-dataflow-pr.json

* Update iids_pr.yaml

* reactivate high res mip iids

* Update iids.yaml

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
jbusecke and pre-commit-ci[bot] authored Jun 4, 2024
1 parent a9682b7 commit 661839a
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 21 deletions.
4 changes: 0 additions & 4 deletions configs/config-pgf-runner-leap-dataflow-pr.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,5 @@
"TargetStorage": {
"fsspec_class": "gcsfs.GCSFileSystem",
"root_path": "gs://leap-scratch/data-library/cmip6-pgf-ingestion/pr/{job_name}"
},
"InputCacheStorage": {
"fsspec_class": "gcsfs.GCSFileSystem",
"root_path": "gs://leap-scratch/data-library/cmip6-pgf-ingestion/cache"
}
}
4 changes: 0 additions & 4 deletions configs/config-pgf-runner-leap-dataflow.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,5 @@
"TargetStorage": {
"fsspec_class": "gcsfs.GCSFileSystem",
"root_path": "gs://leap-scratch/data-library/cmip6-pgf-ingestion/jobs/{job_name}"
},
"InputCacheStorage": {
"fsspec_class": "gcsfs.GCSFileSystem",
"root_path": "gs://leap-scratch/data-library/cmip6-pgf-ingestion/cache"
}
}
16 changes: 8 additions & 8 deletions feedstock/iids.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# from https://github.com/Timh37/CMIP6cex/issues/2
# - "CMIP6.*.*.*.[historical, ssp245, ssp585].*.day.[psl, pr, sfcWind].*.*"
# # from https://github.com/leap-stc/cmip6-leap-feedstock/issues/144
# - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*'
# - 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*'
# - 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*'
# from https://github.com/leap-stc/cmip6-leap-feedstock/issues/144
- 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*'
- 'CMIP6.HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.[hist-1950,highres-future].r1i1p2f1.6hrPlevPt.[vas,uas,psl].gr.*'
- 'CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HM.[hist-1950,highres-future].r1i1p1f1.[3hr, E3hr].[vas,uas,psl].gn.*'
# From https://github.com/leap-stc/cmip6-leap-feedstock/issues/116
- "CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.tas.gn.*"
# from https://github.com/pangeo-forge/cmip6-feedstock/issues/22
Expand All @@ -14,8 +14,8 @@
- "CMIP6.*.*.*.[historical, ssp126, ssp245, ssp585].*.SImon.[sifb,siitdthick].*.*"
# for CMIP6 pco2 testbed (these got stuck in the client somehow)
- "CMIP6.*.*.*.[historical, ssp245].*.Omon.[spco2, dpco2,tos,sos,chl,mlotst].*.*"
# # from https://github.com/leap-stc/cmip6-leap-feedstock/issues/72
# - "CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HH.[highres-future,hist-1950].r1i1p1f1.Omon.[thetao,so].gn.*"
# from https://github.com/leap-stc/cmip6-leap-feedstock/issues/72
- "CMIP6.HighResMIP.MOHC.HadGEM3-GC31-HH.[highres-future,hist-1950].r1i1p1f1.Omon.[thetao,so].gn.*"
# # From https://github.com/leap-stc/cmip6-leap-feedstock/issues/101 (waiting until we have better download logic)
# - "CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r4i1p1f1.3hr.pr.gn.v20210607"
# - "CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r4i1p1f1.6hrLev.hus.gn.v20210921"
Expand Down Expand Up @@ -61,5 +61,5 @@
# - "CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.3hr.pr.gn.v20201013"
# - "CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.6hrLev.ta.gn.v20201112"
# - "CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.6hrLev.hus.gn.v202011"
# # From https://github.com/leap-stc/cmip6-leap-feedstock/issues/123
# - "CMIP6.HighResMIP*.*.[highresSST-present,hist-1950].*.Amon.[pr, ts].gn.*"
# From https://github.com/leap-stc/cmip6-leap-feedstock/issues/123
- "CMIP6.HighResMIP.*.*.[highresSST-present,hist-1950].*.Amon.[pr, ts].gn.*"
1 change: 1 addition & 0 deletions feedstock/iids_pr.yaml
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
- "CMIP6.*.*.[CNRM-CM6-1,CanESM5].historical.*.Omon.[tos, so, o2].*.*"
# - 'CMIP6.HighResMIP.CMCC.CMCC-CM2-VHR4.[hist-1950,highres-future].r1i1p1f1.6hrPlevPt.[vas,uas,psl].gn.*'
17 changes: 16 additions & 1 deletion feedstock/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,15 @@
StoreToZarr,
ConsolidateMetadata,
ConsolidateDimensionCoordinates,
CheckpointFileTransfer,
)
from pangeo_forge_recipes.storage import CacheFSSpecTarget

import logging
import asyncio
import os
import yaml
import gcsfs

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -146,12 +150,23 @@ async def get_recipe_inputs():
## Create the recipes
recipes = {}

cache_target = CacheFSSpecTarget(
fs=gcsfs.GCSFileSystem(),
root_path="gs://leap-scratch/data-library/cmip6-pgf-ingestion/cache",
)

for iid, data in recipe_data.items():
urls = get_sorted_http_urls_from_iid_dict(data)
pattern = pattern_from_file_sequence(urls, concat_dim="time")
recipes[iid] = (
f"Creating {iid}" >> beam.Create(pattern.items())
| OpenURLWithFSSpec(max_concurrency=8)
| CheckpointFileTransfer(
transfer_target=cache_target,
max_executors=2,
concurrency_per_executor=8,
fsspec_sync_patch=True,
)
| OpenURLWithFSSpec(cache=None, fsspec_sync_patch=True)
# do not specify file type to accommodate both ncdf3 and ncdf4
| OpenWithXarray(xarray_open_kwargs={"use_cftime": True})
| Preprocessor()
Expand Down
6 changes: 2 additions & 4 deletions feedstock/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@ leap-data-management-utils==0.0.12
#pangeo-forge-esgf==0.2.0
git+https://github.com/jbusecke/pangeo-forge-esgf.git@new-request-scheme
dynamic-chunks==0.0.3
git+https://github.com/ranchodeluxe/xarray@ranchodeluxe-patch-1#egg=xarray
git+https://github.com/ranchodeluxe/rioxarray
git+https://github.com/ranchodeluxe/datatree@main#egg=xarray-datatree
git+https://github.com/pangeo-forge/pangeo-forge-recipes@jb/xarray-hack #see @gc/cached_disabled but with cache
git+https://github.com/pangeo-forge/pangeo-forge-recipes@feature/concurrency-control
zarr==2.16.1
gcsfs
apache-beam[gcp]
httpfs-sync

0 comments on commit 661839a

Please sign in to comment.