From 6d3d8221f760781372631c61fad4fd13e13603dd Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 10 Dec 2024 21:37:50 +0100 Subject: [PATCH] Download ECCO files using `Downloads` and `.netrc` files (#281) * this should work * better naming * only one download * add download test * joinpath does not work on windows * test also downloading the bathymetry * test dowloading bathymetry * restore tests * gracefull downloading * try it now * fix typo * make sure we delete the previous data before testing the download * should work * test distributed downloading * Update test_distributed_utils.jl * fix the download * generalize the downloader * generalize more * generalize filename * download_progress is part of the downloading utilities * better docstring * better docstring * change docstring * fix tests * distribute among tasks * whoops added wrong file * correct looping * bugfix --- .github/workflows/ci.yml | 3 ++ .gitignore | 3 ++ src/Bathymetry.jl | 15 ++---- src/DataWrangling/DataWrangling.jl | 47 +++++++++++++++++++ src/DataWrangling/ECCO/ECCO.jl | 2 +- src/DataWrangling/ECCO/ECCO_metadata.jl | 62 ++++++++++++++----------- src/DataWrangling/JRA55.jl | 2 +- src/distributed_utils.jl | 17 +++---- test/runtests.jl | 6 +-- test/runtests_setup.jl | 7 ++- test/test_distributed_utils.jl | 15 ++++++ test/test_downloading.jl | 22 +++++++++ 12 files changed, 149 insertions(+), 52 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ae713c5b..7aebb4ad 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,6 +19,9 @@ jobs: arch: - x64 include: + - os: windows-latest + arch: x86 + version: '1.10' - os: macOS-latest arch: arm64 version: '1.10' diff --git a/.gitignore b/.gitignore index 997ed814..3c3ca28d 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,9 @@ docs/src/literated/ *.svg *.gif +# Password files +*.netrc + # File generated by Pkg, the package manager, based on a corresponding Project.toml # It records a 
fixed state of all packages used by the project. As such, it should not be # committed for packages, but should be committed for applications that require a static diff --git a/src/Bathymetry.jl b/src/Bathymetry.jl index 970b49f1..ea9a358a 100644 --- a/src/Bathymetry.jl +++ b/src/Bathymetry.jl @@ -93,17 +93,10 @@ function regrid_bathymetry(target_grid; major_basins = Inf) # Allow an `Inf` number of ``lakes'' filepath = joinpath(dir, filename) - fileurl = joinpath(url, filename) - - @root begin # perform all this only on rank 0, aka the "root" rank - if !isfile(filepath) - try - Downloads.download(fileurl, filepath; progress=download_progress, verbose=true) - catch - cmd = `wget --no-check-certificate -O $filepath $fileurl` - @root run(cmd) - end - end + fileurl = url * "/" * filename # joinpath on windows creates the wrong url + + @root if !isfile(filepath) # perform all this only on rank 0, aka the "root" rank + Downloads.download(fileurl, filepath; progress=download_progress) end dataset = Dataset(filepath) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 62fc00c7..792c07b3 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -13,6 +13,10 @@ using Oceananigans: pretty_filesize, location using Oceananigans.Utils: launch! using KernelAbstractions: @kernel, @index +##### +##### Downloading utilities +##### + next_fraction = Ref(0.0) download_start_time = Ref(time_ns()) @@ -49,6 +53,49 @@ function download_progress(total, now; filename="") return nothing end +""" + netrc_downloader(username, password, machine, dir) + +Create a downloader that uses a netrc file to authenticate with the given machine. +This downloader writes the username and password in a file named `auth.netrc` (for Unix) and +`auth_netrc` (for Windows), located in the directory `dir`. 
+To avoid leaving the password on disk after the downloader has been used, +it is recommended to initialize the downloader in a temporary directory, which will be removed +after the download is complete. + +For example: + +``` +mktempdir(dir) do tmp + downloader = netrc_downloader(username, password, machine, tmp) + Downloads.download(fileurl, filepath; downloader) +end +``` +""" +function netrc_downloader(username, password, machine, dir) + netrc_file = netrc_permission_file(username, password, machine, dir) + downloader = Downloads.Downloader() + easy_hook = (easy, _) -> Downloads.Curl.setopt(easy, Downloads.Curl.CURLOPT_NETRC_FILE, netrc_file) + + downloader.easy_hook = easy_hook + return downloader +end + +# Code snippet adapted from https://github.com/evetion/SpaceLiDAR.jl/blob/master/src/utils.jl#L150 +function netrc_permission_file(username, password, machine, dir) + if Sys.iswindows() + filepath = joinpath(dir, "auth_netrc") + else + filepath = joinpath(dir, "auth.netrc") + end + + open(filepath, "a") do f + write(f, "machine $machine login $username password $password\n") + end + + return filepath +end + ##### ##### FieldTimeSeries utilities ##### diff --git a/src/DataWrangling/ECCO/ECCO.jl b/src/DataWrangling/ECCO/ECCO.jl index a02bd07a..96e3415e 100644 --- a/src/DataWrangling/ECCO/ECCO.jl +++ b/src/DataWrangling/ECCO/ECCO.jl @@ -6,7 +6,7 @@ export ECCORestoring, LinearlyTaperedPolarMask using ClimaOcean using ClimaOcean.DataWrangling -using ClimaOcean.DataWrangling: inpaint_mask!, NearestNeighborInpainting +using ClimaOcean.DataWrangling: inpaint_mask!, NearestNeighborInpainting, download_progress using ClimaOcean.InitialConditions: three_dimensional_regrid!, interpolate! 
using Oceananigans diff --git a/src/DataWrangling/ECCO/ECCO_metadata.jl b/src/DataWrangling/ECCO/ECCO_metadata.jl index 161c73af..b9f78ddb 100644 --- a/src/DataWrangling/ECCO/ECCO_metadata.jl +++ b/src/DataWrangling/ECCO/ECCO_metadata.jl @@ -1,10 +1,12 @@ using CFTime using Dates using ClimaOcean.DataWrangling +using ClimaOcean.DataWrangling: netrc_downloader import Dates: year, month, day using Base: @propagate_inbounds +using Downloads import Oceananigans.Fields: set!, location import Base @@ -79,7 +81,6 @@ ECCOMetadata(name::Symbol, date, version=ECCO4Monthly(); dir=download_ECCO_cache ECCOMetadata(name, date, version, dir) # Treat ECCOMetadata as an array to allow iteration over the dates. -Base.length(metadata::ECCOMetadata) = length(metadata.dates) Base.eltype(metadata::ECCOMetadata) = Base.eltype(metadata.dates) @propagate_inbounds Base.getindex(m::ECCOMetadata, i::Int) = ECCOMetadata(m.name, m.dates[i], m.version, m.dir) @@ -100,10 +101,12 @@ Base.last(metadata::ECCOMetadata{<:AbstractCFDateTime}) = metadata Base.iterate(metadata::ECCOMetadata{<:AbstractCFDateTime}) = (metadata, nothing) Base.iterate(::ECCOMetadata{<:AbstractCFDateTime}, ::Any) = nothing +Base.length(metadata::ECCOMetadata) = length(metadata.dates) Base.size(data::ECCOMetadata{<:Any, <:ECCO2Daily}) = (1440, 720, 50, length(data.dates)) Base.size(data::ECCOMetadata{<:Any, <:ECCO2Monthly}) = (1440, 720, 50, length(data.dates)) Base.size(data::ECCOMetadata{<:Any, <:ECCO4Monthly}) = (720, 360, 50, length(data.dates)) +Base.length(metadata::ECCOMetadata{<:AbstractCFDateTime}) = 1 Base.size(::ECCOMetadata{<:AbstractCFDateTime, <:ECCO2Daily}) = (1440, 720, 50, 1) Base.size(::ECCOMetadata{<:AbstractCFDateTime, <:ECCO2Monthly}) = (1440, 720, 50, 1) Base.size(::ECCOMetadata{<:AbstractCFDateTime, <:ECCO4Monthly}) = (720, 360, 50, 1) @@ -144,12 +147,12 @@ short_name(data::ECCOMetadata{<:Any, <:ECCO2Daily}) = ECCO2_short_names[data.n short_name(data::ECCOMetadata{<:Any, <:ECCO2Monthly}) = 
ECCO2_short_names[data.name] short_name(data::ECCOMetadata{<:Any, <:ECCO4Monthly}) = ECCO4_short_names[data.name] -metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Daily}) = joinpath(prefix, short_name(m), metadata_filename(m)) -metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Monthly}) = joinpath(prefix, short_name(m), metadata_filename(m)) +metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Daily}) = prefix * "/" * short_name(m) * "/" * metadata_filename(m) +metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Monthly}) = prefix * "/" * short_name(m) * "/" * metadata_filename(m) function metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO4Monthly}) year = string(Dates.year(m.dates)) - return joinpath(prefix, short_name(m), year, metadata_filename(m)) + return prefix * "/" * short_name(m) * "/" * year * "/" * metadata_filename(m) end location(data::ECCOMetadata) = ECCO_location[data.name] @@ -217,30 +220,37 @@ function download_dataset(metadata::ECCOMetadata; url = urls(metadata)) username = get(ENV, "ECCO_USERNAME", nothing) password = get(ENV, "ECCO_PASSWORD", nothing) dir = metadata.dir - - @distribute for metadatum in metadata # Distribute the download among ranks if MPI is initialized - - fileurl = metadata_url(url, metadatum) - filepath = metadata_path(metadatum) - - if !isfile(filepath) - instructions_msg = "\n See ClimaOcean.jl/src/ECCO/README.md for instructions." - if isnothing(username) - msg = "Could not find the ECCO_PASSWORD environment variable. \ - See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \ - and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg - throw(ArgumentError(msg)) - elseif isnothing(password) - msg = "Could not find the ECCO_PASSWORD environment variable. \ - See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \ - and setting your ECCO_USERNAME and ECCO_PASSWORD." 
* instructions_msg - throw(ArgumentError(msg)) + + # Create a temporary directory to store the .netrc file + # The directory will be deleted after the download is complete + @root mktempdir(dir) do tmp + + # Write down the username and password in a .netrc file + downloader = netrc_downloader(username, password, "ecco.jpl.nasa.gov", tmp) + + asyncmap(metadata, ntasks=10) do metadatum # Distribute the download among tasks + + fileurl = metadata_url(url, metadatum) + filepath = metadata_path(metadatum) + + if !isfile(filepath) + instructions_msg = "\n See ClimaOcean.jl/src/ECCO/README.md for instructions." + if isnothing(username) + msg = "Could not find the ECCO_PASSWORD environment variable. \ + See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \ + and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg + throw(ArgumentError(msg)) + elseif isnothing(password) + msg = "Could not find the ECCO_PASSWORD environment variable. \ + See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \ + and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg + throw(ArgumentError(msg)) + end + + Downloads.download(fileurl, filepath; downloader, progress=download_progress) end - - cmd = `wget --http-user=$(username) --http-passwd=$(password) --directory-prefix=$dir $fileurl` - run(cmd) end end - + return nothing end diff --git a/src/DataWrangling/JRA55.jl b/src/DataWrangling/JRA55.jl index ce8daa0f..6586a727 100644 --- a/src/DataWrangling/JRA55.jl +++ b/src/DataWrangling/JRA55.jl @@ -391,7 +391,7 @@ function JRA55_field_time_series(variable_name; # Note, we don't re-use existing jld2 files. 
@root begin - isfile(filepath) || download(url, filepath) + isfile(filepath) || download(url, filepath; progress=download_progress) isfile(jld2_filepath) && rm(jld2_filepath) end diff --git a/src/distributed_utils.jl b/src/distributed_utils.jl index f24d6722..a05612b5 100644 --- a/src/distributed_utils.jl +++ b/src/distributed_utils.jl @@ -6,10 +6,11 @@ using MPI ##### # Utilities to make the macro work importing only ClimaOcean and not MPI -mpi_initialized() = MPI.Initialized() -mpi_rank(comm) = MPI.Comm_rank(comm) -mpi_size(comm) = MPI.Comm_size(comm) -global_barrier(comm) = MPI.Barrier(comm) +mpi_initialized() = MPI.Initialized() +mpi_rank(comm) = MPI.Comm_rank(comm) +mpi_size(comm) = MPI.Comm_size(comm) +global_barrier(comm) = MPI.Barrier(comm) +global_communicator() = MPI.COMM_WORLD """ @root communicator exs... @@ -35,7 +36,7 @@ end macro root(exp) command = quote - @root MPI.COMM_WORLD $exp + @root ClimaOcean.global_communicator() $exp end return esc(command) end @@ -67,7 +68,7 @@ end macro onrank(rank, exp) command = quote - @onrank MPI.COMM_WORLD $rank $exp + @onrank ClimaOcean.global_communicator() $rank $exp end return esc(command) end @@ -116,7 +117,7 @@ end macro distribute(exp) command = quote - @distribute MPI.COMM_WORLD $exp + @distribute ClimaOcean.global_communicator() $exp end return esc(command) end @@ -149,7 +150,7 @@ end macro handshake(exp) command = quote - @handshake MPI.COMM_WORLD $exp + @handshake ClimaOcean.global_communicator() $exp end return esc(command) end diff --git a/test/runtests.jl b/test/runtests.jl index cdeec98d..0c71ec17 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,9 +13,9 @@ if test_group == :init || test_group == :all CUDA.set_runtime_version!(v"12.6"; local_toolkit = true) CUDA.precompile_runtime() - #### - #### Download bathymetry data - #### + ### + ### Download bathymetry data + ### download_bathymetry() diff --git a/test/runtests_setup.jl b/test/runtests_setup.jl index e815da71..f8d83ebd 100644 --- 
a/test/runtests_setup.jl +++ b/test/runtests_setup.jl @@ -12,6 +12,7 @@ using Oceananigans.Architectures: architecture, on_architecture using Oceananigans.OutputReaders: interpolate! using ClimaOcean +using ClimaOcean.Bathymetry: download_bathymetry_cache using CFTime using Dates @@ -28,13 +29,15 @@ temperature_metadata = ECCOMetadata(:temperature, dates) salinity_metadata = ECCOMetadata(:salinity, dates) # Fictitious grid that triggers bathymetry download -function download_bathymetry() +function download_bathymetry(; dir = download_bathymetry_cache, + filename = "ETOPO_2022_v1_60s_N90W180_surface.nc") + grid = LatitudeLongitudeGrid(size = (10, 10, 1), longitude = (0, 100), latitude = (0, 50), z = (-6000, 0)) - bottom = regrid_bathymetry(grid) + bottom = regrid_bathymetry(grid; dir, filename) return nothing end diff --git a/test/test_distributed_utils.jl b/test/test_distributed_utils.jl index 8e9d8be7..db23d14f 100644 --- a/test/test_distributed_utils.jl +++ b/test/test_distributed_utils.jl @@ -3,6 +3,10 @@ include("runtests_setup.jl") using MPI MPI.Init() +using ClimaOcean.ECCO: download_dataset, metadata_path +using CFTime +using Dates + @testset begin rank = MPI.Comm_rank(MPI.COMM_WORLD) @@ -47,6 +51,7 @@ MPI.Init() @onrank 3 begin @test a == [4, 8] end + split_comm = MPI.Comm_split(MPI.COMM_WORLD, rank % 2, rank) @@ -59,3 +64,13 @@ MPI.Init() @onrank split_comm 0 @test a == [1, 3, 5, 7, 9] @onrank split_comm 1 @test a == [2, 4, 6, 8, 10] end + +@testset "Distributed ECCO download" begin + dates = DateTimeProlepticGregorian(1992, 1, 1) : Month(1) : DateTimeProlepticGregorian(1994, 4, 1) + metadata = ECCOMetadata(:u_velocity; dates) + download_dataset(metadata) + + @root for metadatum in metadata + @test isfile(metadata_path(metadatum)) + end +end diff --git a/test/test_downloading.jl b/test/test_downloading.jl index 7bc53331..9f12846a 100644 --- a/test/test_downloading.jl +++ b/test/test_downloading.jl @@ -1,8 +1,30 @@ include("runtests_setup.jl") +using 
ClimaOcean.ECCO: metadata_path + @testset "Availability of JRA55 data" begin @info "Testing that we can download all the JRA55 data..." for name in ClimaOcean.DataWrangling.JRA55.JRA55_variable_names + fts = ClimaOcean.JRA55.JRA55_field_time_series(name; time_indices=2:3) end end + +@testset "Availability of ECCO data" begin + @info "Testing that we can download ECCO data..." + for variable in keys(ClimaOcean.ECCO.ECCO4_short_names) + metadata = ECCOMetadata(variable) + filepath = metadata_path(metadata) + isfile(filepath) && rm(filepath; force=true) + ClimaOcean.ECCO.download_dataset(metadata) + end +end + +@testset "Availability of the Bathymetry" begin + @info "Testing that we can download the bathymetry..." + dir="./" + filename="ETOPO_2022_v1_60s_N90W180_surface.nc" + filepath=joinpath(dir, filename) + isfile(filepath) && rm(filepath; force=true) + download_bathymetry(; dir, filename) +end