Skip to content

Commit

Permalink
Download ECCO files using Downloads and .netrc files (#281)
Browse files Browse the repository at this point in the history
* this should work

* better naming

* only one download

* add download test

* joinpath does not work on windows

* test also downloading the bathymetry

* test downloading bathymetry

* restore tests

* graceful downloading

* try it now

* fix typo

* make sure we delete the previous data before testing the download

* should work

* test distributed downloading

* Update test_distributed_utils.jl

* fix the download

* generalize the downloader

* generalize more

* generalize filename

* download_progress is part of the downloading utilities

* better docstring

* better docstring

* change docstring

* fix tests

* distribute among tasks

* whoops added wrong file

* correct looping

* bugfix
  • Loading branch information
simone-silvestri authored Dec 10, 2024
1 parent b73c67c commit 6d3d822
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 52 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ jobs:
arch:
- x64
include:
- os: windows-latest
arch: x86
version: '1.10'
- os: macOS-latest
arch: arm64
version: '1.10'
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ docs/src/literated/
*.svg
*.gif

# Password files
*.netrc

# File generated by Pkg, the package manager, based on a corresponding Project.toml
# It records a fixed state of all packages used by the project. As such, it should not be
# committed for packages, but should be committed for applications that require a static
Expand Down
15 changes: 4 additions & 11 deletions src/Bathymetry.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,17 +93,10 @@ function regrid_bathymetry(target_grid;
major_basins = Inf) # Allow an `Inf` number of ``lakes''

filepath = joinpath(dir, filename)
fileurl = joinpath(url, filename)

@root begin # perform all this only on rank 0, aka the "root" rank
if !isfile(filepath)
try
Downloads.download(fileurl, filepath; progress=download_progress, verbose=true)
catch
cmd = `wget --no-check-certificate -O $filepath $fileurl`
@root run(cmd)
end
end
fileurl = url * "/" * filename # joinpath on windows creates the wrong url

@root if !isfile(filepath) # perform all this only on rank 0, aka the "root" rank
Downloads.download(fileurl, filepath; progress=download_progress)
end

dataset = Dataset(filepath)
Expand Down
47 changes: 47 additions & 0 deletions src/DataWrangling/DataWrangling.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ using Oceananigans: pretty_filesize, location
using Oceananigans.Utils: launch!
using KernelAbstractions: @kernel, @index

#####
##### Downloading utilities
#####

next_fraction = Ref(0.0)
download_start_time = Ref(time_ns())

Expand Down Expand Up @@ -49,6 +53,49 @@ function download_progress(total, now; filename="")
return nothing
end

"""
    netrc_downloader(username, password, machine, dir)

Create a downloader that uses a netrc file to authenticate with the given `machine`.

This downloader writes the username and password to a file named `auth.netrc` (on Unix) or
`auth_netrc` (on Windows), located in the directory `dir`.

To avoid leaving the password on disk after the downloader has been used,
it is recommended to initialize the downloader in a temporary directory, which will be
removed after the download is complete.

For example:

```
mktempdir(dir) do tmp
    downloader = netrc_downloader(username, password, machine, tmp)
    Downloads.download(fileurl, filepath; downloader)
end
```
"""
function netrc_downloader(username, password, machine, dir)
    credentials_path = netrc_permission_file(username, password, machine, dir)

    # The hook sets libcurl's CURLOPT_NETRC_FILE option on the easy handle it receives,
    # pointing it to the credentials file written above.
    function use_netrc_file(easy, _)
        return Downloads.Curl.setopt(easy, Downloads.Curl.CURLOPT_NETRC_FILE, credentials_path)
    end

    downloader = Downloads.Downloader()
    downloader.easy_hook = use_netrc_file

    return downloader
end

# Code snippet adapted from https://github.com/evetion/SpaceLiDAR.jl/blob/master/src/utils.jl#L150
"""
    netrc_permission_file(username, password, machine, dir)

Append a netrc entry of the form `machine <machine> login <username> password <password>`
to the credentials file located in `dir` and return the path of that file.

The file is named `auth_netrc` on Windows and `auth.netrc` on every other system.
On non-Windows systems the file is restricted to its owner (mode `0o600`) before
the password is written, because it stores the password in plain text.
"""
function netrc_permission_file(username, password, machine, dir)
    filename = Sys.iswindows() ? "auth_netrc" : "auth.netrc"
    filepath = joinpath(dir, filename)

    # Create the file (if needed) and lock down its permissions *before* writing the
    # password, so that the credentials are never readable by other users.
    touch(filepath)
    Sys.iswindows() || chmod(filepath, 0o600)

    # Append, so that entries already present in the file are preserved.
    open(filepath, "a") do f
        write(f, "machine $machine login $username password $password\n")
    end

    return filepath
end

#####
##### FieldTimeSeries utilities
#####
Expand Down
2 changes: 1 addition & 1 deletion src/DataWrangling/ECCO/ECCO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export ECCORestoring, LinearlyTaperedPolarMask

using ClimaOcean
using ClimaOcean.DataWrangling
using ClimaOcean.DataWrangling: inpaint_mask!, NearestNeighborInpainting
using ClimaOcean.DataWrangling: inpaint_mask!, NearestNeighborInpainting, download_progress
using ClimaOcean.InitialConditions: three_dimensional_regrid!, interpolate!

using Oceananigans
Expand Down
62 changes: 36 additions & 26 deletions src/DataWrangling/ECCO/ECCO_metadata.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
using CFTime
using Dates
using ClimaOcean.DataWrangling
using ClimaOcean.DataWrangling: netrc_downloader

import Dates: year, month, day

using Base: @propagate_inbounds
using Downloads

import Oceananigans.Fields: set!, location
import Base
Expand Down Expand Up @@ -79,7 +81,6 @@ ECCOMetadata(name::Symbol, date, version=ECCO4Monthly(); dir=download_ECCO_cache
ECCOMetadata(name, date, version, dir)

# Treat ECCOMetadata as an array to allow iteration over the dates.
Base.length(metadata::ECCOMetadata) = length(metadata.dates)
Base.eltype(metadata::ECCOMetadata) = Base.eltype(metadata.dates)

@propagate_inbounds Base.getindex(m::ECCOMetadata, i::Int) = ECCOMetadata(m.name, m.dates[i], m.version, m.dir)
Expand All @@ -100,10 +101,12 @@ Base.last(metadata::ECCOMetadata{<:AbstractCFDateTime}) = metadata
Base.iterate(metadata::ECCOMetadata{<:AbstractCFDateTime}) = (metadata, nothing)
Base.iterate(::ECCOMetadata{<:AbstractCFDateTime}, ::Any) = nothing

Base.length(metadata::ECCOMetadata) = length(metadata.dates)
Base.size(data::ECCOMetadata{<:Any, <:ECCO2Daily}) = (1440, 720, 50, length(data.dates))
Base.size(data::ECCOMetadata{<:Any, <:ECCO2Monthly}) = (1440, 720, 50, length(data.dates))
Base.size(data::ECCOMetadata{<:Any, <:ECCO4Monthly}) = (720, 360, 50, length(data.dates))

Base.length(metadata::ECCOMetadata{<:AbstractCFDateTime}) = 1
Base.size(::ECCOMetadata{<:AbstractCFDateTime, <:ECCO2Daily}) = (1440, 720, 50, 1)
Base.size(::ECCOMetadata{<:AbstractCFDateTime, <:ECCO2Monthly}) = (1440, 720, 50, 1)
Base.size(::ECCOMetadata{<:AbstractCFDateTime, <:ECCO4Monthly}) = (720, 360, 50, 1)
Expand Down Expand Up @@ -144,12 +147,12 @@ short_name(data::ECCOMetadata{<:Any, <:ECCO2Daily}) = ECCO2_short_names[data.n
short_name(data::ECCOMetadata{<:Any, <:ECCO2Monthly}) = ECCO2_short_names[data.name]
short_name(data::ECCOMetadata{<:Any, <:ECCO4Monthly}) = ECCO4_short_names[data.name]

metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Daily}) = joinpath(prefix, short_name(m), metadata_filename(m))
metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Monthly}) = joinpath(prefix, short_name(m), metadata_filename(m))
metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Daily}) = prefix * "/" * short_name(m) * "/" * metadata_filename(m)
metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO2Monthly}) = prefix * "/" * short_name(m) * "/" * metadata_filename(m)

function metadata_url(prefix, m::ECCOMetadata{<:Any, <:ECCO4Monthly})
year = string(Dates.year(m.dates))
return joinpath(prefix, short_name(m), year, metadata_filename(m))
return prefix * "/" * short_name(m) * "/" * year * "/" * metadata_filename(m)
end

location(data::ECCOMetadata) = ECCO_location[data.name]
Expand Down Expand Up @@ -217,30 +220,37 @@ function download_dataset(metadata::ECCOMetadata; url = urls(metadata))
username = get(ENV, "ECCO_USERNAME", nothing)
password = get(ENV, "ECCO_PASSWORD", nothing)
dir = metadata.dir

@distribute for metadatum in metadata # Distribute the download among ranks if MPI is initialized

fileurl = metadata_url(url, metadatum)
filepath = metadata_path(metadatum)

if !isfile(filepath)
instructions_msg = "\n See ClimaOcean.jl/src/ECCO/README.md for instructions."
if isnothing(username)
msg = "Could not find the ECCO_PASSWORD environment variable. \
See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \
and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg
throw(ArgumentError(msg))
elseif isnothing(password)
msg = "Could not find the ECCO_PASSWORD environment variable. \
See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \
and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg
throw(ArgumentError(msg))

# Create a temporary directory to store the .netrc file
# The directory will be deleted after the download is complete
@root mktempdir(dir) do tmp

# Write down the username and password in a .netrc file
downloader = netrc_downloader(username, password, "ecco.jpl.nasa.gov", tmp)

asyncmap(metadata, ntasks=10) do metadatum # Distribute the download among tasks

fileurl = metadata_url(url, metadatum)
filepath = metadata_path(metadatum)

if !isfile(filepath)
# Both credentials are required: fail with an actionable message that names
# the environment variable which is actually missing.
instructions_msg = "\n See ClimaOcean.jl/src/ECCO/README.md for instructions."
if isnothing(username)
    # The username is read from ECCO_USERNAME, so that is the variable to report here.
    msg = "Could not find the ECCO_USERNAME environment variable. \
           See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \
           and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg
    throw(ArgumentError(msg))
elseif isnothing(password)
    msg = "Could not find the ECCO_PASSWORD environment variable. \
           See ClimaOcean.jl/src/ECCO/README.md for instructions on obtaining \
           and setting your ECCO_USERNAME and ECCO_PASSWORD." * instructions_msg
    throw(ArgumentError(msg))
end

Downloads.download(fileurl, filepath; downloader, progress=download_progress)
end

cmd = `wget --http-user=$(username) --http-passwd=$(password) --directory-prefix=$dir $fileurl`
run(cmd)
end
end

return nothing
end
2 changes: 1 addition & 1 deletion src/DataWrangling/JRA55.jl
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ function JRA55_field_time_series(variable_name;

# Note, we don't re-use existing jld2 files.
@root begin
isfile(filepath) || download(url, filepath)
isfile(filepath) || download(url, filepath; progress=download_progress)
isfile(jld2_filepath) && rm(jld2_filepath)
end

Expand Down
17 changes: 9 additions & 8 deletions src/distributed_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ using MPI
#####

# Utilities to make the macro work importing only ClimaOcean and not MPI
mpi_initialized() = MPI.Initialized()
mpi_rank(comm) = MPI.Comm_rank(comm)
mpi_size(comm) = MPI.Comm_size(comm)
global_barrier(comm) = MPI.Barrier(comm)
mpi_initialized() = MPI.Initialized()
mpi_rank(comm) = MPI.Comm_rank(comm)
mpi_size(comm) = MPI.Comm_size(comm)
global_barrier(comm) = MPI.Barrier(comm)
global_communicator() = MPI.COMM_WORLD

"""
@root communicator exs...
Expand All @@ -35,7 +36,7 @@ end

macro root(exp)
command = quote
@root MPI.COMM_WORLD $exp
@root ClimaOcean.global_communicator() $exp
end
return esc(command)
end
Expand Down Expand Up @@ -67,7 +68,7 @@ end

macro onrank(rank, exp)
command = quote
@onrank MPI.COMM_WORLD $rank $exp
@onrank ClimaOcean.global_communicator() $rank $exp
end
return esc(command)
end
Expand Down Expand Up @@ -116,7 +117,7 @@ end

macro distribute(exp)
command = quote
@distribute MPI.COMM_WORLD $exp
@distribute ClimaOcean.global_communicator() $exp
end
return esc(command)
end
Expand Down Expand Up @@ -149,7 +150,7 @@ end

macro handshake(exp)
command = quote
@handshake MPI.COMM_WORLD $exp
@handshake ClimaOcean.global_communicator() $exp
end
return esc(command)
end
6 changes: 3 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ if test_group == :init || test_group == :all
CUDA.set_runtime_version!(v"12.6"; local_toolkit = true)
CUDA.precompile_runtime()

####
#### Download bathymetry data
####
###
### Download bathymetry data
###

download_bathymetry()

Expand Down
7 changes: 5 additions & 2 deletions test/runtests_setup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ using Oceananigans.Architectures: architecture, on_architecture
using Oceananigans.OutputReaders: interpolate!

using ClimaOcean
using ClimaOcean.Bathymetry: download_bathymetry_cache
using CFTime
using Dates

Expand All @@ -28,13 +29,15 @@ temperature_metadata = ECCOMetadata(:temperature, dates)
salinity_metadata = ECCOMetadata(:salinity, dates)

# Fictitious grid that triggers bathymetry download
function download_bathymetry()
function download_bathymetry(; dir = download_bathymetry_cache,
filename = "ETOPO_2022_v1_60s_N90W180_surface.nc")

grid = LatitudeLongitudeGrid(size = (10, 10, 1),
longitude = (0, 100),
latitude = (0, 50),
z = (-6000, 0))

bottom = regrid_bathymetry(grid)
bottom = regrid_bathymetry(grid; dir, filename)

return nothing
end
15 changes: 15 additions & 0 deletions test/test_distributed_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ include("runtests_setup.jl")
using MPI
MPI.Init()

using ClimaOcean.ECCO: download_dataset, metadata_path
using CFTime
using Dates

@testset begin
rank = MPI.Comm_rank(MPI.COMM_WORLD)

Expand Down Expand Up @@ -47,6 +51,7 @@ MPI.Init()
@onrank 3 begin
@test a == [4, 8]
end


split_comm = MPI.Comm_split(MPI.COMM_WORLD, rank % 2, rank)

Expand All @@ -59,3 +64,13 @@ MPI.Init()
@onrank split_comm 0 @test a == [1, 3, 5, 7, 9]
@onrank split_comm 1 @test a == [2, 4, 6, 8, 10]
end

@testset "Distributed ECCO download" begin
    # Monthly dates from January 1992 through April 1994
    first_date = DateTimeProlepticGregorian(1992, 1, 1)
    last_date  = DateTimeProlepticGregorian(1994, 4, 1)
    metadata   = ECCOMetadata(:u_velocity; dates = first_date : Month(1) : last_date)

    download_dataset(metadata)

    # The presence of every downloaded file is verified on the root rank
    @root for datum in metadata
        @test isfile(metadata_path(datum))
    end
end
22 changes: 22 additions & 0 deletions test/test_downloading.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,30 @@
include("runtests_setup.jl")

using ClimaOcean.ECCO: metadata_path

@testset "Availability of JRA55 data" begin
    @info "Testing that we can download all the JRA55 data..."
    variable_names = ClimaOcean.DataWrangling.JRA55.JRA55_variable_names

    # Build a short time series for every JRA55 variable; the testset fails
    # if any of these calls throws.
    for variable_name in variable_names
        ClimaOcean.JRA55.JRA55_field_time_series(variable_name; time_indices=2:3)
    end
end

@testset "Availability of ECCO data" begin
    @info "Testing that we can download ECCO data..."
    for variable in keys(ClimaOcean.ECCO.ECCO4_short_names)
        metadata = ECCOMetadata(variable)
        filepath = metadata_path(metadata)

        # Remove any previously downloaded copy so that the download is actually exercised
        isfile(filepath) && rm(filepath; force=true)
        ClimaOcean.ECCO.download_dataset(metadata)

        # The download must have produced the expected file
        @test isfile(filepath)
    end
end

@testset "Availability of the Bathymetry" begin
    @info "Testing that we can download the bathymetry..."
    dir = "./"
    filename = "ETOPO_2022_v1_60s_N90W180_surface.nc"
    filepath = joinpath(dir, filename)

    # Remove any previously downloaded copy so that the download is actually exercised
    isfile(filepath) && rm(filepath; force=true)
    download_bathymetry(; dir, filename)

    # The download must have produced the expected file
    @test isfile(filepath)
end

0 comments on commit 6d3d822

Please sign in to comment.