Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 2-weekly slicing for 2022 #3

Merged
merged 6 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 69 additions & 59 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,72 +24,82 @@ pre-commit install
For making changes, see the [guidance on development](https://github.com/alan-turing-institute/python-project-template?tab=readme-ov-file#setting-up-a-new-project) from the template that generated this project.

## Usage
```bash
 cloudcasting download --help

Usage: cloudcasting download [OPTIONS] START_DATE END_DATE OUTPUT_DIRECTORY

Download a selection of the available EUMETSAT data.
Each calendar year of data within the supplied date range will be saved to a
separate file in the output directory.
Args: start_date: First datetime (inclusive) to download. end_date: Last
datetime (inclusive) to download. data_inner_steps: Data will be sliced into
data_inner_steps*5minute chunks. output_directory: Directory to which the
satellite data should be saved. lon_min: The west-most longitude (in
degrees) of the bounding box to download. lon_max: The east-most longitude
(in degrees) of the bounding box to download. lat_min: The south-most
latitude (in degrees) of the bounding box to download. lat_max: The
north-most latitude (in degrees) of the bounding box to download. get_hrv:
Whether to download the HRV data, else non-HRV is downloaded.
override_date_bounds: Whether to override the date range limits.
Raises: FileNotFoundError: If the output directory doesn't exist.
ValueError: If there are issues with the date range or if output files already
exist.

╭─ Arguments ────────────────────────────────────────────────────────────────────╮
│ * start_date TEXT Start date in 'YYYY-MM-DD HH:MM' format │
│ [default: None] │
│ [required] │
│ * end_date TEXT End date in 'YYYY-MM-DD HH:MM' format │
│ [default: None] │
│ [required] │
│ * output_directory TEXT Directory to save the satellite data │
│ [default: None] │
│ [required] │
╰────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ──────────────────────────────────────────────────────────────────────╮
│ --data-inner-steps INTEGER Data will be sliced │
│ into │
│ data_inner_steps*5mi… │
│ chunks │
│ [default: 3] │
│ --get-hrv --no-get-hrv Whether to download │
│ HRV data │
│ [default: no-get-hrv] │
│ --override-date-boun… --no-override-date… Whether to override │
│ date range limits │
│ [default: │
│ no-override-date-bou… │
│ --lon-min FLOAT Minimum longitude │
│ [default: -16] │
│ --lon-max FLOAT Maximum longitude │
│ [default: 10] │
│ --lat-min FLOAT Minimum latitude │
│ [default: 45] │
│ --lat-max FLOAT Maximum latitude │
│ [default: 70] │
│ --help Show this message and │
│ exit. │
╰────────────────────────────────────────────────────────────────────────────────╯

```

Example:

```bash
cloudcasting download "2020-06-01 00:00" "2020-06-30 23:55" "path/to/my/dir/data.zarr"
```

Full options:

```bash
> cloudcasting download --help

Usage: cloudcasting download [OPTIONS] START_DATE
END_DATE OUTPUT_DIRECTORY

╭─ Arguments ──────────────────────────────────────────╮
│ * start_date TEXT Start date in │
│ 'YYYY-MM-DD HH:MM' │
│ format │
│ [default: None] │
│ [required] │
│ * end_date TEXT End date in │
│ 'YYYY-MM-DD HH:MM' │
│ format │
│ [default: None] │
│ [required] │
│ * output_directory TEXT Directory to save │
│ the satellite data │
│ [default: None] │
│ [required] │
╰──────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────╮
│ --download-f… TEXT Frequency to │
│ download data │
│ in pandas │
│ datetime │
│ format │
│ [default: │
│ 15min] │
│ --get-hrv --no-get-h… Whether to │
│ download HRV │
│ data │
│ [default: │
│ no-get-hrv] │
│ --override-d… --no-overr… Whether to │
│ override date │
│ range limits │
│ [default: │
│ no-override-… │
│ --lon-min FLOAT Minimum │
│ longitude │
│ [default: │
│ -16] │
│ --lon-max FLOAT Maximum │
│ longitude │
│ [default: 10] │
│ --lat-min FLOAT Minimum │
│ latitude │
│ [default: 45] │
│ --lat-max FLOAT Maximum │
│ latitude │
│ [default: 70] │
│ --valid-set --no-valid… Whether to │
│ filter data │
│ from 2022 to │
│ download the │
│ validation │
│ set (every 2 │
│ weeks). │
│ [default: │
│ no-valid-set] │
│ --help Show this │
│ message and │
│ exit. │
╰──────────────────────────────────────────────────────╯
```

## Contributing

Expand Down
61 changes: 48 additions & 13 deletions src/cloudcasting/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def download_satellite_data(
start_date: Annotated[str, typer.Argument(help="Start date in 'YYYY-MM-DD HH:MM' format")],
end_date: Annotated[str, typer.Argument(help="End date in 'YYYY-MM-DD HH:MM' format")],
output_directory: Annotated[str, typer.Argument(help="Directory to save the satellite data")],
data_inner_steps: Annotated[
int, typer.Option(help="Data will be sliced into data_inner_steps*5minute chunks")
] = 3,
download_frequency: Annotated[
str, typer.Option(help="Frequency to download data in pandas datetime format")
] = "15min",
get_hrv: Annotated[bool, typer.Option(help="Whether to download HRV data")] = False,
override_date_bounds: Annotated[
bool, typer.Option(help="Whether to override date range limits")
Expand All @@ -84,6 +84,12 @@ def download_satellite_data(
lon_max: Annotated[float, typer.Option(help="Maximum longitude")] = 10,
lat_min: Annotated[float, typer.Option(help="Minimum latitude")] = 45,
lat_max: Annotated[float, typer.Option(help="Maximum latitude")] = 70,
valid_set: Annotated[
bool,
typer.Option(
help="Whether to filter data from 2022 to download the validation set (every 2 weeks)."
),
] = False,
) -> None:
"""
Download a selection of the available EUMETSAT data.
Expand All @@ -107,6 +113,7 @@ def download_satellite_data(
FileNotFoundError: If the output directory doesn't exist.
ValueError: If there are issues with the date range or if output files already exist.
"""

# Check output directory exists
if not os.path.isdir(output_directory):
msg = (
Expand All @@ -115,6 +122,14 @@ def download_satellite_data(
)
raise FileNotFoundError(msg)

# Check download frequency is valid (i.e. is a pandas frequency + multiple of 5 minutes)
if np.mod(pd.Timedelta(download_frequency).value, pd.Timedelta("5min").value) != 0:
msg = (
f"Download frequency {download_frequency} is not a multiple of 5 minutes. "
"Please choose a valid frequency."
)
raise ValueError(msg)

start_date_stamp = pd.Timestamp(start_date)
end_date_stamp = pd.Timestamp(end_date)

Expand All @@ -129,6 +144,17 @@ def download_satellite_data(

years = range(start_date_stamp.year, end_date_stamp.year + 1)

# Ceiling the start date to nearest multiple of the download frequency
# Breaks down over multiple days due to starting at the Unix epoch (1970-01-01 Thursday),
# e.g. 2022-01-01 ceiled to 1 week will be 2022-01-06 (the closest Thursday to 2022-01-01).
range_start = (
start_date_stamp.ceil(download_frequency)
if pd.Timedelta(download_frequency) <= pd.Timedelta("1day")
else start_date_stamp
)
# Create a list of dates to download
dates_to_download = pd.date_range(range_start, end_date_stamp, freq=download_frequency)

# Check that none of the filenames we will save to already exist
file_end = "hrv.zarr" if get_hrv else "nonhrv.zarr"
for year in years:
Expand All @@ -145,14 +171,22 @@ def download_satellite_data(
path = _get_sat_public_dataset_path(year, is_hrv=get_hrv)

# Slice the data from this year which are between the start and end dates.
ds = (
xr.open_zarr(path, chunks={})
.sortby("time")
.sel(time=slice(start_date_stamp, end_date_stamp))
)
# Also filter out to strict multiples of the desired time delta specified
# in `data_inner_steps` (which should be slightly more robust to missing values).
ds = ds.sel(time=np.mod(ds.time.dt.minute, data_inner_steps * 5) == 0)
ds = xr.open_zarr(path, chunks={}).sortby("time").sel(time=dates_to_download)

if year == 2022:
set_str = "Validation" if valid_set else "Training"
day_str = "15" if valid_set else "1"
logger.info("Data in 2022 will be downloaded every 2 weeks due to train/valid split.")
logger.info("%s set selected: Starting day will be %s", set_str, day_str)
# integer division by 14 tells us which 2-week block of the year we're in.
# checking the mod wrt 2 lets us select every other 2-week block (weeks are 1-indexed).
# valid set is defined as weeks 3-4, 7-8 etc. (where the mod is != 0).
mask = (
np.mod(ds.time.dt.dayofyear // 14, 2) != 0
if valid_set
else np.mod(ds.time.dt.dayofyear // 14, 2) == 0
)
ds = ds.sel(time=mask)

# Convert lon-lat bounds to geostationary-coords
(x_min, x_max), (y_min, y_max) = lon_lat_to_geostationary_area_coords(
Expand Down Expand Up @@ -182,7 +216,8 @@ def download_satellite_data(
ds = ds.chunk(target_chunks_dict)

# Save data
output_zarr_file = f"{output_directory}/{year}_{file_end}"
with ProgressBar(dt=5):
valid_set_file_str = "validation" if valid_set else "training"
output_zarr_file = f"{output_directory}/{year}_{valid_set_file_str}_{file_end}"
with ProgressBar(dt=1):
ds.to_zarr(output_zarr_file)
logger.info("Data for %s saved to %s.", year, output_zarr_file)
45 changes: 45 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os

import pytest
from typer.testing import CliRunner

from cloudcasting.cli import app


@pytest.fixture()
def runner():
    """Provide a fresh ``CliRunner`` instance for invoking the CLI app in a test."""
    cli_runner = CliRunner()
    return cli_runner


@pytest.fixture()
def temp_output_dir(tmp_path):
    """Expose pytest's per-test ``tmp_path`` directory as a plain string path."""
    # The CLI receives its output directory as a command-line string argument,
    # so hand tests a ``str`` rather than a ``pathlib.Path``.
    directory = os.fspath(tmp_path)
    return directory


def test_download_satellite_data(runner, temp_output_dir):
    """Smoke-test the ``download`` CLI command end to end.

    Invokes ``download`` for a 30-minute window on 2023-01-01 at a 15-minute
    frequency and checks that the command exits cleanly and that the expected
    zarr output exists in the temporary directory.

    NOTE(review): this presumably needs access to the remote satellite dataset,
    so it is an integration test rather than a unit test -- confirm before
    running it in an offline CI environment.
    """
    # Define test parameters: a short window keeps the amount of data small.
    start_date = "2023-01-01 00:00"
    end_date = "2023-01-01 00:30"

    # Run the CLI command to download the file.
    cli_args = [
        "download",
        start_date,
        end_date,
        temp_output_dir,
        "--download-frequency=15min",
        "--lon-min=-16",
        "--lon-max=10",
        "--lat-min=45",
        "--lat-max=70",
    ]
    result = runner.invoke(app, cli_args)

    # Check the command executed successfully. Attach the captured output and
    # exception so that a failure shows *why* the CLI exited non-zero instead
    # of only reporting a bare exit-code mismatch.
    assert result.exit_code == 0, (
        f"CLI exited with code {result.exit_code}.\n"
        f"Output:\n{result.output}\n"
        f"Exception: {result.exception!r}"
    )

    # Check the output file was created. No --valid-set flag is passed, so the
    # store is expected under the "training" name.
    expected_file = os.path.join(temp_output_dir, "2023_training_nonhrv.zarr")
    assert os.path.exists(expected_file), f"Expected output not found: {expected_file}"
Loading
Loading