Skip to content

Commit

Permalink
Merge pull request #3 from ClimeTrend/validation-filtering
Browse files Browse the repository at this point in the history
Add 2-weekly slicing for 2022
  • Loading branch information
phinate authored Jul 24, 2024
2 parents be7fdd3 + dabc375 commit cc3af87
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 97 deletions.
128 changes: 69 additions & 59 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,72 +24,82 @@ pre-commit install
For making changes, see the [guidance on development](https://github.com/alan-turing-institute/python-project-template?tab=readme-ov-file#setting-up-a-new-project) from the template that generated this project.

## Usage
```bash
 cloudcasting download --help

Usage: cloudcasting download [OPTIONS] START_DATE END_DATE OUTPUT_DIRECTORY

Download a selection of the available EUMETSAT data.
Each calendar year of data within the supplied date range will be saved to a
separate file in the output directory.
Args: start_date: First datetime (inclusive) to download. end_date: Last
datetime (inclusive) to download. data_inner_steps: Data will be sliced into
data_inner_steps*5minute chunks. output_directory: Directory to which the
satellite data should be saved. lon_min: The west-most longitude (in
degrees) of the bounding box to download. lon_max: The east-most longitude
(in degrees) of the bounding box to download. lat_min: The south-most
latitude (in degrees) of the bounding box to download. lat_max: The
north-most latitude (in degrees) of the bounding box to download. get_hrv:
Whether to download the HRV data, else non-HRV is downloaded.
override_date_bounds: Whether to override the date range limits.
Raises: FileNotFoundError: If the output directory doesn't exist.
ValueError: If there are issues with the date range or if output files already
exist.
╭─ Arguments ────────────────────────────────────────────────────────────────────╮
│ * start_date TEXT Start date in 'YYYY-MM-DD HH:MM' format │
│ [default: None] │
│ [required] │
│ * end_date TEXT End date in 'YYYY-MM-DD HH:MM' format │
│ [default: None] │
│ [required] │
│ * output_directory TEXT Directory to save the satellite data │
│ [default: None] │
│ [required] │
╰────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ──────────────────────────────────────────────────────────────────────╮
│ --data-inner-steps INTEGER Data will be sliced │
│ into │
│ data_inner_steps*5mi… │
│ chunks │
│ [default: 3] │
│ --get-hrv --no-get-hrv Whether to download │
│ HRV data │
│ [default: no-get-hrv] │
│ --override-date-boun… --no-override-date… Whether to override │
│ date range limits │
│ [default: │
│ no-override-date-bou… │
│ --lon-min FLOAT Minimum longitude │
│ [default: -16] │
│ --lon-max FLOAT Maximum longitude │
│ [default: 10] │
│ --lat-min FLOAT Minimum latitude │
│ [default: 45] │
│ --lat-max FLOAT Maximum latitude │
│ [default: 70] │
│ --help Show this message and │
│ exit. │
╰────────────────────────────────────────────────────────────────────────────────╯
```

Example:

```bash
cloudcasting download "2020-06-01 00:00" "2020-06-30 23:55" "path/to/my/dir/data.zarr"
```

Full options:

```bash
> cloudcasting download --help

Usage: cloudcasting download [OPTIONS] START_DATE
END_DATE OUTPUT_DIRECTORY

╭─ Arguments ──────────────────────────────────────────╮
* start_date TEXT Start date in
'YYYY-MM-DD HH:MM'
│ format │
│ [default: None] │
│ [required] │
* end_date TEXT End date in
'YYYY-MM-DD HH:MM'
│ format │
│ [default: None] │
│ [required] │
* output_directory TEXT Directory to save │
│ the satellite data │
│ [default: None] │
│ [required] │
╰──────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────╮
│ --download-f… TEXT Frequency to │
│ download data │
in pandas │
│ datetime │
│ format │
│ [default: │
│ 15min] │
│ --get-hrv --no-get-h… Whether to │
│ download HRV │
│ data │
│ [default: │
│ no-get-hrv] │
│ --override-d… --no-overr… Whether to │
│ override date │
│ range limits │
│ [default: │
│ no-override-… │
│ --lon-min FLOAT Minimum │
│ longitude │
│ [default: │
│ -16] │
│ --lon-max FLOAT Maximum │
│ longitude │
│ [default: 10] │
│ --lat-min FLOAT Minimum │
│ latitude │
│ [default: 45] │
│ --lat-max FLOAT Maximum │
│ latitude │
│ [default: 70] │
│ --valid-set --no-valid… Whether to │
│ filter data │
│ from 2022 to │
│ download the │
│ validation │
set (every 2 │
│ weeks). │
│ [default: │
│ no-valid-set] │
│ --help Show this │
│ message and │
│ exit. │
╰──────────────────────────────────────────────────────╯
```
## Contributing
Expand Down
61 changes: 48 additions & 13 deletions src/cloudcasting/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def download_satellite_data(
start_date: Annotated[str, typer.Argument(help="Start date in 'YYYY-MM-DD HH:MM' format")],
end_date: Annotated[str, typer.Argument(help="End date in 'YYYY-MM-DD HH:MM' format")],
output_directory: Annotated[str, typer.Argument(help="Directory to save the satellite data")],
data_inner_steps: Annotated[
int, typer.Option(help="Data will be sliced into data_inner_steps*5minute chunks")
] = 3,
download_frequency: Annotated[
str, typer.Option(help="Frequency to download data in pandas datetime format")
] = "15min",
get_hrv: Annotated[bool, typer.Option(help="Whether to download HRV data")] = False,
override_date_bounds: Annotated[
bool, typer.Option(help="Whether to override date range limits")
Expand All @@ -83,6 +83,12 @@ def download_satellite_data(
lon_max: Annotated[float, typer.Option(help="Maximum longitude")] = 10,
lat_min: Annotated[float, typer.Option(help="Minimum latitude")] = 45,
lat_max: Annotated[float, typer.Option(help="Maximum latitude")] = 70,
valid_set: Annotated[
bool,
typer.Option(
help="Whether to filter data from 2022 to download the validation set (every 2 weeks)."
),
] = False,
) -> None:
"""
Download a selection of the available EUMETSAT data.
Expand All @@ -106,6 +112,7 @@ def download_satellite_data(
FileNotFoundError: If the output directory doesn't exist.
ValueError: If there are issues with the date range or if output files already exist.
"""

# Check output directory exists
if not os.path.isdir(output_directory):
msg = (
Expand All @@ -114,6 +121,14 @@ def download_satellite_data(
)
raise FileNotFoundError(msg)

# Check download frequency is valid (i.e. is a pandas frequency + multiple of 5 minutes)
if np.mod(pd.Timedelta(download_frequency).value, pd.Timedelta("5min").value) != 0:
msg = (
f"Download frequency {download_frequency} is not a multiple of 5 minutes. "
"Please choose a valid frequency."
)
raise ValueError(msg)

start_date_stamp = pd.Timestamp(start_date)
end_date_stamp = pd.Timestamp(end_date)

Expand All @@ -128,6 +143,17 @@ def download_satellite_data(

years = range(start_date_stamp.year, end_date_stamp.year + 1)

# Ceil the start date to the nearest multiple of the download frequency.
# This breaks down for frequencies longer than one day, because the ceiling is anchored to
# the Unix epoch (1970-01-01, a Thursday): e.g. 2022-01-01 ceiled to 1 week becomes
# 2022-01-06 (the first Thursday after 2022-01-01), so such frequencies skip the ceiling.
range_start = (
start_date_stamp.ceil(download_frequency)
if pd.Timedelta(download_frequency) <= pd.Timedelta("1day")
else start_date_stamp
)
# Create a list of dates to download
dates_to_download = pd.date_range(range_start, end_date_stamp, freq=download_frequency)

# Check that none of the filenames we will save to already exist
file_end = "hrv.zarr" if get_hrv else "nonhrv.zarr"
for year in years:
Expand All @@ -144,14 +170,22 @@ def download_satellite_data(
path = _get_sat_public_dataset_path(year, is_hrv=get_hrv)

# Slice the data from this year which are between the start and end dates.
ds = (
xr.open_zarr(path, chunks={})
.sortby("time")
.sel(time=slice(start_date_stamp, end_date_stamp))
)
# Also filter out to strict multiples of the desired time delta specified
# in `data_inner_steps` (which should be slightly more robust to missing values).
ds = ds.sel(time=np.mod(ds.time.dt.minute, data_inner_steps * 5) == 0)
ds = xr.open_zarr(path, chunks={}).sortby("time").sel(time=dates_to_download)

if year == 2022:
set_str = "Validation" if valid_set else "Training"
day_str = "15" if valid_set else "1"
logger.info("Data in 2022 will be downloaded every 2 weeks due to train/valid split.")
logger.info("%s set selected: Starting day will be %s", set_str, day_str)
# Integer division of the day of year by 14 tells us which 2-week block we're in.
# Checking that block index mod 2 lets us select every other 2-week block (weeks are 1-indexed).
# The validation set is defined as weeks 3-4, 7-8 etc. (where the mod is != 0).
mask = (
np.mod(ds.time.dt.dayofyear // 14, 2) != 0
if valid_set
else np.mod(ds.time.dt.dayofyear // 14, 2) == 0
)
ds = ds.sel(time=mask)

# Convert lon-lat bounds to geostationary-coords
(x_min, x_max), (y_min, y_max) = lon_lat_to_geostationary_area_coords(
Expand Down Expand Up @@ -181,7 +215,8 @@ def download_satellite_data(
ds = ds.chunk(target_chunks_dict)

# Save data
output_zarr_file = f"{output_directory}/{year}_{file_end}"
with ProgressBar(dt=5):
valid_set_file_str = "validation" if valid_set else "training"
output_zarr_file = f"{output_directory}/{year}_{valid_set_file_str}_{file_end}"
with ProgressBar(dt=1):
ds.to_zarr(output_zarr_file)
logger.info("Data for %s saved to %s.", year, output_zarr_file)
45 changes: 45 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os

import pytest
from typer.testing import CliRunner

from cloudcasting.cli import app


@pytest.fixture()
def runner():
    """Return a new ``CliRunner`` for invoking the CLI app in tests."""
    cli_runner = CliRunner()
    return cli_runner


@pytest.fixture()
def temp_output_dir(tmp_path):
    """Return the ``tmp_path`` fixture value converted to a plain ``str``."""
    output_dir = str(tmp_path)
    return output_dir


def test_download_satellite_data(runner, temp_output_dir):
    """Run the ``download`` command end to end for a 30-minute window in 2023.

    Invokes the CLI with a 15-minute download frequency and an explicit
    lon/lat bounding box, then checks that the command exits with code 0 and
    that ``2023_training_nonhrv.zarr`` exists in the output directory.
    """
    # Define test parameters
    start_date = "2023-01-01 00:00"
    end_date = "2023-01-01 00:30"

    cli_args = [
        "download",
        start_date,
        end_date,
        temp_output_dir,
        "--download-frequency=15min",
        "--lon-min=-16",
        "--lon-max=10",
        "--lat-min=45",
        "--lat-max=70",
    ]

    # Run the CLI command to download the file
    result = runner.invoke(app, cli_args)

    # Surface the captured CLI output on failure: a bare exit-code mismatch
    # gives no hint about what went wrong inside the command.
    assert result.exit_code == 0, result.output

    # Check if the output file was created
    expected_file = os.path.join(temp_output_dir, "2023_training_nonhrv.zarr")
    assert os.path.exists(expected_file), f"{expected_file} was not created"
Loading

0 comments on commit cc3af87

Please sign in to comment.