Merge pull request #3 from ClimeTrend/validation-filtering

Add 2-weekly slicing for 2022
alan-turing-institute · Jul 24, 2024 · cc3af87 · cc3af87
2 parents be7fdd3 + dabc375
commit cc3af87
Show file tree

Hide file tree

Showing 4 changed files with 264 additions and 97 deletions.
diff --git a/README.md b/README.md
@@ -24,72 +24,82 @@ pre-commit install
 For making changes, see the [guidance on development](https://github.com/alan-turing-institute/python-project-template?tab=readme-ov-file#setting-up-a-new-project) from the template that generated this project.
 
 ## Usage
-```bash
- cloudcasting download --help
-
- Usage: cloudcasting download [OPTIONS] START_DATE END_DATE OUTPUT_DIRECTORY
-
- Download a selection of the available EUMETSAT data.
- Each calendar year of data within the supplied date range will be saved to a
- separate file in the output directory.
- Args:     start_date: First datetime (inclusive) to download.     end_date: Last
- datetime (inclusive) to download.     data_inner_steps: Data will be sliced into
- data_inner_steps*5minute chunks.     output_directory: Directory to which the
- satellite data should be saved.     lon_min: The west-most longitude (in
- degrees) of the bounding box to download.     lon_max: The east-most longitude
- (in degrees) of the bounding box to download.     lat_min: The south-most
- latitude (in degrees) of the bounding box to download.     lat_max: The
- north-most latitude (in degrees) of the bounding box to download.     get_hrv:
- Whether to download the HRV data, else non-HRV is downloaded.
- override_date_bounds: Whether to override the date range limits.
- Raises:     FileNotFoundError: If the output directory doesn't exist.
- ValueError: If there are issues with the date range or if output files already
- exist.
-
-╭─ Arguments ────────────────────────────────────────────────────────────────────╮
-│ *    start_date            TEXT  Start date in 'YYYY-MM-DD HH:MM' format       │
-│                                  [default: None]                               │
-│                                  [required]                                    │
-│ *    end_date              TEXT  End date in 'YYYY-MM-DD HH:MM' format         │
-│                                  [default: None]                               │
-│                                  [required]                                    │
-│ *    output_directory      TEXT  Directory to save the satellite data          │
-│                                  [default: None]                               │
-│                                  [required]                                    │
-╰────────────────────────────────────────────────────────────────────────────────╯
-╭─ Options ──────────────────────────────────────────────────────────────────────╮
-│ --data-inner-steps                              INTEGER  Data will be sliced   │
-│                                                          into                  │
-│                                                          data_inner_steps*5mi… │
-│                                                          chunks                │
-│                                                          [default: 3]          │
-│ --get-hrv                --no-get-hrv                    Whether to download   │
-│                                                          HRV data              │
-│                                                          [default: no-get-hrv] │
-│ --override-date-boun…    --no-override-date…             Whether to override   │
-│                                                          date range limits     │
-│                                                          [default:             │
-│                                                          no-override-date-bou… │
-│ --lon-min                                       FLOAT    Minimum longitude     │
-│                                                          [default: -16]        │
-│ --lon-max                                       FLOAT    Maximum longitude     │
-│                                                          [default: 10]         │
-│ --lat-min                                       FLOAT    Minimum latitude      │
-│                                                          [default: 45]         │
-│ --lat-max                                       FLOAT    Maximum latitude      │
-│                                                          [default: 70]         │
-│ --help                                                   Show this message and │
-│                                                          exit.                 │
-╰────────────────────────────────────────────────────────────────────────────────╯
-
-```
 
 Example:
 
 ```bash
 cloudcasting download "2020-06-01 00:00" "2020-06-30 23:55" "path/to/my/dir/data.zarr"
 ```
 
+Full options:
+
+```bash
+> cloudcasting download --help
+
+ Usage: cloudcasting download [OPTIONS] START_DATE
+                              END_DATE OUTPUT_DIRECTORY
+
+╭─ Arguments ──────────────────────────────────────────╮
+│ *    start_date            TEXT  Start date in       │
+│                                  'YYYY-MM-DD HH:MM'  │
+│                                  format              │
+│                                  [default: None]     │
+│                                  [required]          │
+│ *    end_date              TEXT  End date in         │
+│                                  'YYYY-MM-DD HH:MM'  │
+│                                  format              │
+│                                  [default: None]     │
+│                                  [required]          │
+│ *    output_directory      TEXT  Directory to save   │
+│                                  the satellite data  │
+│                                  [default: None]     │
+│                                  [required]          │
+╰──────────────────────────────────────────────────────╯
+╭─ Options ────────────────────────────────────────────╮
+│ --download-f…                   TEXT   Frequency to  │
+│                                        download data │
+│                                        in pandas     │
+│                                        datetime      │
+│                                        format        │
+│                                        [default:     │
+│                                        15min]        │
+│ --get-hrv        --no-get-h…           Whether to    │
+│                                        download HRV  │
+│                                        data          │
+│                                        [default:     │
+│                                        no-get-hrv]   │
+│ --override-d…    --no-overr…           Whether to    │
+│                                        override date │
+│                                        range limits  │
+│                                        [default:     │
+│                                        no-override-… │
+│ --lon-min                       FLOAT  Minimum       │
+│                                        longitude     │
+│                                        [default:     │
+│                                        -16]          │
+│ --lon-max                       FLOAT  Maximum       │
+│                                        longitude     │
+│                                        [default: 10] │
+│ --lat-min                       FLOAT  Minimum       │
+│                                        latitude      │
+│                                        [default: 45] │
+│ --lat-max                       FLOAT  Maximum       │
+│                                        latitude      │
+│                                        [default: 70] │
+│ --valid-set      --no-valid…           Whether to    │
+│                                        filter data   │
+│                                        from 2022 to  │
+│                                        download the  │
+│                                        validation    │
+│                                        set (every 2  │
+│                                        weeks).       │
+│                                        [default:     │
+│                                        no-valid-set] │
+│ --help                                 Show this     │
+│                                        message and   │
+│                                        exit.         │
+╰──────────────────────────────────────────────────────╯
+```
 
 ## Contributing
 

diff --git a/src/cloudcasting/download.py b/src/cloudcasting/download.py
@@ -72,9 +72,9 @@ def download_satellite_data(
     start_date: Annotated[str, typer.Argument(help="Start date in 'YYYY-MM-DD HH:MM' format")],
     end_date: Annotated[str, typer.Argument(help="End date in 'YYYY-MM-DD HH:MM' format")],
     output_directory: Annotated[str, typer.Argument(help="Directory to save the satellite data")],
-    data_inner_steps: Annotated[
-        int, typer.Option(help="Data will be sliced into data_inner_steps*5minute chunks")
-    ] = 3,
+    download_frequency: Annotated[
+        str, typer.Option(help="Frequency to download data in pandas datetime format")
+    ] = "15min",
     get_hrv: Annotated[bool, typer.Option(help="Whether to download HRV data")] = False,
     override_date_bounds: Annotated[
         bool, typer.Option(help="Whether to override date range limits")
@@ -83,6 +83,12 @@ def download_satellite_data(
     lon_max: Annotated[float, typer.Option(help="Maximum longitude")] = 10,
     lat_min: Annotated[float, typer.Option(help="Minimum latitude")] = 45,
     lat_max: Annotated[float, typer.Option(help="Maximum latitude")] = 70,
+    valid_set: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to filter data from 2022 to download the validation set (every 2 weeks)."
+        ),
+    ] = False,
 ) -> None:
     """
     Download a selection of the available EUMETSAT data.
@@ -106,6 +112,7 @@ def download_satellite_data(
         FileNotFoundError: If the output directory doesn't exist.
         ValueError: If there are issues with the date range or if output files already exist.
     """
+
     # Check output directory exists
     if not os.path.isdir(output_directory):
         msg = (
@@ -114,6 +121,14 @@ def download_satellite_data(
         )
         raise FileNotFoundError(msg)
 
+    # Check download frequency is valid (i.e. is a pandas frequency + multiple of 5 minutes)
+    if np.mod(pd.Timedelta(download_frequency).value, pd.Timedelta("5min").value) != 0:
+        msg = (
+            f"Download frequency {download_frequency} is not a multiple of 5 minutes. "
+            "Please choose a valid frequency."
+        )
+        raise ValueError(msg)
+
     start_date_stamp = pd.Timestamp(start_date)
     end_date_stamp = pd.Timestamp(end_date)
 
@@ -128,6 +143,17 @@ def download_satellite_data(
 
     years = range(start_date_stamp.year, end_date_stamp.year + 1)
 
+    # Ceil the start date to the nearest multiple of the download frequency.
+    # This is only done for frequencies of at most 1 day: ceiling is anchored at the Unix epoch
+    # (1970-01-01, a Thursday), so it breaks down for longer frequencies, e.g. 2022-01-01 ceiled
+    # to 1 week would be 2022-01-06 (the first Thursday after 2022-01-01).
+    range_start = (
+        start_date_stamp.ceil(download_frequency)
+        if pd.Timedelta(download_frequency) <= pd.Timedelta("1day")
+        else start_date_stamp
+    )
+    # Create a list of dates to download
+    dates_to_download = pd.date_range(range_start, end_date_stamp, freq=download_frequency)
+
     # Check that none of the filenames we will save to already exist
     file_end = "hrv.zarr" if get_hrv else "nonhrv.zarr"
     for year in years:
@@ -144,14 +170,22 @@ def download_satellite_data(
         path = _get_sat_public_dataset_path(year, is_hrv=get_hrv)
 
         # Slice the data from this year which are between the start and end dates.
-        ds = (
-            xr.open_zarr(path, chunks={})
-            .sortby("time")
-            .sel(time=slice(start_date_stamp, end_date_stamp))
-        )
-        # Also filter out to strict multiples of the desired time delta specified
-        # in `data_inner_steps` (which should be slighly more robust to missing values).
-        ds = ds.sel(time=np.mod(ds.time.dt.minute, data_inner_steps * 5) == 0)
+        ds = xr.open_zarr(path, chunks={}).sortby("time").sel(time=dates_to_download)
+
+        if year == 2022:
+            set_str = "Validation" if valid_set else "Training"
+            day_str = "15" if valid_set else "1"
+            logger.info("Data in 2022 will be downloaded every 2 weeks due to train/valid split.")
+            logger.info("%s set selected: Starting day will be %s", set_str, day_str)
+            # integer division by 14 tells us which 2-week block of the year we're in.
+            # checking the mod wrt 2 lets us select every other 2-week block (weeks are 1-indexed).
+            # valid set is defined as weeks 3-4, 7-8 etc. (where the mod is != 0).
+            mask = (
+                np.mod(ds.time.dt.dayofyear // 14, 2) != 0
+                if valid_set
+                else np.mod(ds.time.dt.dayofyear // 14, 2) == 0
+            )
+            ds = ds.sel(time=mask)
 
         # Convert lon-lat bounds to geostationary-coords
         (x_min, x_max), (y_min, y_max) = lon_lat_to_geostationary_area_coords(
@@ -181,7 +215,8 @@ def download_satellite_data(
         ds = ds.chunk(target_chunks_dict)
 
         # Save data
-        output_zarr_file = f"{output_directory}/{year}_{file_end}"
-        with ProgressBar(dt=5):
+        valid_set_file_str = "validation" if valid_set else "training"
+        output_zarr_file = f"{output_directory}/{year}_{valid_set_file_str}_{file_end}"
+        with ProgressBar(dt=1):
             ds.to_zarr(output_zarr_file)
         logger.info("Data for %s saved to %s.", year, output_zarr_file)
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -0,0 +1,45 @@
+import os
+
+import pytest
+from typer.testing import CliRunner
+
+from cloudcasting.cli import app
+
+
+@pytest.fixture()
+def runner():
+    return CliRunner()
+
+
+@pytest.fixture()
+def temp_output_dir(tmp_path):
+    return str(tmp_path)
+
+
+def test_download_satellite_data(runner, temp_output_dir):
+    # Define test parameters
+    start_date = "2023-01-01 00:00"
+    end_date = "2023-01-01 00:30"
+
+    # Run the CLI command to download the file
+    result = runner.invoke(
+        app,
+        [
+            "download",
+            start_date,
+            end_date,
+            temp_output_dir,
+            "--download-frequency=15min",
+            "--lon-min=-16",
+            "--lon-max=10",
+            "--lat-min=45",
+            "--lat-max=70",
+        ],
+    )
+
+    # Check if the command executed successfully
+    assert result.exit_code == 0
+
+    # Check if the output file was created
+    expected_file = os.path.join(temp_output_dir, "2023_training_nonhrv.zarr")
+    assert os.path.exists(expected_file)