Merge pull request #64 from alan-turing-institute/reduce_val_times
reduce the val 2022 samples with min gap of 1 hour
phinate authored Sep 19, 2024
2 parents 3521984 + 3388490 commit 24a0f34
Showing 5 changed files with 24 additions and 11 deletions.
23 changes: 18 additions & 5 deletions examples/find_test_2022_t0_times.py
@@ -18,6 +18,9 @@
 # The current FORECAST_HORIZON_MINUTES is 3 hours so we'll set this conservatively to 6 hours
 MAX_HISTORY_MINUTES = 6 * 60
 
+# We filter t0 times so they have to have a gap of at least this long between consecutive times
+MIN_GAP_SIZE = pd.Timedelta("1hour")
+
 # Open the 2022 dataset
 ds = xr.open_zarr(_get_sat_public_dataset_path(2022, is_hrv=False))
 
@@ -30,17 +33,27 @@
 ds = ds.sel(time=mask)
 
 
-# Find the valid t0 times
-valid_t0_times = find_valid_t0_times(
+# Find the t0 times that we have satellite data for
+available_t0_times = find_valid_t0_times(
     datetimes=pd.DatetimeIndex(ds.time),
     history_mins=MAX_HISTORY_MINUTES,
     forecast_mins=FORECAST_HORIZON_MINUTES,
     sample_freq_mins=DATA_INTERVAL_SPACING_MINUTES,
 )
 
+# Filter the t0 times so they have gaps of at least 1 hour
+_filtered_t0_times = [available_t0_times[0]]
+
+for t in available_t0_times[1:]:
+    if (t - _filtered_t0_times[-1]) >= MIN_GAP_SIZE:
+        _filtered_t0_times.append(t)
+
+filtered_t0_times = pd.DatetimeIndex(_filtered_t0_times)
+
+
 # Print the valid t0 times to sanity check
-print(f"Number of available t0 times: {len(valid_t0_times)}")
-print(f"Actual available t0 times: {valid_t0_times}")
+print(f"Number of available t0 times: {len(filtered_t0_times)}")
+print(f"Actual available t0 times: {filtered_t0_times}")
 
 
 # Find the path of the cloudcasting package so we can save the valid times into it
@@ -53,7 +66,7 @@
 
 # Save the valid t0 times
 filename = "test_2022_t0_times.csv"
-df = pd.DataFrame(valid_t0_times, columns=["t0_time"]).set_index("t0_time")
+df = pd.DataFrame(filtered_t0_times, columns=["t0_time"]).set_index("t0_time")
 df.to_csv(
     f"{package_path}/data/{filename}.zip",
     compression={
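For context, the thinning step added above is a simple greedy pass: keep the first candidate, then keep each later time only if it falls at least MIN_GAP_SIZE after the last kept one. A minimal standalone sketch with toy timestamps (not repo data) shows that kept times are merely >= 1 hour apart, not aligned to the hour:

import pandas as pd

MIN_GAP_SIZE = pd.Timedelta("1hour")

# Toy candidates at 15-minute spacing, with a data gap between 01:00 and 02:45
available_t0_times = pd.DatetimeIndex(
    ["2022-01-01 00:00", "2022-01-01 00:15", "2022-01-01 01:00",
     "2022-01-01 02:45", "2022-01-01 03:00", "2022-01-01 03:45"]
)

kept = [available_t0_times[0]]
for t in available_t0_times[1:]:
    if (t - kept[-1]) >= MIN_GAP_SIZE:
        kept.append(t)

# Keeps 00:00, 01:00, 02:45 and 03:45 -- note 02:45 survives off the hour
print(pd.DatetimeIndex(kept))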
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cloudcasting"
-version = "0.2.1"
+version = "0.3.0"
 authors = [
   { name = "cloudcasting Maintainers", email = "[email protected]" },
 ]
Binary file modified src/cloudcasting/data/test_2022_t0_times.csv.zip
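As a sanity check on the regenerated file, here is a hypothetical round-trip read (assuming pandas infers zip compression from the ".zip" suffix and the archive holds a single CSV):

import pandas as pd

t0_times = pd.read_csv(
    "src/cloudcasting/data/test_2022_t0_times.csv.zip",
    index_col="t0_time",
    parse_dates=["t0_time"],
).index

# Every consecutive pair of kept t0 times should now be at least 1 hour apart
assert (pd.Series(t0_times).diff().dropna() >= pd.Timedelta("1hour")).all()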
6 changes: 3 additions & 3 deletions src/cloudcasting/validation.py
@@ -30,9 +30,9 @@
 # defined in manchester prize technical document
 WANDB_ENTITY = "manchester_prize"
 VIDEO_SAMPLE_DATES = [
-    "2022-01-17 10:30",
-    "2022-04-11 06:30",
-    "2022-06-10 10:30",
+    "2022-01-17 11:00",
+    "2022-04-11 06:00",
+    "2022-06-10 11:00",
     "2022-09-30 18:15",
 ]
 VIDEO_SAMPLE_CHANNELS = ["VIS008", "IR_087"]
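The three shifted dates presumably land on t0 times that survive the new 1-hour thinning (18:15 was already valid: the filter enforces a minimum gap, not on-the-hour alignment). A hypothetical membership check, reusing t0_times from the read-back sketch above:

sample_dates = pd.DatetimeIndex(
    ["2022-01-17 11:00", "2022-04-11 06:00", "2022-06-10 11:00", "2022-09-30 18:15"]
)
assert sample_dates.isin(t0_times).all()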
4 changes: 2 additions & 2 deletions tests/test_dataset.py
@@ -164,8 +164,8 @@ def test_validation_dataset(val_sat_zarr_path, val_dataset_hyperparams):
         sample_freq_mins=DATA_INTERVAL_SPACING_MINUTES,
     )
 
-    # There are 14949 init times which all models must make predictions for
-    assert len(dataset) == 14949
+    # There are 3744 init times which all models must make predictions for
+    assert len(dataset) == 3744
 
     X, y = dataset[0]
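Rough arithmetic on the new count, assuming the 2022 non-HRV imagery is spaced 15 minutes apart (DATA_INTERVAL_SPACING_MINUTES): hourly thinning keeps roughly one candidate in four, and resets after data gaps let a few extra through:

# 14949 candidates at an assumed 15-minute spacing, thinned to gaps of >= 1 hour
print(14949 / 4)  # ~3737, in line with the 3744 init times the test now expects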
