Merge pull request #64 from alan-turing-institute/reduce_val_times
reduce the val 2022 samples with min gap of 1 hour
phinate authored Sep 19, 2024
2 parents 3521984 + 3388490 commit 24a0f34
Showing 5 changed files with 24 additions and 11 deletions.
23 changes: 18 additions & 5 deletions examples/find_test_2022_t0_times.py
@@ -18,6 +18,9 @@
 # The current FORECAST_HORIZON_MINUTES is 3 hours so we'll set this conservatively to 6 hours
 MAX_HISTORY_MINUTES = 6 * 60
 
+# We filter t0 times so they have to have a gap of at least this long between consecutive times
+MIN_GAP_SIZE = pd.Timedelta("1hour")
+
 # Open the 2022 dataset
 ds = xr.open_zarr(_get_sat_public_dataset_path(2022, is_hrv=False))
 
@@ -30,17 +33,27 @@
 ds = ds.sel(time=mask)
 
 
-# Find the valid t0 times
-valid_t0_times = find_valid_t0_times(
+# Find the t0 times that we have satellite data for
+available_t0_times = find_valid_t0_times(
     datetimes=pd.DatetimeIndex(ds.time),
     history_mins=MAX_HISTORY_MINUTES,
     forecast_mins=FORECAST_HORIZON_MINUTES,
     sample_freq_mins=DATA_INTERVAL_SPACING_MINUTES,
 )
 
+# Filter the t0 times so they have gaps of at least 1 hour
+_filtered_t0_times = [available_t0_times[0]]
+
+for t in available_t0_times[1:]:
+    if (t - _filtered_t0_times[-1]) >= MIN_GAP_SIZE:
+        _filtered_t0_times.append(t)
+
+filtered_t0_times = pd.DatetimeIndex(_filtered_t0_times)
+
+
 # Print the valid t0 times to sanity check
-print(f"Number of available t0 times: {len(valid_t0_times)}")
-print(f"Actual available t0 times: {valid_t0_times}")
+print(f"Number of available t0 times: {len(filtered_t0_times)}")
+print(f"Actual available t0 times: {filtered_t0_times}")
 
 
 # Find the path of the cloudcasting package so we can save the valid times into it
@@ -53,7 +66,7 @@
 
 # Save the valid t0 times
 filename = "test_2022_t0_times.csv"
-df = pd.DataFrame(valid_t0_times, columns=["t0_time"]).set_index("t0_time")
+df = pd.DataFrame(filtered_t0_times, columns=["t0_time"]).set_index("t0_time")
 df.to_csv(
     f"{package_path}/data/{filename}.zip",
     compression={
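For context, the thinning step added above is a simple greedy pass: keep the first candidate, then keep each later time only if it falls at least MIN_GAP_SIZE after the last kept one. A minimal standalone sketch with toy timestamps (not repo data) shows that kept times are merely >= 1 hour apart, not aligned to the hour:

import pandas as pd

MIN_GAP_SIZE = pd.Timedelta("1hour")

# Toy candidates at 15-minute spacing, with a data gap between 01:00 and 02:45
available_t0_times = pd.DatetimeIndex(
    ["2022-01-01 00:00", "2022-01-01 00:15", "2022-01-01 01:00",
     "2022-01-01 02:45", "2022-01-01 03:00", "2022-01-01 03:45"]
)

kept = [available_t0_times[0]]
for t in available_t0_times[1:]:
    if (t - kept[-1]) >= MIN_GAP_SIZE:
        kept.append(t)

# Keeps 00:00, 01:00, 02:45 and 03:45 -- note 02:45 survives off the hour
print(pd.DatetimeIndex(kept))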
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cloudcasting"
-version = "0.2.1"
+version = "0.3.0"
 authors = [
   { name = "cloudcasting Maintainers", email = "[email protected]" },
 ]
Binary file modified src/cloudcasting/data/test_2022_t0_times.csv.zip
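As a sanity check on the regenerated file, here is a hypothetical round-trip read (assuming pandas infers zip compression from the ".zip" suffix and the archive holds a single CSV):

import pandas as pd

t0_times = pd.read_csv(
    "src/cloudcasting/data/test_2022_t0_times.csv.zip",
    index_col="t0_time",
    parse_dates=["t0_time"],
).index

# Every consecutive pair of kept t0 times should now be at least 1 hour apart
assert (pd.Series(t0_times).diff().dropna() >= pd.Timedelta("1hour")).all()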
6 changes: 3 additions & 3 deletions src/cloudcasting/validation.py
@@ -30,9 +30,9 @@
 # defined in manchester prize technical document
 WANDB_ENTITY = "manchester_prize"
 VIDEO_SAMPLE_DATES = [
-    "2022-01-17 10:30",
-    "2022-04-11 06:30",
-    "2022-06-10 10:30",
+    "2022-01-17 11:00",
+    "2022-04-11 06:00",
+    "2022-06-10 11:00",
     "2022-09-30 18:15",
 ]
 VIDEO_SAMPLE_CHANNELS = ["VIS008", "IR_087"]
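The three shifted dates presumably land on t0 times that survive the new 1-hour thinning (18:15 was already valid: the filter enforces a minimum gap, not on-the-hour alignment). A hypothetical membership check, reusing t0_times from the read-back sketch above:

sample_dates = pd.DatetimeIndex(
    ["2022-01-17 11:00", "2022-04-11 06:00", "2022-06-10 11:00", "2022-09-30 18:15"]
)
assert sample_dates.isin(t0_times).all()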
4 changes: 2 additions & 2 deletions tests/test_dataset.py
@@ -164,8 +164,8 @@ def test_validation_dataset(val_sat_zarr_path, val_dataset_hyperparams):
         sample_freq_mins=DATA_INTERVAL_SPACING_MINUTES,
     )
 
-    # There are 14949 init times which all models must make predictions for
-    assert len(dataset) == 14949
+    # There are 3744 init times which all models must make predictions for
+    assert len(dataset) == 3744
 
     X, y = dataset[0]
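Rough arithmetic on the new count, assuming the 2022 non-HRV imagery is spaced 15 minutes apart (DATA_INTERVAL_SPACING_MINUTES): hourly thinning keeps roughly one candidate in four, and resets after data gaps let a few extra through:

# 14949 candidates at an assumed 15-minute spacing, thinned to gaps of >= 1 hour
print(14949 / 4)  # ~3737, in line with the 3744 init times the test now expects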
