
Commit

data_setup and mrdgp updates (Sulis)
Sueda Ciftci committed May 20, 2024
1 parent b7dc060 commit e99e887
Showing 4 changed files with 231 additions and 193 deletions.
226 changes: 117 additions & 109 deletions containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py
@@ -1,9 +1,8 @@
import pickle
import numpy as np
import pandas as pd
from pathlib import Path

from .normalise import normalise, space_norm, time_norm
from stdata.ops import ensure_continuous_timeseries
from stdata.utils import datetime_to_epoch


def get_data_file_names(fold):
@@ -15,11 +14,10 @@ def get_data_file_names(fold):


def get_X(df):
return np.array(df[["epoch", "lat", "lon", "value_200_park"]])
# return np.array(df[['epoch', 'lat', 'lon', 'value_100_total_a_road_length', 'value_100_total_a_road_primary_length', 'value_100_flat', 'value_100_max_canyon_ratio']])
return np.array(
df[["epoch", "lat", "lon", "value_200_total_a_road_primary_length"]]
)


def get_X_norm(df):
return np.array(df[["epoch_norm", "lat_norm", "lon_norm", "value_200_park_norm"]])


def get_Y(df):
@@ -31,27 +29,6 @@ def get_Y_sat(df):
return np.array([df["NO2"].iloc[0]])[:, None]


def get_Y_norm(df):
return np.array(df["NO2"])[:, None]


def get_Y_sat_norm(df):
# Sat has one value per box_id, so we only need to return the first one
return np.array([df["NO2"].iloc[0]])[:, None]


def process_data(df):
return get_X(df), get_Y(df)


def process_data_test(df):
return get_X_norm(df)


def process_data_norm(df):
return get_X_norm(df), get_Y_norm(df)


def process_sat_data(train_data_sat):
"""process satellite to group by box id"""
train_data_sat = train_data_sat.sort_values(by=["box_id", "lat", "lon"])
@@ -66,125 +43,156 @@ def process_sat_data(train_data_sat):
return X_sat, Y_sat
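
The collapsed body groups the sorted rows by box_id; the usage further down implies the shapes sketched here (pixels_per_box is an assumed, illustrative name):

X_sat, Y_sat = process_sat_data(train_sat_df)
# X_sat: (num_boxes, pixels_per_box, 4); each per-box block is later
# normalised one at a time via norm_X(train_sat_X[i], train_laqn_X)
# Y_sat: (num_boxes, 1), a single NO2 value per box (see get_Y_sat above)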


def process_data(df):
return get_X(df), get_Y(df)


def clean_data(x_array, y_array):
"""Remove nans and missing data for use in GPflow
Args:
x_array: N x D numpy array,
y_array: N x 1 numpy array
Returns:
x_array: Feature array cleaned of NaNs.
y_array: Target array cleaned of NaNs
"""
idx = ~np.isnan(y_array[:, 0])
x_array = x_array[idx, :]
y_array = y_array[idx, :]

idx = np.isnan(x_array[:, :])
idx = [not (True in row) for row in idx]
x_array = x_array[idx, :]
y_array = y_array[idx, :]

return x_array, y_array
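
For instance, a row is dropped if either array holds a NaN in it (a minimal sketch; the demo arrays are illustrative):

x_demo = np.array([[0.0, 1.0], [np.nan, 2.0], [3.0, 4.0]])
y_demo = np.array([[1.0], [2.0], [np.nan]])
x_clean, y_clean = clean_data(x_demo, y_demo)
# row 1 goes for the NaN in x_demo, row 2 for the NaN in y_demo,
# leaving x_clean == [[0.0, 1.0]] and y_clean == [[1.0]]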


def time_norm(X, wrt_to_X):
"""units will now be in days"""
return (X - np.min(wrt_to_X)) / (60 * 60 * 24)
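
So two epochs 86,400 seconds apart come out exactly one day apart, e.g.:

epochs = np.array([1_700_000_000, 1_700_086_400])
time_norm(epochs, epochs)  # array([0., 1.]), days since the earliest epoch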


def normalise(X, wrt_X):
return (X - np.mean(wrt_X, axis=0)) / np.std(wrt_X, axis=0)


def space_norm(X, wrt_X):
df_norm = normalise(X[:, 1:3], wrt_X[:, 1:3])
return df_norm[:, 0], df_norm[:, 1]


def normalise(X, wrt_X):
return (X - np.mean(wrt_X, axis=0)) / np.std(wrt_X, axis=0)


def norm_X(X: np.ndarray, wrt_X: np.ndarray) -> np.ndarray:
def norm_X(X, wrt_X):
"""
First column is epoch, second is lat, third is lon, the rest are covariates
Epochs:
convert units to days, and shift to start at zero
lat/lon:
convert units to kilometers, and shift to start at zero
z-standardised
Covariates:
z-standardised
"""

X_time = time_norm(X[:, 0], wrt_X[:, 0])
X_time = time_norm(X[..., 0], wrt_X[..., 0])
X_eastings, X_northings = space_norm(X, wrt_X)

X_covariates = normalise(X[:, 3:], wrt_X[:, 3:])

X_covariates = normalise(X[..., 3:], wrt_X[..., 3:])
X_norm = np.hstack(
[X_time[:, None], X_eastings[:, None], X_northings[:, None], X_covariates]
)

return X_norm
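
As generate_data does below, the training features double as the normalising reference, and the test features reuse that same reference so both live on one scale:

train_X_norm = norm_X(train_laqn_X, train_laqn_X)  # reference is itself
test_X_norm = norm_X(test_laqn_X, train_laqn_X)  # same reference for test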


def load_test_data(fnames, data_root: Path) -> dict:
return pickle.load(open(data_root / fnames["test"], "rb"))


def clean_data(x_array, y_array):
"""Remove nans and missing data for use in GPflow
Args:
x_array: N x D numpy array,
y_array: N x 1 numpy array
Returns:
x_array: Feature array cleaned of NaNs.
y_array: Target array cleaned of NaNs
"""
idx = ~np.isnan(y_array[:, 0])
x_array = x_array[idx, :]
y_array = y_array[idx, :]

idx = np.isnan(x_array[:, :])
idx = [not (True in row) for row in idx]
x_array = x_array[idx, :]
y_array = y_array[idx, :]

return x_array, y_array


def generate_data(df):
# load cleaned data pickle
# collect training arrays
train_X, train_Y = process_data(df)
# Normalize X, laqn_X is used as the reference
train_X_norm = norm_X(train_X, train_X)
x_train, y_train = clean_data(train_X_norm, train_Y)

# Create the train_dict
train_dict = {
"X": x_train,
"Y": y_train,
}

return train_dict


def generate_data_norm(df):
# load cleaned data pickle
# collect training arrays
train_X, train_Y = process_data_norm(df)

# Create the train_dict
train_dict = {
"X": train_X,
"Y": train_Y,
}

return train_dict


def generate_data_norm_sat(df):
# load cleaned data pickle
# collect training arrays
train_X, train_Y = process_sat_data(df)

# Create the train_dict
train_dict = {
"X": train_X,
"Y": train_Y,
}

return train_dict


def generate_data_test(df):
# load cleaned data pickle
# collect training arrays
test_X = process_data_test(df)

# Create the test_dict
test_dict = {"X": test_X, "Y": None}

return test_dict


def generate_data(train_data, test_data):
train_laqn_df = train_data["laqn"]
train_sat_df = train_data["satellite"]
test_laqn_df = test_data["laqn"]
test_hexgrid_df = test_data["hexgrid"]

train_laqn_df["measurement_start_utc"] = pd.to_datetime(
train_laqn_df["measurement_start_utc"]
)
test_laqn_df["measurement_start_utc"] = pd.to_datetime(
test_laqn_df["measurement_start_utc"]
)
test_hexgrid_df["measurement_start_utc"] = pd.to_datetime(
test_hexgrid_df["measurement_start_utc"]
)

test_hexgrid_sites = test_hexgrid_df.drop_duplicates(subset="point_id")

test_hexgrid_df = ensure_continuous_timeseries(
test_hexgrid_sites,
dt_col="measurement_start_utc",
id_col="point_id",
freq="H",
min_dt=train_laqn_df["measurement_start_utc"].min(),
max_dt=test_laqn_df["measurement_start_utc"].max(),
)
test_hexgrid_df = test_hexgrid_df[["measurement_start_utc", "point_id"]].merge(
test_hexgrid_sites.drop(columns="measurement_start_utc"), on=["point_id"]
)

test_hexgrid_df["epoch"] = datetime_to_epoch(
test_hexgrid_df["measurement_start_utc"]
)

train_laqn_X, train_laqn_Y = process_data(train_laqn_df)
train_sat_X, train_sat_Y = process_sat_data(train_sat_df)

test_laqn_X = get_X(test_laqn_df)
test_hexgrid_X = get_X(test_hexgrid_df)

# Remove NaN data
train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)
train_sat_X, train_sat_Y = clean_data(train_sat_X, train_sat_Y)
test_laqn_X, _ = clean_data(
test_laqn_X, np.zeros((test_laqn_X.shape[0], 1))
)  # Test data has no Y
test_hexgrid_X, _ = clean_data(
test_hexgrid_X, np.zeros((test_hexgrid_X.shape[0], 1))
)  # Test data has no Y

train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
train_sat_X_norm_list = [
norm_X(train_sat_X[i], train_laqn_X) for i in range(train_sat_X.shape[0])
]
train_sat_X_norm = np.array(train_sat_X_norm_list)

test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)

print("======")
print(
f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
)
print(
f"SAT train: {train_sat_X_norm.shape}, {train_sat_X.shape}, {train_sat_Y.shape}"
)
print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
print("======")

# Create the train_dict
train_dict = {
"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y},
"sat": {"X": train_sat_X_norm, "Y": train_sat_Y},
}

test_dict = {
"laqn": {"X": test_laqn_X_norm, "Y": None},
"hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
}

meta_dict = {
"train": {"laqn": {"df": train_laqn_df}, "sat": {"df": train_sat_df}},
"test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
}

with open("raw_data_new_datacreation.pkl", "wb") as file:
pickle.dump(meta_dict, file)
return train_dict, test_dict
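
A call sketch, assuming the two input dicts hold DataFrames keyed "laqn"/"satellite" and "laqn"/"hexgrid" as above (the pickle paths are illustrative):

with open("train_data.pkl", "rb") as f:  # illustrative path
    train_data = pickle.load(f)
with open("test_data.pkl", "rb") as f:  # illustrative path
    test_data = pickle.load(f)

train_dict, test_dict = generate_data(train_data, test_data)
# train_dict["laqn"]["X"]: (N, 4) columns of days, eastings, northings, covariate
# train_dict["sat"]["X"]: (num_boxes, pixels_per_box, 4)
# test_dict["laqn"]["Y"] and test_dict["hexgrid"]["Y"] stay None until predicted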
27 changes: 18 additions & 9 deletions containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py
@@ -23,6 +23,11 @@
from stgp.trainers import NatGradTrainer, GradDescentTrainer
from stgp.transforms import Aggregate, Independent

results_path = "/Users/suedaciftci/projects/clean-air/clean-air-infrastructure/containers/cleanair/gpjax_models/data/mrdgp_production_results"
# Create the directory if it doesn't exist
if not os.path.exists(results_path):
os.makedirs(results_path)
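# (os.makedirs(results_path, exist_ok=True) would be the one-call equivalent)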


class STGP_MRDGP:
def __init__(
@@ -100,8 +105,12 @@ def get_laqn_sat(X_sat, Y_sat, X_laqn, Y_laqn):

latent_gp = GP( # prior
sparsity=Z,
kernel=ScaleKernel(DeepRBF(parent=latent_m1, lengthscale=[0.1]))
* RBF(input_dim=3, lengthscales=[0.1, 0.1, 0.1], active_dims=[1, 2, 3]),
kernel=ScaleKernel(DeepRBF(parent=latent_m1, lengthscale=[1]))
* RBF(
input_dim=4,
lengthscales=[0.1, 0.1, 0.1, 0.1],
active_dims=[1, 2, 3, 4],
),
)

prior = Independent([latent_gp])
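
# Kernel sketch: ScaleKernel(DeepRBF(...)) warps the first layer's output,
# while the RBF factor now spans four active dims ([1, 2, 3, 4]) rather
# than three, presumably to track the extra covariate column in get_X.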
@@ -163,7 +172,8 @@ def train_laqn_sat(m, num_epochs):
sat_natgrad.train(0.1, 1)
laqn_natgrad.train(0.1, 1)

with open(os.path.join("joint_lc.pkl"), "wb") as file:
joint_elbo_path = os.path.join(results_path, "joint_lc_1_500ip.pkl")
with open(joint_elbo_path, "wb") as file:
pickle.dump(lc_arr, file)

return lc_arr
@@ -226,11 +236,10 @@ def _reshape_pred(XS):
results = predict_laqn_sat(pred_laqn_data, pred_sat_data, m)

# Save the loss values to a pickle file
with open(
os.path.join("training_loss_mrdgp.pkl"),
"wb",
) as file:
training_path = os.path.join(results_path, "training_loss_mrdgp_1_500ip.pkl")
with open(training_path, "wb") as file:
pickle.dump(loss_values, file)

with open(os.path.join("predictions_mrdgp.pkl"), "wb") as file:
# Path for saving all predictions
prediction_path = os.path.join(results_path, "predictions_mrdgp_1_500ip.pkl")
with open(prediction_path, "wb") as file:
pickle.dump(results, file)
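
The saved artefacts can then be inspected with plain pickle loads (a sketch; the paths follow results_path and the file names used above):

import os
import pickle

with open(os.path.join(results_path, "predictions_mrdgp_1_500ip.pkl"), "rb") as f:
    results = pickle.load(f)
with open(os.path.join(results_path, "training_loss_mrdgp_1_500ip.pkl"), "rb") as f:
    loss_values = pickle.load(f)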