diff --git a/containers/cleanair/gpjax_models/gpjax_models/models/metrics.py b/containers/cleanair/gpjax_models/gpjax_models/models/metrics.py
new file mode 100644
index 000000000..640df6157
--- /dev/null
+++ b/containers/cleanair/gpjax_models/gpjax_models/models/metrics.py
@@ -0,0 +1,132 @@
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from stdata.utils import to_gdf
+
+from pysal.explore import esda  # exploratory spatial data analysis
+from pysal.lib import weights  # spatial weights
+
+directory_path = Path("/Users/oliverhamelijnck/Downloads/dataset/")
+
+# Create the directory if it doesn't exist
+os.makedirs(directory_path, exist_ok=True)
+
+
+def load_data(root):
+    training_data = pd.read_pickle(root / "training_dataset.pkl")
+    testing_data = pd.read_pickle(root / "test_dataset.pkl")
+    raw_data = pd.read_pickle(root / "raw_data_3.pkl")
+    true_y = pd.read_pickle(root / "test_true_dataset.pkl")
+
+    date_ranges = [
+        ("2023-11-16 00:00:00+00:00", "2023-11-16 23:00:00+00:00"),  # Thu
+        ("2023-11-17 00:00:00+00:00", "2023-11-17 23:00:00+00:00"),
+        ("2023-11-18 00:00:00+00:00", "2023-11-18 23:00:00+00:00"),
+        ("2023-11-19 00:00:00+00:00", "2023-11-19 23:00:00+00:00"),
+        ("2023-11-20 00:00:00+00:00", "2023-11-20 23:00:00+00:00"),
+        ("2023-11-21 00:00:00+00:00", "2023-11-21 23:00:00+00:00"),
+        ("2023-11-22 00:00:00+00:00", "2023-11-22 23:00:00+00:00"),
+        ("2023-11-23 00:00:00+00:00", "2023-11-23 23:00:00+00:00"),  # Thu
+        ("2023-11-24 00:00:00+00:00", "2023-11-24 23:00:00+00:00"),  # Fri
+        ("2023-11-25 00:00:00+00:00", "2023-11-25 23:00:00+00:00"),
+    ]
+
+    # Convert the 'measurement_start_utc' column to datetime if it's not already
+    true_y["laqn"]["measurement_start_utc"] = pd.to_datetime(
+        true_y["laqn"]["measurement_start_utc"]
+    )
+
+    # Filter the laqn DataFrame for each specified date range
+    filtered_data = []
+    for start_date, end_date in date_ranges:
+        day_gt = true_y["laqn"][
+            (true_y["laqn"]["measurement_start_utc"] >= start_date)
+            & (true_y["laqn"]["measurement_start_utc"] <= end_date)
+        ]
+        filtered_data.append(day_gt)
+
+    # Only day 3 (2023-11-18) is used below; indexing avoids unpacking all ten
+    # ranges into separate variables (the old 7-variable unpack would raise).
+    day_3_gt = filtered_data[2]
+
+    return training_data, testing_data, raw_data, day_3_gt
+
+
+def load_results(root):
+    return pd.read_pickle(root / "predictions_mrdgp_3.pkl")
+
+
+def fix_df_columns_dropna(df):
+    df = df.rename(columns={"point_id": "id", "datetime": "measurement_start_utc"})
+    return df.dropna(subset=["NO2"])
+
+
+def fix_df_columns(df):
+    return df.rename(columns={"point_id": "id", "datetime": "measurement_start_utc"})
+
+
+if __name__ == "__main__":
+    data_root = directory_path
+
+    training_data, testing_data, raw_data, day_3_gt = load_data(data_root)
+    day_3_gt = day_3_gt.reset_index(drop=True)
+    train_laqn_df = fix_df_columns(raw_data["train"]["laqn"]["df"])
+    test_laqn_df = fix_df_columns(raw_data["test"]["laqn"]["df"])
+    true_val = fix_df_columns(day_3_gt)
+
+    # Load results
+    results = load_results(data_root)
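+
+    # Assumed layout of the results pickle (inferred from the indexing below,
+    # not verified against the training code): results["predictions"][<source>]
+    # holds "mu" and "var" lists whose first element is an array of shape
+    # (1, N), hence the transpose / np.squeeze when attaching them as columns.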
+ train_laqn_df["pred"] = results["predictions"]["train_laqn"]["mu"][0].T + train_laqn_df["var"] = np.squeeze(results["predictions"]["train_laqn"]["var"][0]) + train_laqn_df["observed"] = train_laqn_df["NO2"] + + train_laqn_df["NO2"] = train_laqn_df["NO2"] + + test_laqn_df["pred"] = results["predictions"]["test_laqn"]["mu"][0].T + test_laqn_df["var"] = np.squeeze(results["predictions"]["test_laqn"]["var"][0]) + test_laqn_df["observed"] = day_3_gt["NO2"] + + test_laqn_df["residual"] = test_laqn_df["observed"] - test_laqn_df["pred"] + + test_df = to_gdf( + test_laqn_df[test_laqn_df["epoch"] == test_laqn_df["epoch"].iloc[0]].copy() + ) + test_df = test_df[test_df["residual"].notna()].copy() + + # Generate W from the GeoDataFrame + w = weights.distance.KNN.from_dataframe(test_df, k=8) + # Row-standardization + w.transform = "R" + + test_df["w_residual"] = weights.lag_spatial(w, test_df["residual"]) + + test_df["residual_std"] = test_df["residual"] - test_df["residual"].mean() + test_df["w_residual_std"] = weights.lag_spatial(w, test_df["residual_std"]) + + lisa = esda.moran.Moran_Local(test_df["residual"], w) + breakpoint() diff --git a/containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py b/containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py index a2246a43c..8ca9e141b 100644 --- a/containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py +++ b/containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py @@ -65,7 +65,7 @@ def get_aggregated_sat_model(X_sat, Y_sat): Z = FullSparsity(Z=kmeans2(jnp.vstack(X_sat), self.M, minit="points")[0]) latent_gp = GP( sparsity=Z, - kernel=ScaleKernel(RBF(input_dim=D, lengthscales=[0.1, 0.1, 0.1, 0.1])), + kernel=ScaleKernel(RBF(input_dim=D, lengthscales=[5.0, 0.5, 0.5, 0.5])), ) prior = Aggregate(Independent([latent_gp])) model = GP( @@ -83,7 +83,7 @@ def get_laqn_sat_model(X_sat, Y_sat, X_laqn, Y_laqn): kernel=ScaleKernel(DeepRBF(parent=latent_m1, lengthscale=[0.1])) * RBF( input_dim=4, - lengthscales=[0.1, 0.1, 0.1, 0.1], + lengthscales=[0.1, 0.5, 0.5, 0.1], active_dims=[1, 2, 3, 4], ), ) @@ -102,34 +102,75 @@ def train_model(model, num_epochs): sat_natgrad = NatGradTrainer(sat_model) laqn_natgrad = NatGradTrainer(laqn_model) + for q in range(len(laqn_model.approximate_posterior.approx_posteriors)): laqn_model.approximate_posterior.approx_posteriors[q]._S_chol.fix() laqn_model.approximate_posterior.approx_posteriors[q]._m.fix() for q in range(len(sat_model.approximate_posterior.approx_posteriors)): sat_model.approximate_posterior.approx_posteriors[q]._S_chol.fix() sat_model.approximate_posterior.approx_posteriors[q]._m.fix() + sat_grad = GradDescentTrainer(sat_model, objax.optimizer.Adam) laqn_grad = GradDescentTrainer(laqn_model, objax.optimizer.Adam) joint_grad = GradDescentTrainer(combined_model, objax.optimizer.Adam) + sat_model.likelihood.fix() + laqn_model.likelihood.fix() + sat_lik_fixed_grad = GradDescentTrainer(sat_model, objax.optimizer.Adam) + laqn_lik_fixed_grad = GradDescentTrainer(laqn_model, objax.optimizer.Adam) + joint_lik_fixed_grad = GradDescentTrainer( + combined_model, objax.optimizer.Adam + ) + + sat_model.likelihood.release() + laqn_model.likelihood.release() + loss_curve = [] + + # pretrain/initial the approximate posterior with natural gradients sat_natgrad.train(1.0, 1) laqn_natgrad.train(1.0, 1) # Pretrain SAT model # pretrain sat + + pretrain_lik_fixed_epochs = int(self.pretrain_epochs * 0.4) + pretrain_lik_released_epochs = ( + self.pretrain_epochs - pretrain_lik_fixed_epochs + ) + + 
+        num_lik_fixed_epochs = int(self.num_epochs * 0.4)
+        num_lik_released_epochs = self.num_epochs - num_lik_fixed_epochs
+
+        # pretrain satellite
+        # for the first 40% of epochs keep the likelihood fixed
         print("Pretraining sat")
-        for i in trange(self.pretrain_epochs):
+        for i in trange(pretrain_lik_fixed_epochs):
+            lc_i, _ = sat_lik_fixed_grad.train(0.01, 1)
+            sat_natgrad.train(0.1, 1)
+            loss_curve.append(lc_i)
+
+        # for the remaining 60% of epochs release the likelihood
+        for i in trange(pretrain_lik_released_epochs):
             lc_i, _ = sat_grad.train(0.01, 1)
             sat_natgrad.train(0.1, 1)
             loss_curve.append(lc_i)
 
+        # pretrain laqn
         print("Pretraining laqn")
-        for i in trange(self.pretrain_epochs):
+
+        for i in trange(pretrain_lik_fixed_epochs):
+            lc_i, _ = laqn_lik_fixed_grad.train(0.01, 1)
+            laqn_natgrad.train(0.1, 1)
+            loss_curve.append(lc_i)
+
+        # for the remaining 60% of epochs release the likelihood
+        for i in trange(pretrain_lik_released_epochs):
             lc_i, _ = laqn_grad.train(0.01, 1)
             laqn_natgrad.train(0.1, 1)
             loss_curve.append(lc_i)
 
+        # TODO: not sure if this should have the likelihood fixed or not
         print("Joint training")
         for i in trange(self.num_epochs):
             lc_i, _ = joint_grad.train(0.01, 1)
@@ -137,7 +178,7 @@ def train_model(model, num_epochs):
             sat_natgrad.train(0.1, 1)
             laqn_natgrad.train(0.1, 1)
 
-        with open(os.path.join("joint_loss_curve_7.pkl"), "wb") as file:
+        with open("joint_loss_curve_hold_len5.pkl", "wb") as file:
             pickle.dump(loss_curve, file)
 
         return loss_curve
@@ -198,12 +239,14 @@ def _reshape_pred(X):
         # Save predictions
         with open(
-            os.path.join(self.results_path, "predictions_mrdgp.pkl"), "wb"
+            os.path.join(self.results_path, "predictions_mrdgp_hold_len5.pkl"), "wb"
         ) as file:
             pickle.dump(results, file)
 
         # Save inducing points
-        with open(os.path.join(self.results_path, "inducing_points.pkl"), "wb") as file:
+        with open(
+            os.path.join(self.results_path, "inducing_points_hold_len5.pkl"), "wb"
+        ) as file:
             pickle.dump(inducing_points, file)
 
         # Print model and inducing points
diff --git a/containers/cleanair/gpjax_models/gpjax_models/models/stgp_svgp.py b/containers/cleanair/gpjax_models/gpjax_models/models/stgp_svgp.py
index ccfc9ccdf..a5f274125 100644
--- a/containers/cleanair/gpjax_models/gpjax_models/models/stgp_svgp.py
+++ b/containers/cleanair/gpjax_models/gpjax_models/models/stgp_svgp.py
@@ -24,7 +24,7 @@ class STGP_SVGP:
     def __init__(
         self,
-        results_path,  # Non-default argument moved to the first position
+        results_path: str,  # directory where results are written
         M: int = 100,
         batch_size: int = 100,
         num_epochs: int = 10,
@@ -92,22 +92,18 @@ def get_laqn_svgp(X_laqn, Y_laqn):
         def train_laqn(num_epoch, m_laqn):
             laqn_natgrad = NatGradTrainer(m_laqn)
-
             for q in range(len(m_laqn.approximate_posterior.approx_posteriors)):
                 m_laqn.approximate_posterior.approx_posteriors[q]._S_chol.fix()
                 m_laqn.approximate_posterior.approx_posteriors[q]._m.fix()
-
-            joint_grad = GradDescentTrainer(m_laqn, objax.optimizer.Adam)
+            laqn_grad = GradDescentTrainer(m_laqn, objax.optimizer.Adam)
 
             lc_arr = []
             laqn_natgrad.train(1.0, 1)
 
             for i in trange(num_epoch):
-                lc_i, _ = joint_grad.train(0.01, 1)
+                lc_i, _ = laqn_grad.train(0.01, 1)
                 lc_arr.append(lc_i)
-
                 laqn_natgrad.train(1.0, 1)
-
             return lc_arr
@@ -149,21 +145,15 @@ def pred_wrapper(XS):
             pickle.dump(results, file)
 
-        # Save inducing points
-        inducing_points = m.prior[0].sparsity.inducing_locations
+        # Save the training loss curve
         with open(
-            os.path.join(self.results_path, "inducing_points_svgp_laqn__.pkl"), "wb"
+            os.path.join(self.results_path, "loss_values_svgp_laqn__.pkl"), "wb"
         ) as file:
-            pickle.dump(inducing_points, file)
+            pickle.dump(loss_values, file)
 
         return loss_values
 
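+# Satellite-only SVGP baseline: fits on satellite observations and predicts
+# the LAQN sites from that model alone (see predict_laqn_sat_from_sat_only).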
 
 class STGP_SVGP_SAT:
-    def __init__(
-        self,
-        M,
-        batch_size,
-        num_epochs,
-    ):
+    def __init__(self, M, batch_size, num_epochs, results_path):
         """
         Initialize the JAX-based Air Quality Gaussian Process Model.
 
@@ -175,6 +165,7 @@ def __init__(
         self.M = M
         self.batch_size = batch_size
         self.num_epochs = num_epochs
+        self.results_path = results_path
 
     def fit(
         self, x_sat: np.ndarray, y_sat: np.ndarray, pred_laqn_data, pred_sat_data
@@ -286,10 +277,12 @@ def sat_pred(XS):
         results = predict_laqn_sat_from_sat_only(pred_laqn_data, pred_sat_data, m)
 
         with open(
-            os.path.join("training_loss_svgp_sat.pkl"),
+            os.path.join(self.results_path, "training_svgp_sat.pkl"),
             "wb",
         ) as file:
             pickle.dump(loss_values, file)
         print(results["metrics"])
-        with open("predictions_svgp.pkl", "wb") as file:
+        with open(
+            os.path.join(self.results_path, "predictions_svgp_sat.pkl"), "wb"
+        ) as file:
             pickle.dump(results, file)
diff --git a/containers/cleanair/gpjax_models/gpjax_models/models/vis.py b/containers/cleanair/gpjax_models/gpjax_models/models/vis.py
new file mode 100644
index 000000000..07adfc696
--- /dev/null
+++ b/containers/cleanair/gpjax_models/gpjax_models/models/vis.py
@@ -0,0 +1,153 @@
+import os
+from pathlib import Path
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+
+from stdata.vis.spacetime import SpaceTimeVisualise
+
+directory_path = Path("/Users/oliverhamelijnck/Downloads/dataset/")
+
+# Create the directory if it doesn't exist
+os.makedirs(directory_path, exist_ok=True)
+
+
+def load_data(root):
+    training_data = pd.read_pickle(root / "training_dataset.pkl")
+    testing_data = pd.read_pickle(root / "test_dataset.pkl")
+    raw_data = pd.read_pickle(root / "raw_data_3.pkl")
+    true_y = pd.read_pickle(root / "test_true_dataset.pkl")
+
+    date_ranges = [
+        ("2023-12-01 00:00:00+00:00", "2023-12-01 23:00:00+00:00"),
+        ("2023-12-02 00:00:00+00:00", "2023-12-02 23:00:00+00:00"),
+        ("2023-12-03 00:00:00+00:00", "2023-12-03 23:00:00+00:00"),
+        ("2023-12-04 00:00:00+00:00", "2023-12-04 23:00:00+00:00"),
+        ("2023-12-05 00:00:00+00:00", "2023-12-05 23:00:00+00:00"),
+        ("2023-12-06 00:00:00+00:00", "2023-12-06 23:00:00+00:00"),
+        ("2023-12-07 00:00:00+00:00", "2023-12-07 23:00:00+00:00"),
+    ]
+
+    # Convert the 'measurement_start_utc' column to datetime if it's not already
+    true_y["laqn"]["measurement_start_utc"] = pd.to_datetime(
+        true_y["laqn"]["measurement_start_utc"]
+    )
+
+    # Filter the laqn DataFrame for each specified date range
+    filtered_data = []
+    for start_date, end_date in date_ranges:
+        day_gt = true_y["laqn"][
+            (true_y["laqn"]["measurement_start_utc"] >= start_date)
+            & (true_y["laqn"]["measurement_start_utc"] <= end_date)
+        ]
+        filtered_data.append(day_gt)
+
+    # Only day 3 (2023-12-03) is used below
+    day_3_gt = filtered_data[2]
+
+    return training_data, testing_data, raw_data, day_3_gt
+
+
+def load_results(root):
+    return pd.read_pickle(root / "predictions_mrdgp_3.pkl")
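+
+
+# The helpers below normalise the raw column names ("point_id", "datetime") to
+# the schema assumed by the plotting code ("id", "measurement_start_utc").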
+def fix_df_columns_dropna(df):
+    df = df.rename(columns={"point_id": "id", "datetime": "measurement_start_utc"})
+    return df.dropna(subset=["NO2"])
+
+
+def fix_df_columns(df):
+    return df.rename(columns={"point_id": "id", "datetime": "measurement_start_utc"})
+
+
+if __name__ == "__main__":
+    data_root = directory_path
+
+    training_data, testing_data, raw_data, day_3_gt = load_data(data_root)
+    day_3_gt = day_3_gt.reset_index(drop=True)
+    train_laqn_df = fix_df_columns(raw_data["train"]["laqn"]["df"])
+    test_laqn_df = fix_df_columns(raw_data["test"]["laqn"]["df"])
+    true_val = fix_df_columns(day_3_gt)
+    hexgrid_df = fix_df_columns(raw_data["test"]["hexgrid"]["df"])
+
+    # Load results
+    results = load_results(data_root)
+    train_laqn_df["pred"] = results["predictions"]["train_laqn"]["mu"][0].T
+    train_laqn_df["var"] = np.squeeze(results["predictions"]["train_laqn"]["var"][0])
+    train_laqn_df["observed"] = train_laqn_df["NO2"]
+
+    test_laqn_df["pred"] = results["predictions"]["test_laqn"]["mu"][0].T
+    test_laqn_df["var"] = np.squeeze(results["predictions"]["test_laqn"]["var"][0])
+    test_laqn_df["observed"] = day_3_gt["NO2"]
+
+    train_end = train_laqn_df["epoch"].max()
+    laqn_df = pd.concat([train_laqn_df, test_laqn_df])
+
+    # Toggle to attach hexgrid predictions to the visualisation
+    show_hexgrid = False
+    if show_hexgrid:
+        # 'geom' holds Shapely Point geometries built from lon/lat
+        hexgrid_df["geom"] = gpd.points_from_xy(hexgrid_df["lon"], hexgrid_df["lat"])
+
+        # Buffer each Point geometry by 0.002 degrees so it has visible area
+        hexgrid_df["geom"] = hexgrid_df["geom"].apply(lambda point: point.buffer(0.002))
+
+        # Create a GeoDataFrame using the 'geom' column
+        hexgrid_gdf = gpd.GeoDataFrame(hexgrid_df, geometry="geom")
+        hexgrid_df["pred"] = results["predictions"]["hexgrid"]["mu"][0].T
+        hexgrid_df["var"] = np.squeeze(results["predictions"]["hexgrid"]["var"][0])
+    else:
+        hexgrid_df = None
+
+    sat_df = fix_df_columns(raw_data["train"]["sat"]["df"])
+    # TODO: check that this aggregation matches how the satellite data is
+    # handled upstream
+    sat_df = (
+        sat_df[["lon", "lat", "NO2", "epoch", "box_id"]]
+        .groupby(["epoch", "box_id"])
+        .mean()
+        .reset_index()
+    )
+    # copy predictions
+    sat_df["pred"] = results["predictions"]["sat"]["mu"][0]
+    sat_df["var"] = results["predictions"]["sat"]["var"][0]
+    sat_df["observed"] = sat_df["NO2"]
+
+    # Visualise the residuals (observed - predicted) in place of the raw
+    # predictions
+    laqn_df["residual"] = laqn_df["observed"] - laqn_df["pred"]
+    laqn_df["pred"] = laqn_df["residual"]
+
+    vis_obj = SpaceTimeVisualise(
+        laqn_df, hexgrid_df, sat_df=sat_df, geopandas_flag=True, test_start=train_end
+    )
+
+    # Show the visualization
+    vis_obj.show()
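+
+    # Optional numeric companion to the plots, a minimal sketch added for
+    # convenience: test-period RMSE and MAE, assuming the 'observed' and
+    # 'pred' columns of test_laqn_df are aligned row for row.
+    test_resid = (test_laqn_df["observed"] - test_laqn_df["pred"]).dropna()
+    print(f"test RMSE: {np.sqrt((test_resid**2).mean()):.2f}")
+    print(f"test MAE:  {test_resid.abs().mean():.2f}")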