
Merge branch 'redesing_system_add_sat_vis' into redesing_system
Sueda Ciftci committed Dec 17, 2024
2 parents 65fc46b + d79da56 commit d3da0d7
Showing 4 changed files with 346 additions and 25 deletions.
132 changes: 132 additions & 0 deletions containers/cleanair/gpjax_models/gpjax_models/models/metrics.py
@@ -0,0 +1,132 @@
import pickle
import csv
import os
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from stdata.vis.spacetime import SpaceTimeVisualise
from stdata.utils import to_gdf

from pysal.explore import esda  # Exploratory spatial analytics
from pysal.lib import weights  # Spatial weights

directory_path = Path("/Users/oliverhamelijnck/Downloads/dataset/")

# Create the directory if it doesn't exist
if not os.path.exists(directory_path):
    os.makedirs(directory_path)


def load_data(root):
    # NOTE: the original version read from the module-level directory_path;
    # use the root argument so the parameter is actually honoured
    with open(root / "training_dataset.pkl", "rb") as file:
        training_data = pd.read_pickle(file)
    with open(root / "test_dataset.pkl", "rb") as file:
        testing_data = pd.read_pickle(file)
    with open(root / "raw_data_3.pkl", "rb") as file:
        raw_data = pd.read_pickle(file)
    with open(root / "test_true_dataset.pkl", "rb") as file:
        true_y = pd.read_pickle(file)

    date_ranges = [
        ("2023-11-16 00:00:00+00:00", "2023-11-16 23:00:00+00:00"),  # Thu
        ("2023-11-17 00:00:00+00:00", "2023-11-17 23:00:00+00:00"),
        ("2023-11-18 00:00:00+00:00", "2023-11-18 23:00:00+00:00"),
        ("2023-11-19 00:00:00+00:00", "2023-11-19 23:00:00+00:00"),
        ("2023-11-20 00:00:00+00:00", "2023-11-20 23:00:00+00:00"),
        ("2023-11-21 00:00:00+00:00", "2023-11-21 23:00:00+00:00"),
        ("2023-11-22 00:00:00+00:00", "2023-11-22 23:00:00+00:00"),
        ("2023-11-23 00:00:00+00:00", "2023-11-23 23:00:00+00:00"),  # Thu
        ("2023-11-24 00:00:00+00:00", "2023-11-24 23:00:00+00:00"),  # Fri
        ("2023-11-25 00:00:00+00:00", "2023-11-25 23:00:00+00:00"),
    ]

    # Convert the 'measurement_start_utc' column to datetime if it is not already
    true_y["laqn"]["measurement_start_utc"] = pd.to_datetime(
        true_y["laqn"]["measurement_start_utc"]
    )

    # Filter the LAQN DataFrame for each specified date range
    filtered_data = []
    for start_date, end_date in date_ranges:
        day_gt = true_y["laqn"][
            (true_y["laqn"]["measurement_start_utc"] >= start_date)
            & (true_y["laqn"]["measurement_start_utc"] <= end_date)
        ]
        filtered_data.append(day_gt)

    # Unpack the first seven days for convenience; the remaining ranges are
    # unused here (unpacking all ten into seven names would raise a ValueError)
    day_1_gt, day_2_gt, day_3_gt, day_4_gt, day_5_gt, day_6_gt, day_7_gt = (
        filtered_data[:7]
    )

    return training_data, testing_data, raw_data, day_3_gt


def load_results(root):
    with open(root / "predictions_mrdgp_3.pkl", "rb") as file:
        results = pd.read_pickle(file)
    return results


def fix_df_columns_dropna(df):
    df = df.rename(columns={"point_id": "id", "datetime": "measurement_start_utc"})
    return df.dropna(subset=["NO2"])


def fix_df_columns(df):
    return df.rename(columns={"point_id": "id", "datetime": "measurement_start_utc"})


if __name__ == "__main__":
    data_root = directory_path

    training_data, testing_data, raw_data, day_3_gt = load_data(data_root)
    day_3_gt = day_3_gt.reset_index(drop=True)
    train_laqn_df = fix_df_columns(raw_data["train"]["laqn"]["df"])
    test_laqn_df = fix_df_columns(raw_data["test"]["laqn"]["df"])
    true_val = fix_df_columns(day_3_gt)

    # Load results and attach predictions, variances, and observations
    results = load_results(data_root)
    train_laqn_df["pred"] = results["predictions"]["train_laqn"]["mu"][0].T
    train_laqn_df["var"] = np.squeeze(results["predictions"]["train_laqn"]["var"][0])
    train_laqn_df["observed"] = train_laqn_df["NO2"]

    test_laqn_df["pred"] = results["predictions"]["test_laqn"]["mu"][0].T
    test_laqn_df["var"] = np.squeeze(results["predictions"]["test_laqn"]["var"][0])
    test_laqn_df["observed"] = day_3_gt["NO2"]

    test_laqn_df["residual"] = test_laqn_df["observed"] - test_laqn_df["pred"]

    # Take a single time slice and keep only rows with a residual
    test_df = to_gdf(
        test_laqn_df[test_laqn_df["epoch"] == test_laqn_df["epoch"].iloc[0]].copy()
    )
    test_df = test_df[test_df["residual"].notna()].copy()

    # Build a k-nearest-neighbour spatial weights matrix from the GeoDataFrame
    w = weights.distance.KNN.from_dataframe(test_df, k=8)
    # Row-standardise the weights
    w.transform = "R"

    # Spatial lag of the raw and mean-centred residuals
    test_df["w_residual"] = weights.lag_spatial(w, test_df["residual"])
    test_df["residual_std"] = test_df["residual"] - test_df["residual"].mean()
    test_df["w_residual_std"] = weights.lag_spatial(w, test_df["residual_std"])

    # Local Moran's I (LISA) on the prediction residuals
    lisa = esda.moran.Moran_Local(test_df["residual"], w)
    breakpoint()  # drop into the debugger to inspect the LISA results
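
Aside: the script ends at the Moran_Local fit. A minimal sketch of how the resulting lisa object is typically interrogated, assuming only the standard esda attributes (p_sim permutation p-values and q quadrant codes) and the test_df built above:

    # Label each site by its LISA quadrant (1=HH, 2=LH, 3=LL, 4=HL),
    # keeping "ns" for sites that are not significant at p < 0.05.
    sig = lisa.p_sim < 0.05
    quadrant_labels = {1: "HH", 2: "LH", 3: "LL", 4: "HL"}
    test_df["lisa_cluster"] = np.where(
        sig, [quadrant_labels[q] for q in lisa.q], "ns"
    )
    print(test_df["lisa_cluster"].value_counts())
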
57 changes: 50 additions & 7 deletions containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py
@@ -65,7 +65,7 @@ def get_aggregated_sat_model(X_sat, Y_sat):
    Z = FullSparsity(Z=kmeans2(jnp.vstack(X_sat), self.M, minit="points")[0])
    latent_gp = GP(
        sparsity=Z,
-       kernel=ScaleKernel(RBF(input_dim=D, lengthscales=[0.1, 0.1, 0.1, 0.1])),
+       kernel=ScaleKernel(RBF(input_dim=D, lengthscales=[5.0, 0.5, 0.5, 0.5])),
    )
    prior = Aggregate(Independent([latent_gp]))
    model = GP(
@@ -83,7 +83,7 @@ def get_laqn_sat_model(X_sat, Y_sat, X_laqn, Y_laqn):
        kernel=ScaleKernel(DeepRBF(parent=latent_m1, lengthscale=[0.1]))
        * RBF(
            input_dim=4,
-           lengthscales=[0.1, 0.1, 0.1, 0.1],
+           lengthscales=[0.1, 0.5, 0.5, 0.1],
            active_dims=[1, 2, 3, 4],
        ),
    )
@@ -102,42 +102,83 @@ def train_model(model, num_epochs):

    sat_natgrad = NatGradTrainer(sat_model)
    laqn_natgrad = NatGradTrainer(laqn_model)

    for q in range(len(laqn_model.approximate_posterior.approx_posteriors)):
        laqn_model.approximate_posterior.approx_posteriors[q]._S_chol.fix()
        laqn_model.approximate_posterior.approx_posteriors[q]._m.fix()
    for q in range(len(sat_model.approximate_posterior.approx_posteriors)):
        sat_model.approximate_posterior.approx_posteriors[q]._S_chol.fix()
        sat_model.approximate_posterior.approx_posteriors[q]._m.fix()

    sat_grad = GradDescentTrainer(sat_model, objax.optimizer.Adam)
    laqn_grad = GradDescentTrainer(laqn_model, objax.optimizer.Adam)
    joint_grad = GradDescentTrainer(combined_model, objax.optimizer.Adam)

    # Trainers built while the likelihoods are fixed
    sat_model.likelihood.fix()
    laqn_model.likelihood.fix()
    sat_lik_fixed_grad = GradDescentTrainer(sat_model, objax.optimizer.Adam)
    laqn_lik_fixed_grad = GradDescentTrainer(laqn_model, objax.optimizer.Adam)
    joint_lik_fixed_grad = GradDescentTrainer(
        combined_model, objax.optimizer.Adam
    )

    sat_model.likelihood.release()
    laqn_model.likelihood.release()

    loss_curve = []

    # pretrain/initialise the approximate posteriors with natural gradients
    sat_natgrad.train(1.0, 1)
    laqn_natgrad.train(1.0, 1)

    pretrain_lik_fixed_epochs = int(self.pretrain_epochs * 0.4)
    pretrain_lik_released_epochs = (
        self.pretrain_epochs - pretrain_lik_fixed_epochs
    )

    num_lik_fixed_epochs = int(self.num_epochs * 0.4)
    num_lik_released_epochs = self.num_epochs - num_lik_fixed_epochs

    # pretrain satellite: for the first 40% of epochs keep the likelihood fixed
    print("Pretraining sat")
-   for i in trange(self.pretrain_epochs):
+   for i in trange(pretrain_lik_fixed_epochs):
        lc_i, _ = sat_lik_fixed_grad.train(0.01, 1)
        sat_natgrad.train(0.1, 1)
        loss_curve.append(lc_i)

    # for the remaining 60% of epochs use the released likelihood
    for i in trange(pretrain_lik_released_epochs):
        lc_i, _ = sat_grad.train(0.01, 1)
        sat_natgrad.train(0.1, 1)
        loss_curve.append(lc_i)

    # pretrain laqn with the same 40/60 schedule
    print("Pretraining laqn")
-   for i in trange(self.pretrain_epochs):
+   for i in trange(pretrain_lik_fixed_epochs):
        lc_i, _ = laqn_lik_fixed_grad.train(0.01, 1)
        laqn_natgrad.train(0.1, 1)
        loss_curve.append(lc_i)

    # for the remaining 60% of epochs use the released likelihood
    for i in trange(pretrain_lik_released_epochs):
        lc_i, _ = laqn_grad.train(0.01, 1)
        laqn_natgrad.train(0.1, 1)
        loss_curve.append(lc_i)

    # TODO: not sure if this should have the likelihood fixed or not
    print("Joint training")
    for i in trange(self.num_epochs):
        lc_i, _ = joint_grad.train(0.01, 1)
        loss_curve.append(lc_i)
        sat_natgrad.train(0.1, 1)
        laqn_natgrad.train(0.1, 1)

-   with open(os.path.join("joint_loss_curve_7.pkl"), "wb") as file:
+   with open(os.path.join("joint_loss_curve_hold_len5.pkl"), "wb") as file:
        pickle.dump(loss_curve, file)

    return loss_curve
@@ -198,12 +239,14 @@ def _reshape_pred(X):

    # Save predictions
    with open(
-       os.path.join(self.results_path, "predictions_mrdgp.pkl"), "wb"
+       os.path.join(self.results_path, "predictions_mrdgp_hold_len5.pkl"), "wb"
    ) as file:
        pickle.dump(results, file)

    # Save inducing points
-   with open(os.path.join(self.results_path, "inducing_points.pkl"), "wb") as file:
+   with open(
+       os.path.join(self.results_path, "inducing_points_hold_len5.pkl"), "wb"
+   ) as file:
        pickle.dump(inducing_points, file)

    # Print model and inducing points
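
The schedule added to train_model above follows a general two-phase pattern: hold the likelihood fixed for the first 40% of epochs, then release it. A schematic of that pattern in isolation — two_phase_pretrain, grad_fixed, grad_free, and natgrad are illustrative stand-ins for the GradDescentTrainer/NatGradTrainer objects above, not part of this diff:

    def two_phase_pretrain(grad_fixed, grad_free, natgrad, total_epochs,
                           fixed_frac=0.4, lr=0.01, natgrad_lr=0.1):
        # Phase 1: likelihood fixed; Phase 2: likelihood released.
        loss_curve = []
        n_fixed = int(total_epochs * fixed_frac)
        for trainer, n in ((grad_fixed, n_fixed),
                           (grad_free, total_epochs - n_fixed)):
            for _ in range(n):
                lc, _ = trainer.train(lr, 1)  # one Adam step on hyperparameters
                natgrad.train(natgrad_lr, 1)  # one natural-gradient step on q(u)
                loss_curve.append(lc)
        return loss_curve
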
29 changes: 11 additions & 18 deletions containers/cleanair/gpjax_models/gpjax_models/models/stgp_svgp.py
@@ -24,7 +24,7 @@
class STGP_SVGP:
    def __init__(
        self,
-       results_path,  # Non-default argument moved to the first position
+       results_path: str,  # Ensure this is passed as a string
        M: int = 100,
        batch_size: int = 100,
        num_epochs: int = 10,
@@ -92,22 +92,18 @@ def get_laqn_svgp(X_laqn, Y_laqn):

    def train_laqn(num_epoch, m_laqn):
        laqn_natgrad = NatGradTrainer(m_laqn)

        for q in range(len(m_laqn.approximate_posterior.approx_posteriors)):
            m_laqn.approximate_posterior.approx_posteriors[q]._S_chol.fix()
            m_laqn.approximate_posterior.approx_posteriors[q]._m.fix()

-       joint_grad = GradDescentTrainer(m_laqn, objax.optimizer.Adam)
+       laqn_grad = GradDescentTrainer(m_laqn, objax.optimizer.Adam)

        lc_arr = []
        laqn_natgrad.train(1.0, 1)

        for i in trange(num_epoch):
-           lc_i, _ = joint_grad.train(0.01, 1)
+           lc_i, _ = laqn_grad.train(0.01, 1)
            lc_arr.append(lc_i)
            laqn_natgrad.train(1.0, 1)

        return lc_arr

    def predict_laqn_svgp(pred_data, m) -> dict:
@@ -149,21 +145,15 @@ def pred_wrapper(XS):
            pickle.dump(results, file)

-       # Save inducing points
-       inducing_points = m.prior[0].sparsity.inducing_locations
        with open(
-           os.path.join(self.results_path, "inducing_points_svgp_laqn__.pkl"), "wb"
+           os.path.join(self.results_path, "loss_values_svgp_laqn__.pkl"), "wb"
        ) as file:
-           pickle.dump(inducing_points, file)
+           pickle.dump(loss_values, file)
        return loss_values


class STGP_SVGP_SAT:
-   def __init__(
-       self,
-       M,
-       batch_size,
-       num_epochs,
-   ):
+   def __init__(self, M, batch_size, num_epochs, results_path):
        """
        Initialize the JAX-based Air Quality Gaussian Process Model.
@@ -175,6 +165,7 @@ def __init__(
        self.M = M
        self.batch_size = batch_size
        self.num_epochs = num_epochs
+       self.results_path = results_path

    def fit(
        self, x_sat: np.ndarray, y_sat: np.ndarray, pred_laqn_data, pred_sat_data
@@ -286,10 +277,12 @@ def sat_pred(XS):
        results = predict_laqn_sat_from_sat_only(pred_laqn_data, pred_sat_data, m)

        with open(
-           os.path.join("training_loss_svgp_sat.pkl"),
+           os.path.join(self.results_path, "training_svgp_sat.pkl"),
            "wb",
        ) as file:
            pickle.dump(loss_values, file)
        print(results["metrics"])
-       with open("predictions_svgp.pkl", "wb") as file:
+       with open(
+           os.path.join(self.results_path, "predictions_svgp_sat.pkl"), "wb"
+       ) as file:
            pickle.dump(results, file)
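
With results_path now threaded through both classes, a caller would construct them roughly as follows. The output directory is a placeholder; M, batch_size, and num_epochs mirror the defaults visible in this diff:

    from gpjax_models.models.stgp_svgp import STGP_SVGP, STGP_SVGP_SAT

    results_path = "/tmp/results"  # hypothetical output directory
    laqn_model = STGP_SVGP(results_path, M=100, batch_size=100, num_epochs=10)
    sat_model = STGP_SVGP_SAT(M=100, batch_size=100, num_epochs=10,
                              results_path=results_path)
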
