
Commit

data_setup and mrdgp updates (Sulis)
Sueda Ciftci committed May 20, 2024
1 parent b7dc060 commit e99e887
Showing 4 changed files with 231 additions and 193 deletions.
226 changes: 117 additions & 109 deletions containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py
@@ -1,9 +1,8 @@
import pickle
import numpy as np
import pandas as pd
from pathlib import Path

from .normalise import normalise, space_norm, time_norm
from stdata.ops import ensure_continuous_timeseries
from stdata.utils import datetime_to_epoch


def get_data_file_names(fold):
@@ -15,11 +14,10 @@ def get_data_file_names(fold):


def get_X(df):
return np.array(df[["epoch", "lat", "lon", "value_200_park"]])
# return np.array(df[['epoch', 'lat', 'lon', 'value_100_total_a_road_length', 'value_100_total_a_road_primary_length', 'value_100_flat', 'value_100_max_canyon_ratio']])
return np.array(
df[["epoch", "lat", "lon", "value_200_total_a_road_primary_length"]]
)


def get_X_norm(df):
return np.array(df[["epoch_norm", "lat_norm", "lon_norm", "value_200_park_norm"]])


def get_Y(df):
@@ -31,27 +29,6 @@ def get_Y_sat(df):
return np.array([df["NO2"].iloc[0]])[:, None]


def get_Y_norm(df):
return np.array(df["NO2"])[:, None]


def get_Y_sat_norm(df):
# Sat has one value per box_id, so we only need to return the first one
return np.array([df["NO2"].iloc[0]])[:, None]


def process_data(df):
return get_X(df), get_Y(df)


def process_data_test(df):
return get_X_norm(df)


def process_data_norm(df):
return get_X_norm(df), get_Y_norm(df)


def process_sat_data(train_data_sat):
"""process satellite to group by box id"""
train_data_sat = train_data_sat.sort_values(by=["box_id", "lat", "lon"])
@@ -66,125 +43,156 @@ def process_sat_data(train_data_sat):
return X_sat, Y_sat
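
The collapsed body groups the sorted rows by box_id; the usage further down implies the shapes sketched here (pixels_per_box is an assumed, illustrative name):

X_sat, Y_sat = process_sat_data(train_sat_df)
# X_sat: (num_boxes, pixels_per_box, 4); each per-box block is later
# normalised one at a time via norm_X(train_sat_X[i], train_laqn_X)
# Y_sat: (num_boxes, 1), a single NO2 value per box (see get_Y_sat above)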


def process_data(df):
return get_X(df), get_Y(df)


def clean_data(x_array, y_array):
"""Remove nans and missing data for use in GPflow
Args:
x_array: N x D numpy array,
y_array: N x 1 numpy array
Returns:
x_array: Feature array cleaned of NaNs.
y_array: Target array cleaned of NaNs
"""
idx = ~np.isnan(y_array[:, 0])
x_array = x_array[idx, :]
y_array = y_array[idx, :]

idx = np.isnan(x_array[:, :])
idx = [not (True in row) for row in idx]
x_array = x_array[idx, :]
y_array = y_array[idx, :]

return x_array, y_array
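
For instance, a row is dropped if either array holds a NaN in it (a minimal sketch; the demo arrays are illustrative):

x_demo = np.array([[0.0, 1.0], [np.nan, 2.0], [3.0, 4.0]])
y_demo = np.array([[1.0], [2.0], [np.nan]])
x_clean, y_clean = clean_data(x_demo, y_demo)
# row 1 goes for the NaN in x_demo, row 2 for the NaN in y_demo,
# leaving x_clean == [[0.0, 1.0]] and y_clean == [[1.0]]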


def time_norm(X, wrt_to_X):
"""units will now be in days"""
return (X - np.min(wrt_to_X)) / (60 * 60 * 24)
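
So two epochs 86,400 seconds apart come out exactly one day apart, e.g.:

epochs = np.array([1_700_000_000, 1_700_086_400])
time_norm(epochs, epochs)  # array([0., 1.]), days since the earliest epoch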


def normalise(X, wrt_X):
return (X - np.mean(wrt_X, axis=0)) / np.std(wrt_X, axis=0)


def space_norm(X, wrt_X):
df_norm = normalise(X[:, 1:3], wrt_X[:, 1:3])
return df_norm[:, 0], df_norm[:, 1]


def normalise(X, wrt_X):
return (X - np.mean(wrt_X, axis=0)) / np.std(wrt_X, axis=0)


def norm_X(X: np.ndarray, wrt_X: np.ndarray) -> np.ndarray:
def norm_X(X, wrt_X):
"""
First column is epoch, second is lat, third is lon, the rest are covariates
Epochs:
convert units to days, and shift to start at zero
lat/lon:
convert units to kilometers, and shift to start at zero
z-standardised
Covariates:
z-standardised
"""

X_time = time_norm(X[:, 0], wrt_X[:, 0])
X_time = time_norm(X[..., 0], wrt_X[..., 0])
X_eastings, X_northings = space_norm(X, wrt_X)

X_covariates = normalise(X[:, 3:], wrt_X[:, 3:])

X_covariates = normalise(X[..., 3:], wrt_X[..., 3:])
X_norm = np.hstack(
[X_time[:, None], X_eastings[:, None], X_northings[:, None], X_covariates]
)

return X_norm
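
As generate_data does below, the training features double as the normalising reference, and the test features reuse that same reference so both live on one scale:

train_X_norm = norm_X(train_laqn_X, train_laqn_X)  # reference is itself
test_X_norm = norm_X(test_laqn_X, train_laqn_X)  # same reference for test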


def load_test_data(fnames, data_root: Path) -> dict:
return pickle.load(open(data_root / fnames["test"], "rb"))


def clean_data(x_array, y_array):
"""Remove nans and missing data for use in GPflow
Args:
x_array: N x D numpy array,
y_array: N x 1 numpy array
Returns:
x_array: Feature array cleaned of NaNs.
y_array: Target array cleaned of NaNs
"""
idx = ~np.isnan(y_array[:, 0])
x_array = x_array[idx, :]
y_array = y_array[idx, :]

idx = np.isnan(x_array[:, :])
idx = [not (True in row) for row in idx]
x_array = x_array[idx, :]
y_array = y_array[idx, :]

return x_array, y_array


def generate_data(df):
# load cleaned data pickle
# collect training arrays
train_X, train_Y = process_data(df)
# Normalize X, laqn_X is used as the reference
train_X_norm = norm_X(train_X, train_X)
x_train, y_train = clean_data(train_X_norm, train_Y)

# Create the train_dict
train_dict = {
"X": x_train,
"Y": y_train,
}

return train_dict


def generate_data_norm(df):
# load cleaned data pickle
# collect training arrays
train_X, train_Y = process_data_norm(df)

# Create the train_dict
train_dict = {
"X": train_X,
"Y": train_Y,
}

return train_dict


def generate_data_norm_sat(df):
# load cleaned data pickle
# collect training arrays
train_X, train_Y = process_sat_data(df)

# Create the train_dict
train_dict = {
"X": train_X,
"Y": train_Y,
}

return train_dict


def generate_data_test(df):
# load cleaned data pickle
# collect training arrays
test_X = process_data_test(df)

# Create the test_dict
test_dict = {"X": test_X, "Y": None}

return test_dict


def generate_data(train_data, test_data):
train_laqn_df = train_data["laqn"]
train_sat_df = train_data["satellite"]
test_laqn_df = test_data["laqn"]
test_hexgrid_df = test_data["hexgrid"]

train_laqn_df["measurement_start_utc"] = pd.to_datetime(
train_laqn_df["measurement_start_utc"]
)
test_laqn_df["measurement_start_utc"] = pd.to_datetime(
test_laqn_df["measurement_start_utc"]
)
test_hexgrid_df["measurement_start_utc"] = pd.to_datetime(
test_hexgrid_df["measurement_start_utc"]
)

test_hexgrid_sites = test_hexgrid_df.drop_duplicates(subset="point_id")

test_hexgrid_df = ensure_continuous_timeseries(
test_hexgrid_sites,
dt_col="measurement_start_utc",
id_col="point_id",
freq="H",
min_dt=train_laqn_df["measurement_start_utc"].min(),
max_dt=test_laqn_df["measurement_start_utc"].max(),
)
test_hexgrid_df = test_hexgrid_df[["measurement_start_utc", "point_id"]].merge(
test_hexgrid_sites.drop(columns="measurement_start_utc"), on=["point_id"]
)

test_hexgrid_df["epoch"] = datetime_to_epoch(
test_hexgrid_df["measurement_start_utc"]
)

train_laqn_X, train_laqn_Y = process_data(train_laqn_df)
train_sat_X, train_sat_Y = process_sat_data(train_sat_df)

test_laqn_X = get_X(test_laqn_df)
test_hexgrid_X = get_X(test_hexgrid_df)

# Remove NaN data
train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)
train_sat_X, train_sat_Y = clean_data(train_sat_X, train_sat_Y)
test_laqn_X, _ = clean_data(
test_laqn_X, np.zeros((test_laqn_X.shape[0], 1))
)  # Test data has no Y
test_hexgrid_X, _ = clean_data(
test_hexgrid_X, np.zeros((test_hexgrid_X.shape[0], 1))
)  # Test data has no Y

train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
train_sat_X_norm_list = [
norm_X(train_sat_X[i], train_laqn_X) for i in range(train_sat_X.shape[0])
]
train_sat_X_norm = np.array(train_sat_X_norm_list)

test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)

print("======")
print(
f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
)
print(
f"SAT train: {train_sat_X_norm.shape}, {train_sat_X.shape}, {train_sat_Y.shape}"
)
print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
print("======")

# Create the train_dict
train_dict = {
"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y},
"sat": {"X": train_sat_X_norm, "Y": train_sat_Y},
}

test_dict = {
"laqn": {"X": test_laqn_X_norm, "Y": None},
"hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
}

meta_dict = {
"train": {"laqn": {"df": train_laqn_df}, "sat": {"df": train_sat_df}},
"test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
}

with open("raw_data_new_datacreation.pkl", "wb") as file:
pickle.dump(meta_dict, file)
return train_dict, test_dict
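
A call sketch, assuming the two input dicts hold DataFrames keyed "laqn"/"satellite" and "laqn"/"hexgrid" as above (the pickle paths are illustrative):

with open("train_data.pkl", "rb") as f:  # illustrative path
    train_data = pickle.load(f)
with open("test_data.pkl", "rb") as f:  # illustrative path
    test_data = pickle.load(f)

train_dict, test_dict = generate_data(train_data, test_data)
# train_dict["laqn"]["X"]: (N, 4) columns of days, eastings, northings, covariate
# train_dict["sat"]["X"]: (num_boxes, pixels_per_box, 4)
# test_dict["laqn"]["Y"] and test_dict["hexgrid"]["Y"] stay None until predicted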
27 changes: 18 additions & 9 deletions containers/cleanair/gpjax_models/gpjax_models/models/stgp_mrdgp.py
@@ -23,6 +23,11 @@
from stgp.trainers import NatGradTrainer, GradDescentTrainer
from stgp.transforms import Aggregate, Independent

results_path = "/Users/suedaciftci/projects/clean-air/clean-air-infrastructure/containers/cleanair/gpjax_models/data/mrdgp_production_results"
# Create the directory if it doesn't exist
if not os.path.exists(results_path):
os.makedirs(results_path)
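# (os.makedirs(results_path, exist_ok=True) would be the one-call equivalent)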


class STGP_MRDGP:
def __init__(
@@ -100,8 +105,12 @@ def get_laqn_sat(X_sat, Y_sat, X_laqn, Y_laqn):

latent_gp = GP( # prior
sparsity=Z,
kernel=ScaleKernel(DeepRBF(parent=latent_m1, lengthscale=[0.1]))
* RBF(input_dim=3, lengthscales=[0.1, 0.1, 0.1], active_dims=[1, 2, 3]),
kernel=ScaleKernel(DeepRBF(parent=latent_m1, lengthscale=[1]))
* RBF(
input_dim=4,
lengthscales=[0.1, 0.1, 0.1, 0.1],
active_dims=[1, 2, 3, 4],
),
)

prior = Independent([latent_gp])
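
# Kernel sketch: ScaleKernel(DeepRBF(...)) warps the first layer's output,
# while the RBF factor now spans four active dims ([1, 2, 3, 4]) rather
# than three, presumably to track the extra covariate column in get_X.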
@@ -163,7 +172,8 @@ def train_laqn_sat(m, num_epochs):
sat_natgrad.train(0.1, 1)
laqn_natgrad.train(0.1, 1)

with open(os.path.join("joint_lc.pkl"), "wb") as file:
joint_elbo_path = os.path.join(results_path, "joint_lc_1_500ip.pkl")
with open(joint_elbo_path, "wb") as file:
pickle.dump(lc_arr, file)

return lc_arr
@@ -226,11 +236,10 @@ def _reshape_pred(XS):
results = predict_laqn_sat(pred_laqn_data, pred_sat_data, m)

# Save the loss values to a pickle file
with open(
os.path.join("training_loss_mrdgp.pkl"),
"wb",
) as file:
training_path = os.path.join(results_path, "training_loss_mrdgp_1_500ip.pkl")
with open(training_path, "wb") as file:
pickle.dump(loss_values, file)

with open(os.path.join("predictions_mrdgp.pkl"), "wb") as file:
# Path for saving all predictions
prediction_path = os.path.join(results_path, "predictions_mrdgp_1_500ip.pkl")
with open(prediction_path, "wb") as file:
pickle.dump(results, file)
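
The saved artefacts can then be inspected with plain pickle loads (a sketch; the paths follow results_path and the file names used above):

import os
import pickle

with open(os.path.join(results_path, "predictions_mrdgp_1_500ip.pkl"), "rb") as f:
    results = pickle.load(f)
with open(os.path.join(results_path, "training_loss_mrdgp_1_500ip.pkl"), "rb") as f:
    loss_values = pickle.load(f)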