diff --git a/containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py b/containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py
index 58c561452..1f858cda5 100644
--- a/containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py
+++ b/containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py
@@ -20,6 +20,13 @@ def get_X(df):
     )
 
 
+def get_X_trf(df):
+    # return np.array(df[['epoch', 'lat', 'lon', 'value_100_total_a_road_length', 'value_100_total_a_road_primary_length', 'value_100_flat', 'value_100_max_canyon_ratio']])
+    return np.array(
+        df[["epoch", "lat", "lon", "value_200_total_a_road_primary_length", "traffic"]]
+    )
+
+
 def get_Y(df):
     return np.array(df["NO2"])[:, None]
 
@@ -47,6 +54,20 @@ def process_sat_data(train_data_sat):
     return X_sat, Y_sat
 
 
+def process_sat_trf_data(train_data_sat):
+    """Process satellite data by grouping on box id."""
+    train_data_sat = train_data_sat.sort_values(by=["box_id", "lat", "lon"])
+
+    sat_gb = train_data_sat.groupby(["box_id", "epoch"])
+    sat_arr = [sat_gb.get_group(i) for i in sat_gb.groups]
+    X_sat = np.array([get_X_trf(df_i) for df_i in sat_arr])
+    Y_sat = np.array([get_Y_sat(df_i) for df_i in sat_arr])
+
+    Y_sat = Y_sat[..., 0]
+
+    return X_sat, Y_sat
+
+
 def process_data_test(df):
     return get_X_norm(df)
 
@@ -55,6 +76,10 @@ def process_data(df):
     return get_X(df), get_Y(df)
 
 
+def process_data_trf(df):
+    return get_X_trf(df), get_Y(df)
+
+
 def clean_data(x_array, y_array):
     """Remove nans and missing data for use in GPflow
 
@@ -174,6 +199,171 @@ def generate_data(train_data, test_data):
         "test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
     }
 
-    with open("raw_data.pkl", "wb") as file:
+    with open("raw_data_svgp_sat_without_trf.pkl", "wb") as file:
         pickle.dump(meta_dict, file)
     return train_dict, test_dict
+
+
+def generate_data_trf(train_data, test_data):
+
+    # Load dataframes
+    train_laqn_df = train_data["laqn"]
+    train_sat_df = train_data["satellite"]
+    test_laqn_df = test_data["laqn"]
+    test_hexgrid_df = test_data["hexgrid"]
+
+    # Extract X and Y
+
+    # collect training arrays
+    train_laqn_X, train_laqn_Y = process_data_trf(train_laqn_df)
+    train_sat_X, train_sat_Y = process_sat_trf_data(train_sat_df)
+
+    # collect test arrays -- no Y available for testing data
+    test_laqn_X = get_X_trf(test_laqn_df)
+    test_hexgrid_X = get_X_trf(test_hexgrid_df)
+
+    # Remove NaN data
+    train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)
+    train_sat_X, train_sat_Y = clean_data(train_sat_X, train_sat_Y)
+
+    # Normalize X, using laqn_X as the reference
+    train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
+    train_sat_X_norm_list = [
+        norm_X(train_sat_X[i], train_laqn_X) for i in range(train_sat_X.shape[0])
+    ]
+    train_sat_X_norm = np.array(train_sat_X_norm_list)
+
+    test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
+    test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)
+
+    # Check shapes
+    print("======")
+    print(
+        f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
+    )
+    print(
+        f"SAT train: {train_sat_X_norm.shape}, {train_sat_X.shape}, {train_sat_Y.shape}"
+    )
+    print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
+    print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
+    print("======")
+
+    # Save data
+    train_dict = {
+        "laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y},
+        "sat": {"X": train_sat_X_norm, "Y": train_sat_Y},
+    }
+
+    test_dict = {
+        "laqn": {"X": test_laqn_X_norm, "Y": None},
+        "hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
+    }
+
+    meta_dict = {
+        "train": {"laqn": {"df": train_laqn_df}, "sat": {"df": train_sat_df}},
+        "test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
+    }
+
+    with open("raw_data_mrdgp_trf.pkl", "wb") as file:
+        pickle.dump(meta_dict, file)
+    return train_dict, test_dict
+
+
+def generate_data_laqn(train_data, test_data):
+
+    # Load dataframes
+    train_laqn_df = train_data["laqn"]
+    test_laqn_df = test_data["laqn"]
+    test_hexgrid_df = test_data["hexgrid"]
+
+    # Extract X and Y for training data
+    train_laqn_X, train_laqn_Y = process_data(train_laqn_df)
+
+    # Extract X for test data
+    test_laqn_X = get_X(test_laqn_df)
+    test_hexgrid_X = get_X(test_hexgrid_df)
+
+    # Remove NaN data
+    train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)
+
+    # Normalize X, using laqn_X as the reference
+    train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
+    test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
+    test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)
+
+    # Check shapes
+    print("======")
+    print(
+        f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
+    )
+    print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
+    print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
+    print("======")
+
+    # Save data
+    train_dict = {"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y}}
+
+    test_dict = {
+        "laqn": {"X": test_laqn_X_norm, "Y": None},
+        "hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
+    }
+
+    meta_dict = {
+        "train": {"laqn": {"df": train_laqn_df}},
+        "test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
+    }
+
+    with open("raw_data_svgp.pkl", "wb") as file:
+        pickle.dump(meta_dict, file)
+
+    return train_dict, test_dict
+
+
+def generate_data_trf_laqn(train_data, test_data):
+
+    # Load dataframes
+    train_laqn_df = train_data["laqn"]
+    test_laqn_df = test_data["laqn"]
+    test_hexgrid_df = test_data["hexgrid"]
+
+    # Extract X and Y for training data
+    train_laqn_X, train_laqn_Y = process_data_trf(train_laqn_df)
+
+    # Extract X for test data
+    test_laqn_X = get_X_trf(test_laqn_df)
+    test_hexgrid_X = get_X_trf(test_hexgrid_df)
+
+    # Remove NaN data
+    train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)
+
+    # Normalize X, using laqn_X as the reference
+    train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
+    test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
+    test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)
+
+    # Check shapes
+    print("======")
+    print(
+        f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
+    )
+    print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
+    print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
+    print("======")
+
+    # Save data
+    train_dict = {"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y}}
+
+    test_dict = {
+        "laqn": {"X": test_laqn_X_norm, "Y": None},
+        "hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
+    }
+
+    meta_dict = {
+        "train": {"laqn": {"df": train_laqn_df}},
+        "test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
+    }
+
+    with open("raw_data_mrdgp_trf.pkl", "wb") as file:
+        pickle.dump(meta_dict, file)
+
+    return train_dict, test_dict
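For reviewers of the data pipeline above: all feature matrices are scaled against the LAQN training inputs, and the 3-D satellite array is normalised box by box. `norm_X` is defined elsewhere in `setup_data.py`; the column-wise standardisation below is an assumption about its behaviour, not its actual body — a minimal sketch only.

```python
import numpy as np


def norm_X(X, X_ref):
    # Hypothetical stand-in for setup_data.norm_X: standardise each column
    # of X using the mean/std of a reference array (the LAQN training X),
    # so train, test, hexgrid and satellite features share one scale.
    return (X - X_ref.mean(axis=0)) / X_ref.std(axis=0)


# LAQN inputs are 2-D (N, D); satellite inputs are 3-D (boxes, points, D),
# hence the per-box comprehension in generate_data_trf.
train_laqn_X = np.random.rand(100, 5)
train_sat_X = np.random.rand(32, 25, 5)

train_sat_X_norm = np.array(
    [norm_X(train_sat_X[i], train_laqn_X) for i in range(train_sat_X.shape[0])]
)
assert train_sat_X_norm.shape == train_sat_X.shape
```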
diff --git a/containers/cleanair/gpjax_models/gpjax_models/parser/model/fit_cli.py b/containers/cleanair/gpjax_models/gpjax_models/parser/model/fit_cli.py
index fdadc657c..4d2a46f1a 100644
--- a/containers/cleanair/gpjax_models/gpjax_models/parser/model/fit_cli.py
+++ b/containers/cleanair/gpjax_models/gpjax_models/parser/model/fit_cli.py
@@ -17,7 +17,7 @@ from ...models.svgp import SVGP
 from ...models.stgp_svgp import STGP_SVGP_SAT, STGP_SVGP
 from ...models.stgp_mrdgp import STGP_MRDGP
-from ...data.setup_data import generate_data
+from ...data.setup_data import generate_data, generate_data_trf, generate_data_laqn
 
 app = typer.Typer(help="SVGP model fitting")
 train_file_path = "datasets/aq_data.pkl"
@@ -75,9 +75,9 @@ def svgp(
 
 
 @app.command()
 def train_svgp_sat(
     root_dir: str,
-    M: int = 300,
+    M: int = 500,
     batch_size: int = 200,
-    num_epochs: int = 1500,
+    num_epochs: int = 2500,
 ):
@@ -88,9 +88,8 @@ def train_svgp_sat(
         batch_size (int): Batch size for training.
         num_epochs (int): Number of training epochs.
     """
-    model = STGP_SVGP_SAT(M, batch_size, num_epochs)
+    model = STGP_SVGP_SAT(M, batch_size, num_epochs, random_seed=42)
 
-    # Load training data
     typer.echo("Loading training data!")
     # Iterate over the directories and subdirectories
     for dirpath, dirnames, filenames in os.walk(root_dir):
@@ -145,11 +144,10 @@ def train_svgp_laqn(
     root_dir: str,
     M: int = 500,
     batch_size: int = 200,
-    num_epochs: int = 2500,
+    num_epochs: int = 1000,
 ):
     """
     Train the SVGP_GPF2 model on the given training data.
-
     Args:
         train_file_path (str): Path to the training data pickle file.
         M (int): Number of inducing variables.
@@ -178,7 +176,7 @@ def train_svgp_laqn(
             with open(file_path, "rb") as file:
                 test_data_dict = pickle.load(file)
 
-    train_dict, test_dict = generate_data(train_data, test_data_dict)
+    train_dict, test_dict = generate_data_laqn(train_data, test_data_dict)
 
     x_laqn = train_dict["laqn"]["X"]
     y_laqn = train_dict["laqn"]["Y"]
@@ -214,8 +212,8 @@ def train_mrdgp(
     root_dir: str,
     M: Optional[int] = 500,
     batch_size: Optional[int] = 200,
-    num_epochs: Optional[int] = 2500,
-    pretrain_epochs: Optional[int] = 2500,
+    num_epochs: Optional[int] = 100,
+    pretrain_epochs: Optional[int] = 100,
 ):
     """
     Train the SVGP_GPF2 model on the given training data.
@@ -277,3 +275,75 @@ def train_mrdgp(
     }
     model.fit(x_sat, y_sat, x_laqn, y_laqn, pred_laqn_data, pred_sat_data)
     typer.echo("Training complete!")
+
+
+@app.command()
+def train_mrdgp_trf(
+    root_dir: str,
+    M: Optional[int] = 500,
+    batch_size: Optional[int] = 200,
+    num_epochs: Optional[int] = 100,
+    pretrain_epochs: Optional[int] = 100,
+):
+    """
+    Train the MRDGP model on the given training data.
+
+    Args:
+        root_dir (str): Root directory containing the training and test data pickle files.
+        M (int): Number of inducing variables.
+        batch_size (int): Batch size for training.
+        num_epochs (int): Number of training epochs.
+        pretrain_epochs (int): Number of pretraining epochs.
+    """
+    model = STGP_MRDGP(
+        M, batch_size, num_epochs, pretrain_epochs, root_dir, random_seed=42
+    )
+    # Load training data
+    typer.echo("Loading training data!")
+    # Iterate over the directories and subdirectories
+    for dirpath, _, filenames in os.walk(root_dir):
+        # Check if 'training_dataset.pkl' exists in the current directory
+        if "training_dataset.pkl" in filenames:
+            # If found, load the data
+            file_path = os.path.join(dirpath, "training_dataset.pkl")
+            with open(file_path, "rb") as file:
+                train_data = pickle.load(file)
+
+    typer.echo("Loading testing data!")
+    for dirpath, _, filenames in os.walk(root_dir):
+        # Check if 'test_dataset.pkl' exists in the current directory
+        if "test_dataset.pkl" in filenames:
+            # If found, load the data
+            file_path = os.path.join(dirpath, "test_dataset.pkl")
+            with open(file_path, "rb") as file:
+                test_data_dict = pickle.load(file)
+
+    train_dict, test_dict = generate_data_trf(train_data, test_data_dict)
+    x_laqn = train_dict["laqn"]["X"]
+    y_laqn = train_dict["laqn"]["Y"]
+    x_sat = train_dict["sat"]["X"]
+    y_sat = train_dict["sat"]["Y"]
+
+    pred_sat_data = {
+        "sat": {
+            "X": train_dict["sat"]["X"],
+            "Y": train_dict["sat"]["Y"],
+        },
+    }
+
+    pred_laqn_data = {
+        "hexgrid": {
+            "X": test_dict["hexgrid"]["X"],
+            "Y": None,
+        },
+        "test_laqn": {
+            "X": test_dict["laqn"]["X"],
+            "Y": None,
+        },
+        "train_laqn": {
+            "X": train_dict["laqn"]["X"],
+            "Y": train_dict["laqn"]["Y"],
+        },
+    }
+    model.fit(x_sat, y_sat, x_laqn, y_laqn, pred_laqn_data, pred_sat_data)
+    typer.echo("Training complete!")