Commit

adding traffic data preparation for models
Sueda Ciftci committed Jul 9, 2024
1 parent 0913902 commit 602f282
Showing 2 changed files with 271 additions and 11 deletions.
192 changes: 191 additions & 1 deletion containers/cleanair/gpjax_models/gpjax_models/data/setup_data.py
@@ -20,6 +20,13 @@ def get_X(df):
)


def get_X_trf(df):
    """Return the traffic feature matrix: epoch, lat, lon,
    200 m primary A-road length, and traffic."""
    # Alternative 100 m feature set, kept for reference:
    # df[['epoch', 'lat', 'lon', 'value_100_total_a_road_length', 'value_100_total_a_road_primary_length', 'value_100_flat', 'value_100_max_canyon_ratio']]
    return np.array(
        df[["epoch", "lat", "lon", "value_200_total_a_road_primary_length", "traffic"]]
    )


def get_Y(df):
return np.array(df["NO2"])[:, None]
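
For orientation, a minimal usage sketch of the two helpers above (toy values, not part of the diff):

# Hedged sketch: a toy dataframe carrying the columns get_X_trf and get_Y expect.
import pandas as pd

toy = pd.DataFrame(
    {
        "epoch": [0, 1],
        "lat": [51.50, 51.52],
        "lon": [-0.12, -0.14],
        "value_200_total_a_road_primary_length": [120.0, 80.0],
        "traffic": [340.0, 210.0],
        "NO2": [25.1, 30.4],
    }
)

X = get_X_trf(toy)  # shape (2, 5): epoch, lat, lon, road length, traffic
Y = get_Y(toy)      # shape (2, 1): NO2 with a trailing singleton axis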

@@ -47,6 +54,20 @@ def process_sat_data(train_data_sat):
return X_sat, Y_sat


def process_sat_trf_data(train_data_sat):
"""process satellite to group by box id"""
train_data_sat = train_data_sat.sort_values(by=["box_id", "lat", "lon"])

sat_gb = train_data_sat.groupby(["box_id", "epoch"])
sat_arr = [sat_gb.get_group(i) for i in sat_gb.groups]
X_sat = np.array([get_X_trf(df_i) for df_i in sat_arr])
Y_sat = np.array([get_Y_sat(df_i) for df_i in sat_arr])

Y_sat = Y_sat[..., 0]

return X_sat, Y_sat
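
Stacking with np.array only succeeds if every (box_id, epoch) group holds the same number of satellite points; under that assumption, a sketch of the resulting shapes (train_sat_df is a hypothetical input here):

X_sat, Y_sat = process_sat_trf_data(train_sat_df)
# X_sat: (n_groups, points_per_group, 5) -- one get_X_trf row per satellite point
# Y_sat: (n_groups, points_per_group)    -- targets, trailing axis dropped above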


def process_data_test(df):
return get_X_norm(df)

@@ -55,6 +76,10 @@ def process_data(df):
return get_X(df), get_Y(df)


def process_data_trf(df):
return get_X_trf(df), get_Y(df)


def clean_data(x_array, y_array):
"""Remove nans and missing data for use in GPflow
@@ -174,6 +199,171 @@ def generate_data(train_data, test_data):
"test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
}

with open("raw_data.pkl", "wb") as file:
with open("raw_data_svgp_sat_without_trf.pkl", "wb") as file:
pickle.dump(meta_dict, file)
return train_dict, test_dict
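
The renamed pickle can be reloaded for inspection later; a minimal sketch, assuming the file was written to the current working directory:

import pickle

with open("raw_data_svgp_sat_without_trf.pkl", "rb") as file:
    meta_dict = pickle.load(file)

train_laqn_df = meta_dict["train"]["laqn"]["df"]  # raw LAQN training dataframe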


def generate_data_trf(train_data, test_data):
    """Prepare LAQN and satellite train/test arrays with traffic features."""
# Load dataframes
train_laqn_df = train_data["laqn"]
train_sat_df = train_data["satellite"]
test_laqn_df = test_data["laqn"]
test_hexgrid_df = test_data["hexgrid"]

# Extract X and Y

# collect training arrays
train_laqn_X, train_laqn_Y = process_data_trf(train_laqn_df)
train_sat_X, train_sat_Y = process_sat_trf_data(train_sat_df)

    # collect test arrays -- no Y available for test data
test_laqn_X = get_X_trf(test_laqn_df)
test_hexgrid_X = get_X_trf(test_hexgrid_df)

# Remove NaN data
train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)
train_sat_X, train_sat_Y = clean_data(train_sat_X, train_sat_Y)

# Normalize X, laqn_X is used as the reference
train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
train_sat_X_norm_list = [
norm_X(train_sat_X[i], train_laqn_X) for i in range(train_sat_X.shape[0])
]
train_sat_X_norm = np.array(train_sat_X_norm_list)

test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)

# check shapes
print("======")
print(
f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
)
print(
f"SAT train: {train_sat_X_norm.shape}, {train_sat_X.shape}, {train_sat_Y.shape}"
)
print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
print("======")

# save data
train_dict = {
"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y},
"sat": {"X": train_sat_X_norm, "Y": train_sat_Y},
}

test_dict = {
"laqn": {"X": test_laqn_X_norm, "Y": None},
"hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
}

meta_dict = {
"train": {"laqn": {"df": train_laqn_df}, "sat": {"df": train_sat_df}},
"test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
}

with open("raw_data_mrdgp_trf.pkl", "wb") as file:
pickle.dump(meta_dict, file)
return train_dict, test_dict
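
The input contract mirrors generate_data; a sketch of a call, where the four dataframes are hypothetical and must carry the get_X_trf columns, plus "NO2" for LAQN and "box_id"/"epoch" for satellite:

train_data = {"laqn": train_laqn_df, "satellite": train_sat_df}
test_data = {"laqn": test_laqn_df, "hexgrid": test_hexgrid_df}

train_dict, test_dict = generate_data_trf(train_data, test_data)
x_sat, y_sat = train_dict["sat"]["X"], train_dict["sat"]["Y"]      # stacked satellite arrays
x_laqn, y_laqn = train_dict["laqn"]["X"], train_dict["laqn"]["Y"]  # normalized LAQN arrays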


def generate_data_laqn(train_data, test_data):
    """Prepare LAQN-only train/test arrays (no satellite data)."""
# Load dataframes
train_laqn_df = train_data["laqn"]
test_laqn_df = test_data["laqn"]
test_hexgrid_df = test_data["hexgrid"]

# Extract X and Y for training data
train_laqn_X, train_laqn_Y = process_data(train_laqn_df)

# Extract X for test data
test_laqn_X = get_X(test_laqn_df)
test_hexgrid_X = get_X(test_hexgrid_df)

# Remove NaN data
train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)

# Normalize X, using laqn_X as the reference
train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)

# Check shapes
print("======")
print(
f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
)
print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
print("======")

# Save data
train_dict = {"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y}}

test_dict = {
"laqn": {"X": test_laqn_X_norm, "Y": None},
"hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
}

meta_dict = {
"train": {"laqn": {"df": train_laqn_df}},
"test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
}

with open("raw_data_svgp.pkl", "wb") as file:
pickle.dump(meta_dict, file)

return train_dict, test_dict


def generate_data_trf_laqn(train_data, test_data):
    """Prepare LAQN-only train/test arrays with traffic features."""
# Load dataframes
train_laqn_df = train_data["laqn"]
test_laqn_df = test_data["laqn"]
test_hexgrid_df = test_data["hexgrid"]

# Extract X and Y for training data
train_laqn_X, train_laqn_Y = process_data_trf(train_laqn_df)

# Extract X for test data
test_laqn_X = get_X_trf(test_laqn_df)
test_hexgrid_X = get_X_trf(test_hexgrid_df)

# Remove NaN data
train_laqn_X, train_laqn_Y = clean_data(train_laqn_X, train_laqn_Y)

# Normalize X, using laqn_X as the reference
train_laqn_X_norm = norm_X(train_laqn_X, train_laqn_X)
test_laqn_X_norm = norm_X(test_laqn_X, train_laqn_X)
test_hexgrid_X_norm = norm_X(test_hexgrid_X, train_laqn_X)

# Check shapes
print("======")
print(
f"LAQN train: {train_laqn_X_norm.shape}, {train_laqn_X.shape}, {train_laqn_Y.shape}"
)
print(f"LAQN test: {test_laqn_X_norm.shape}, {test_laqn_X.shape}, -")
print(f"HEXGRID test: {test_hexgrid_X_norm.shape}, {test_hexgrid_X.shape}, -")
print("======")

# Save data
train_dict = {"laqn": {"X": train_laqn_X_norm, "Y": train_laqn_Y}}

test_dict = {
"laqn": {"X": test_laqn_X_norm, "Y": None},
"hexgrid": {"X": test_hexgrid_X_norm, "Y": None},
}

meta_dict = {
"train": {"laqn": {"df": train_laqn_df}},
"test": {"laqn": {"df": test_laqn_df}, "hexgrid": {"df": test_hexgrid_df}},
}

with open("raw_data_mrdgp_trf.pkl", "wb") as file:
pickle.dump(meta_dict, file)

return train_dict, test_dict
90 changes: 80 additions & 10 deletions (second changed file: the model-fitting CLI module)
@@ -17,7 +17,7 @@
from ...models.svgp import SVGP
from ...models.stgp_svgp import STGP_SVGP_SAT, STGP_SVGP
from ...models.stgp_mrdgp import STGP_MRDGP
-from ...data.setup_data import generate_data
+from ...data.setup_data import generate_data, generate_data_trf, generate_data_laqn

app = typer.Typer(help="SVGP model fitting")
train_file_path = "datasets/aq_data.pkl"
@@ -75,9 +75,9 @@ def svgp(
@app.command()
def train_svgp_sat(
root_dir: str,
-    M: int = 300,
+    M: int = 500,
batch_size: int = 200,
-    num_epochs: int = 1500,
+    num_epochs: int = 2500,
):
"""
    Train the SVGP satellite model (STGP_SVGP_SAT) on the given training data.
@@ -88,9 +88,8 @@ def train_svgp_sat(
batch_size (int): Batch size for training.
num_epochs (int): Number of training epochs.
"""
-    model = STGP_SVGP_SAT(M, batch_size, num_epochs)
+    model = STGP_SVGP_SAT(M, batch_size, num_epochs, random_seed=42)

# Load training data
typer.echo("Loading training data!")
# Iterate over the directories and subdirectories
for dirpath, dirnames, filenames in os.walk(root_dir):
@@ -145,11 +144,10 @@ def train_svgp_laqn(
root_dir: str,
M: int = 500,
batch_size: int = 200,
-    num_epochs: int = 2500,
+    num_epochs: int = 1000,
):
"""
    Train the SVGP LAQN model (STGP_SVGP) on the given training data.
Args:
        root_dir (str): Root directory searched for the dataset pickles.
M (int): Number of inducing variables.
@@ -178,7 +176,7 @@
with open(file_path, "rb") as file:
test_data_dict = pickle.load(file)

-    train_dict, test_dict = generate_data(train_data, test_data_dict)
+    train_dict, test_dict = generate_data_laqn(train_data, test_data_dict)
x_laqn = train_dict["laqn"]["X"]
y_laqn = train_dict["laqn"]["Y"]

@@ -214,8 +212,8 @@ def train_mrdgp(
root_dir: str,
M: Optional[int] = 500,
batch_size: Optional[int] = 200,
-    num_epochs: Optional[int] = 2500,
-    pretrain_epochs: Optional[int] = 2500,
+    num_epochs: Optional[int] = 100,
+    pretrain_epochs: Optional[int] = 100,
):
"""
    Train the MRDGP model (STGP_MRDGP) on the given training data.
@@ -277,3 +275,75 @@
}
model.fit(x_sat, y_sat, x_laqn, y_laqn, pred_laqn_data, pred_sat_data)
typer.echo("Training complete!")


@app.command()
def train_mrdgp_trf(
root_dir: str,
M: Optional[int] = 500,
batch_size: Optional[int] = 200,
num_epochs: Optional[int] = 100,
pretrain_epochs: Optional[int] = 100,
):
"""
Train the SVGP_GPF2 model on the given training data.
Args:
train_file_path (str): Path to the training data pickle file.
M (int): Number of inducing variables.
batch_size (int): Batch size for training.
num_epochs (int): Number of training epochs.
"""

model = STGP_MRDGP(
M, batch_size, num_epochs, pretrain_epochs, root_dir, random_seed=42
)
# Load training data
typer.echo("Loading training data!")
# Iterate over the directories and subdirectories
for dirpath, _, filenames in os.walk(root_dir):
# Check if 'training_dataset.pkl' exists in the current directory
if "training_dataset.pkl" in filenames:
# If found, load the data
file_path = os.path.join(dirpath, "training_dataset.pkl")
with open(file_path, "rb") as file:
train_data = pickle.load(file)

typer.echo("Loading testing data!")
for dirpath, _, filenames in os.walk(root_dir):
        # Check if 'test_dataset.pkl' exists in the current directory
if "test_dataset.pkl" in filenames:
# If found, load the data
file_path = os.path.join(dirpath, "test_dataset.pkl")
with open(file_path, "rb") as file:
test_data_dict = pickle.load(file)

train_dict, test_dict = generate_data_trf(train_data, test_data_dict)
x_laqn = train_dict["laqn"]["X"]
y_laqn = train_dict["laqn"]["Y"]
x_sat = train_dict["sat"]["X"]
y_sat = train_dict["sat"]["Y"]

pred_sat_data = {
"sat": {
"X": train_dict["sat"]["X"],
"Y": train_dict["sat"]["Y"],
},
}

pred_laqn_data = {
"hexgrid": {
"X": test_dict["hexgrid"]["X"],
"Y": None,
},
"test_laqn": {
"X": test_dict["laqn"]["X"],
"Y": None,
},
"train_laqn": {
"X": train_dict["laqn"]["X"],
"Y": train_dict["laqn"]["Y"],
},
}
model.fit(x_sat, y_sat, x_laqn, y_laqn, pred_laqn_data, pred_sat_data)
typer.echo("Training complete!")
