Skip to content

Commit

Permalink
Improve model
Browse files Browse the repository at this point in the history
  • Loading branch information
Bear-Witness-98 committed Jul 19, 2024
1 parent 415f861 commit a68d6d4
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 74 deletions.
5 changes: 1 addition & 4 deletions challenge/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,6 @@ async def post_predict(flight_data: FlightData) -> dict:
preprocessed_data = model.preprocess(flight_data_df)

# sorts column to feed the model
column_order = model._model.feature_names_in_
preprocessed_data = preprocessed_data[column_order]

pred = model.predict(preprocessed_data)

return {"predict": pred}
Expand All @@ -138,5 +135,5 @@ async def post_predict(flight_data: FlightData) -> dict:
with open("error_logs.txt", "a") as f:
f.write(f"{datetime.now(timezone.utc)}: encounter error {e}")
raise HTTPException(
status_code=500, detail="Internal server error during prediction"
status_code=500, detail="Internal server error during prediction."
)
152 changes: 82 additions & 70 deletions challenge/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,61 +6,91 @@
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

FEATURES_COLS = [
"OPERA_Latin American Wings",
"MES_7",
"MES_10",
"OPERA_Grupo LATAM",
"MES_12",
"TIPOVUELO_I",
"MES_4",
"MES_11",
"OPERA_Sky Airline",
"OPERA_Copa Air",
]

class DelayModel:
FEATURES_COLS = [
"MES_4",
"MES_7",
"MES_10",
"MES_11",
"MES_12",
"OPERA_Copa Air",
"OPERA_Grupo LATAM",
"OPERA_Latin American Wings",
"OPERA_Sky Airline",
"TIPOVUELO_I",
]

THRESHOLD_IN_MINUTES = 15

def get_min_diff(data):
fecha_o = datetime.strptime(data["Fecha-O"], "%Y-%m-%d %H:%M:%S")
fecha_i = datetime.strptime(data["Fecha-I"], "%Y-%m-%d %H:%M:%S")
min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
return min_diff
def __init__(self):
self._model = LogisticRegression()

def _get_min_diff(self, data: pd.Series) -> float:
"""
Auxiliary function to get target.
def get_delay_target(data: pd.DataFrame) -> pd.Series:
data["min_diff"] = data.apply(get_min_diff, axis=1)
threshold_in_minutes = 15
data["delay"] = np.where(data["min_diff"] > threshold_in_minutes, 1, 0)
Args:
data (pd.Series): raw data row.
return data["delay"].to_frame()
Returns:
float: difference in minutes between the "Fecha-O" and "Fecha-I" timestamps of the row.
"""
fecha_o = datetime.strptime(data["Fecha-O"], "%Y-%m-%d %H:%M:%S")
fecha_i = datetime.strptime(data["Fecha-I"], "%Y-%m-%d %H:%M:%S")
min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
return min_diff

def _get_delay_target(self, data: pd.DataFrame) -> pd.Series:
"""
Compute and return target to train the model with, from raw data.
def get_features(data: pd.DataFrame) -> pd.DataFrame:
# get the one-hot encoding of the columns suggested by the DS
features = pd.concat(
[
pd.get_dummies(data["OPERA"], prefix="OPERA"),
pd.get_dummies(data["TIPOVUELO"], prefix="TIPOVUELO"),
pd.get_dummies(data["MES"], prefix="MES"),
],
axis=1,
)
present_features = list(set(FEATURES_COLS).intersection(set(features.columns)))
missing_features = list(set(FEATURES_COLS).difference(set(features.columns)))
Args:
data (pd.DataFrame): raw data.
features = features[present_features]
Returns:
pd.Series: target to predict.
"""
data["min_diff"] = data.apply(self._get_min_diff, axis=1)
data["delay"] = np.where(data["min_diff"] > self.THRESHOLD_IN_MINUTES, 1, 0)

# fill missing features with 0 due to one-hot encoding of features
for feature in missing_features:
features[feature] = 0
return data["delay"].to_frame()

return features
def _get_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Compute and return input features to feed the model from raw data.
Args:
data (pd.DataFrame): raw_data.
class DelayModel:
def __init__(self):
self._model = LogisticRegression()
Returns:
pd.DataFrame: features with columns in a specific order.
"""
# get the one-hot encoding of the columns suggested by the DS
# the existence of these three columns is enforced by the API layer that calls this code
features = pd.concat(
[
pd.get_dummies(data["OPERA"], prefix="OPERA"),
pd.get_dummies(data["TIPOVUELO"], prefix="TIPOVUELO"),
pd.get_dummies(data["MES"], prefix="MES"),
],
axis=1,
)
valid_features = list(
set(self.FEATURES_COLS).intersection(set(features.columns))
)
missing_features = list(
set(self.FEATURES_COLS).difference(set(features.columns))
)

# keep the valid features and fill the missing ones with 0 due to one-hot encoding
features = features[valid_features]
features[missing_features] = 0

# return dataframe with sorted columns
return features[self.FEATURES_COLS]

def preprocess(
self, data: pd.DataFrame, target_column: Optional[str] = None
Expand All @@ -78,20 +108,20 @@ def preprocess(
pd.DataFrame: features.
"""
# retrieve features from the data
x = get_features(data)
x = self._get_features(data)

# return different sets, depending on the target
if target_column is None:
return x
elif target_column == "delay":
y = get_delay_target(data)
y = self._get_delay_target(data)
return (x, y)
else:
raise NotImplementedError("Only implemented 'delay' as target column")

def fit(self, features: pd.DataFrame, target: pd.DataFrame) -> None:
"""
Fit model with preprocessed data.
Fit model with data preprocessed by this class.
Args:
features (pd.DataFrame): preprocessed data.
Expand Down Expand Up @@ -130,7 +160,7 @@ def load_model(self, path: str):

def predict(self, features: pd.DataFrame) -> List[int]:
"""
Predict delays for new flights.
Predict delays for new flights on data preprocessed by this class.
Args:
features (pd.DataFrame): preprocessed data.
Expand All @@ -146,38 +176,20 @@ def predict(self, features: pd.DataFrame) -> List[int]:
return pred


if __name__ == "__main__":
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import train_test_split
def main():
# train the model on all available data for production deployment

# perform a training of the model for production deployment
# get data and initial model
model = DelayModel()
data = pd.read_csv(filepath_or_buffer="data/data.csv")

# preprocess data and fit
features, target = model.preprocess(data=data, target_column="delay")

_, features_validation, _, target_validation = train_test_split(
features, target, test_size=0.33, random_state=42
)

model.fit(features=features, target=target)

predicted_target = model.predict(features_validation)

report = classification_report(
target_validation, predicted_target, output_dict=True
)

# save
model.save_model("models")

# re instantiate to override model
model = DelayModel()
model.load_model("models")

predicted_target_load = model.predict(features_validation)

print(
"The difference in prediction is:"
f" {mean_squared_error(predicted_target, predicted_target_load)}"
)
if __name__ == "__main__":
main()

0 comments on commit a68d6d4

Please sign in to comment.