-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
100 lines (82 loc) · 2.86 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Stacking classification model (random forest, XGBoost and linear SVC) for NHL game prediction."""
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.utils import class_weight
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from data_processor import load_and_process_data
import pandas as pd
from sklearn.metrics import classification_report
import argparse
import pickle
import math
def train_random_forest_model(data: pd.DataFrame):
    """Train and evaluate a stacked classifier on processed game data.

    Despite its name, this function returns a ``StackingClassifier`` that
    combines a random forest, an XGBoost classifier and a linear SVC, with a
    logistic regression as the final estimator.

    Rows whose ``gameDate`` year is before 2020 form the training set; rows
    from 2020 onwards form the test set. The accuracy score and the
    classification report for the test set are printed to stdout.

    Args:
        data: Frame with a datetime-like ``gameDate`` column, an ``outcome``
            column labelled 0/1, and every column named in ``predictors``.

    Returns:
        The fitted ``StackingClassifier``.

    Raises:
        ValueError: If the training split does not contain both outcome
            classes.
    """
    training_data = data[data["gameDate"].dt.year < 2020]
    testing_data = data[data["gameDate"].dt.year >= 2020]

    predictors = [
        "home_or_away",
        "dayOfWeek",
        "daysSinceLastGame",
        "goalsAgainst_rolling",
        "goalsFor_rolling",
        "shotAttemptsFor_rolling",
        "penaltiesFor_rolling",
        "missedShotsFor_rolling",
        "faceOffsWonFor_rolling",
        "corsiPercentage_rolling",
        "homeAdvantage",
    ]

    X_train = training_data[predictors]
    y_train = training_data["outcome"]
    X_test = testing_data[predictors]
    y_test = testing_data["outcome"]

    # Count each class by its label. ``value_counts`` orders its result by
    # frequency, so indexing it positionally yields majority/minority rather
    # than negative/positive, which is what ``scale_pos_weight`` expects.
    negatives = int((y_train == 0).sum())
    positives = int((y_train == 1).sum())
    if negatives == 0 or positives == 0:
        raise ValueError(
            "training data must contain both outcome classes "
            f"(found {negatives} of class 0 and {positives} of class 1)"
        )
    # Square root of the negative/positive ratio, as in the original code.
    scale_pos_weight = math.sqrt(negatives / positives)

    # Same per-class weighting for every scikit-learn estimator in the stack.
    class_weights = {0: 0.677, 1: 1}

    estimators = [
        (
            "rf",
            RandomForestClassifier(
                random_state=44,
                n_estimators=250,
                n_jobs=10,
                class_weight=class_weights,
            ),
        ),
        (
            # The key is kept as "histgb" so lookups by estimator name on
            # the fitted stack keep working; the estimator is XGBoost.
            "histgb",
            XGBClassifier(
                random_state=44,
                learning_rate=0.01,
                max_depth=12,
                scale_pos_weight=scale_pos_weight,
                n_jobs=10,
            ),
        ),
        ("svc", LinearSVC(random_state=44, class_weight=class_weights)),
    ]

    model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(
            random_state=44, class_weight=class_weights
        ),
    )
    model.fit(X_train, y_train)

    # Report hold-out performance on the 2020+ games.
    preds = model.predict(X_test)
    print(accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))
    return model
if __name__ == "__main__":
    # Command-line entry point: ``train <data_file> <model_file>``.
    parser = argparse.ArgumentParser(description="NHL Game Predictor")
    subparsers = parser.add_subparsers(dest="command")

    # Train command: fit the model on a data file and pickle it to disk.
    train_parser = subparsers.add_parser("train", help="Train the random forest model")
    train_parser.add_argument("data_file", type=str, help="Path to the data file")
    train_parser.add_argument(
        "model_file", type=str, help="Path to where you want to save the model file"
    )

    args = parser.parse_args()

    if args.command == "train":
        data = load_and_process_data(args.data_file)
        model = train_random_forest_model(data)
        # Persist the fitted model so it can be reloaded later.
        with open(args.model_file, "wb") as f:
            pickle.dump(model, f)
    else:
        # No sub-command was given: show usage instead of exiting silently.
        parser.print_help()