-
Notifications
You must be signed in to change notification settings - Fork 1
/
train.py
64 lines (59 loc) · 1.44 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import supervised
from sklearn import tree, ensemble, neural_network
from sklearn.externals import joblib
# Load the full feature table; the columns used for training are chosen below.
df = pd.read_csv("dataset.csv")
# Uncomment to experiment on the first 1000 rows only.
#df = df.iloc[0:1000,:].reset_index(drop=True)

# Hand-picked base features to train on.  Commented-out entries are
# candidates that are currently switched off.
subset = [
    "secondsToArrival",
    "distance",
    "layover",
    #"isDeparture",
    #"classMode",
    #"morningRush",
    #"eveningRush",
    "minutesIntoDay",
    #"kmperhr",
    #"dayOfWeek",
    #"pressure"
]

# Other candidate feature names, kept for reference only:
#   "kmperhr", "numBuses", "layover", "isDeparture", "secondsToArrival",
#   "temperature", "pressure", "humidity", "visibility", "wind",
#   "cloudCoverage", "dayOfWeek", "hour", "month", "minutesIntoDay",
#   "isWeekend", "distance", "cluster"

# Add onehot features: every column whose name starts with "is_".
onehot_cols = [x for x in df.columns if x.startswith("is_")]

# Drop the last 8 one-hot columns (the original note calls these the weather
# variables).  The trim is applied to the one-hot list itself rather than to
# the combined list, so a dataset with fewer than 8 "is_" columns can no
# longer silently eat into the base features above.  With 8 or more one-hot
# columns the result is identical to slicing the combined list.
# NOTE(review): this still relies on the weather columns being the last 8
# "is_" columns in the CSV -- confirm against the dataset builder.
subset.extend(onehot_cols[:-8])

# Add class (the regression target) as the final column.
subset.append("actualSecondsToArrival")

print(subset)
print(len(subset))

# Keep only the selected features plus the target.
df = df[subset]
# Options handed to the `supervised` training/evaluation calls.
# NOTE(review): presumably these toggle PCA and k-best feature selection
# inside `supervised` -- confirm there; this script only passes them through.
config = dict(
    do_pca=False,
    pca_only=False,
    kbest="all",
)
# Uncomment to write the column-filtered frame to disk for inspection.
#df.to_csv("dataset_clean.csv", index=False)

# Base model: a single regression tree.
# (min_samples_split=500 was tried as an extra argument and is switched off.)
# NOTE(review): criterion="mse" and base_estimator= are the spellings used by
# older scikit-learn releases; later releases rename them to "squared_error"
# and estimator=.  Revisit both if the scikit-learn version is upgraded.
learner1 = tree.DecisionTreeRegressor(
    criterion="mse",
)

# Bagging ensemble of 150 such trees; max_samples=0.03 means each tree is
# fitted on a sample of 3% of the training rows.
learner = ensemble.BaggingRegressor(
    base_estimator=learner1,
    max_samples=0.03,
    n_estimators=150,
)

# Train via the project's helper (percent_train=0.80 -- presumably an 80/20
# train/test split; confirm in `supervised`) and persist whatever it returns.
model = supervised.train_test_split(
    df,
    learner,
    config,
    percent_train=0.80,
)
joblib.dump(model, 'longnight.pkl')

# Afterwards, run the project's rolling k-fold routine with the same learner.
supervised.rolling_kfold(
    df,
    learner,
    config,
    partitions=10,
    window=5,
)