Financial Risk Prediction : Weekend Hackathon - 5 Rank-161
This notebook contains the following sections:
Data Understanding
Data Exploration
Model Building
Hyperparameter tuning
Final submission
import numpy as np
import pandas as pd
import matplotlib .pyplot as plt
import seaborn as sns
# Load the training split of the hackathon dataset and peek at it.
train_path = './Financial_Risk_Participants_Data/Train.csv'
raw_train = pd.read_csv(train_path)
raw_train.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
City
Location_Score
Internal_Audit_Score
External_Audit_Score
Fin_Score
Loss_score
Past_Results
IsUnderRisk
0
2
8.032
14
8
3
6
0
1
1
31
77.730
8
3
3
8
1
0
2
40
59.203
3
12
11
3
0
1
3
12
73.080
4
5
7
6
0
0
4
4
15.666
13
15
6
7
2
1
# Load the unlabeled test split (no IsUnderRisk column) and peek at it.
test_path = './Financial_Risk_Participants_Data/Test.csv'
raw_test = pd.read_csv(test_path)
raw_test.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
City
Location_Score
Internal_Audit_Score
External_Audit_Score
Fin_Score
Loss_score
Past_Results
0
41
18.272
13
12
9
7
0
1
17
64.799
6
10
7
4
1
2
31
68.890
3
3
7
8
0
3
3
16.492
15
10
7
4
1
4
9
17.178
7
3
8
3
1
City 0
Location_Score 0
Internal_Audit_Score 0
External_Audit_Score 0
Fin_Score 0
Loss_score 0
Past_Results 0
IsUnderRisk 0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543 entries, 0 to 542
Data columns (total 8 columns):
City 543 non-null int64
Location_Score 543 non-null float64
Internal_Audit_Score 543 non-null int64
External_Audit_Score 543 non-null int64
Fin_Score 543 non-null int64
Loss_score 543 non-null int64
Past_Results 543 non-null int64
IsUnderRisk 543 non-null int64
dtypes: float64(1), int64(7)
memory usage: 34.0 KB
# Annotated heatmap of pairwise feature correlations on the training data.
plt.figure(figsize=(12, 9))
sns.heatmap(raw_train.corr(), annot=True)
plt.show()
# Partition the columns by kind: numeric features with fewer than 10
# distinct values are treated as discrete, the rest as continuous;
# object-dtype columns are categorical.  The target column is excluded.
target = 'IsUnderRisk'
numeric = [col for col in raw_train.columns
           if raw_train[col].dtype != 'O' and col != target]
discrete = [col for col in numeric if raw_train[col].nunique() < 10]
continuous = [col for col in numeric if col not in discrete]
# categorical
categorical = [col for col in raw_train.columns if raw_train[col].dtype == 'O']
print('There are {} discrete variables'.format(len(discrete)))
print('There are {} continuous variables'.format(len(continuous)))
print('There are {} categorical variables'.format(len(categorical)))
There are 1 discrete variables
There are 6 continuous variables
There are 0 categorical variables
['City',
'Location_Score',
'Internal_Audit_Score',
'External_Audit_Score',
'Fin_Score',
'Loss_score']
# Pairwise scatter/histogram grid over all training columns.
# (seaborn creates its own figure, so the plt.figure call is cosmetic.)
plt.figure(figsize=(12, 9))
sns.pairplot(raw_train)
plt.show()
<Figure size 864x648 with 0 Axes>
def sub_file(filename, preds):
    """Write the prediction array *preds* to *filename* as a two-column
    ('0', '1') Excel submission file and echo the first rows."""
    submission = pd.DataFrame(preds, columns=['0', '1'])
    submission.to_excel(filename, index=False)
    print(submission.head())
# Feature matrix and binary target.
X = raw_train.drop('IsUnderRisk', axis=1)
y = raw_train['IsUnderRisk']
from sklearn .linear_model import LogisticRegression
from sklearn .svm import SVC
from sklearn .ensemble import RandomForestClassifier ,GradientBoostingClassifier , AdaBoostClassifier ,VotingClassifier
from sklearn .tree import DecisionTreeClassifier ,plot_tree
from xgboost import XGBClassifier
from sklearn .metrics import confusion_matrix ,accuracy_score ,roc_auc_score ,log_loss ,make_scorer
from sklearn .model_selection import train_test_split , cross_val_score ,GridSearchCV , RandomizedSearchCV
log_loss_scorer = make_scorer (log_loss )
def build_model(model, X, y):
    """Fit *model* on a stratified 80/20 split of (X, y) and print
    train/test log loss, train/test accuracy, the test confusion
    matrix, and the mean 10-fold cross-validation log loss.

    Log loss is computed from predicted probabilities when the model
    exposes predict_proba; scoring hard 0/1 labels would produce a
    degenerate, inflated log loss.  Models without predict_proba
    (e.g. a hard-voting ensemble) fall back to label predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    # Prefer probability estimates for a meaningful log loss.
    if hasattr(model, 'predict_proba'):
        train_scores = model.predict_proba(X_train)
        test_scores = model.predict_proba(X_test)
    else:
        train_scores, test_scores = y_train_pred, y_pred
    print("Training Log Loss: : ", log_loss(y_train, train_scores))
    print("Testing Log Loss: : ", log_loss(y_test, test_scores))
    print("Training accuracy: : ", accuracy_score(y_train, y_train_pred))
    print("Testing accuracy: : ", accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    cv_results = cross_val_score(model, X, y, cv=10, n_jobs=-1, scoring=log_loss_scorer)
    print("Cross validation Log Loss:", cv_results.mean())
from sklearn.preprocessing import StandardScaler

# Standardise features (zero mean, unit variance) — logistic regression
# is sensitive to feature scale — then evaluate a baseline model.
std = StandardScaler()
X_std = std.fit_transform(X)
lr = LogisticRegression()
build_model(lr, X_std, y)
Training Log Loss: : 6.36666658039777
Testing Log Loss: : 4.4362171294801
Training accuracy: : 0.815668202764977
Testing accuracy: : 0.8715596330275229
[[35 6]
[ 8 60]]
Cross validation Log Loss: 6.67873178982795
# Grid over both regularisation types.  LogisticRegression's default
# lbfgs solver only supports l2; l1 candidates must use liblinear,
# otherwise every l1 fit errors out and is scored as NaN.
lr_params = [
    {"penalty": ['l2'],
     "C": [0.001, 0.01, 0.1, 1, 10, 100]},
    {"penalty": ['l1'],
     "C": [0.001, 0.01, 0.1, 1, 10, 100],
     "solver": ['liblinear']},
]
lr_grid = GridSearchCV(estimator=lr,
                       param_grid=lr_params,
                       cv=10, verbose=2, n_jobs=-1)
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 6.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 7.5s finished
GridSearchCV(cv=10, error_score=nan,
estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs',
tol=0.0001, verbose=0,
warm_start=False),
iid='deprecated', n_jobs=-1,
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
'penalty': ['l1', 'l2']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=2)
print ("Grid_search best score:" ,lr_grid .best_score_ )
Grid_search best score: 0.8067676767676769
{'C': 0.01, 'penalty': 'l2'}
build_model (lr_grid .best_estimator_ ,X ,y )
Training Log Loss: : 6.685000001537178
Testing Log Loss: : 5.069963481656719
Training accuracy: : 0.8064516129032258
Testing accuracy: : 0.8532110091743119
[[34 7]
[ 9 59]]
Cross validation Log Loss: 6.486852633294605
# Re-standardise and evaluate a baseline RBF SVC.  probability=True
# enables predict_proba (needed for log-loss reporting).
std = StandardScaler()
X_std = std.fit_transform(X)
svc = SVC(probability=True)
build_model(svc, X_std, y)
Training Log Loss: : 4.377059668396568
Testing Log Loss: : 4.4361951222113785
Training accuracy: : 0.8732718894009217
Testing accuracy: : 0.8715596330275229
[[38 3]
[11 57]]
Cross validation Log Loss: 5.090145364653574
# Hyper-parameter search space for the RBF SVC: regularisation
# strength, kernel width heuristic, and class weighting.
svc_params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "gamma": ['auto', 'scale'],
    "class_weight": ['balanced', None],
}
svc_grid = GridSearchCV(
    estimator=svc,
    param_grid=svc_params,
    cv=10,
    verbose=2,
    n_jobs=-1,
)
Fitting 10 folds for each of 28 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 58 tasks | elapsed: 2.0s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 11.4s finished
GridSearchCV(cv=10, error_score=nan,
estimator=SVC(C=1.0, break_ties=False, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='scale', kernel='rbf', max_iter=-1,
probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='deprecated', n_jobs=-1,
param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'class_weight': ['balanced', None],
'gamma': ['auto', 'scale']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=2)
{'C': 10, 'class_weight': 'balanced', 'gamma': 'auto'}
build_model (svc_grid .best_estimator_ ,X_std ,y )
Training Log Loss: : 3.103722299058151
Testing Log Loss: : 4.436187786455138
Training accuracy: : 0.9101382488479263
Testing accuracy: : 0.8715596330275229
[[39 2]
[12 56]]
Cross validation Log Loss: 4.709853740451932
3. RandomForestClassifier
# Baseline random forest on the raw features (tree models do not
# require standardisation).
rf = RandomForestClassifier()
build_model(rf, X, y)
Training Log Loss: : 9.992007221626413e-16
Testing Log Loss: : 3.4855792618245687
Training accuracy: : 1.0
Testing accuracy: : 0.8990825688073395
[[39 2]
[ 9 59]]
Cross validation Log Loss: 5.2785444787755615
# Randomised search space for the random forest.
# min_samples_split must be an int >= 2 (or a float fraction in (0, 1]);
# the original list also contained 1 and None, which make the fit raise
# and waste sampled candidates as NaN-scored failures.
rf_params = {
    "n_estimators": [120, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30, None],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["log2", "sqrt", None],
}
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=rf_params,
                               n_iter=50, cv=10, verbose=2,
                               random_state=4, n_jobs=-1)
Fitting 10 folds for each of 50 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 51.3s
[Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 2.3min
[Parallel(n_jobs=-1)]: Done 357 tasks | elapsed: 4.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 5.8min finished
RandomizedSearchCV(cv=10, error_score=nan,
estimator=RandomForestClassifier(bootstrap=True,
ccp_alpha=0.0,
class_weight=None,
criterion='gini',
max_depth=None,
max_features='auto',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_job...
warm_start=False),
iid='deprecated', n_iter=50, n_jobs=-1,
param_distributions={'max_depth': [5, 8, 15, 25, 30, None],
'max_features': ['log2', 'sqrt', None],
'min_samples_leaf': [1, 2, 5, 10],
'min_samples_split': [1, 2, 5, 10, 15,
None],
'n_estimators': [120, 300, 500, 800,
1200]},
pre_dispatch='2*n_jobs', random_state=4, refit=True,
return_train_score=False, scoring=None, verbose=2)
{'n_estimators': 500,
'min_samples_split': 2,
'min_samples_leaf': 2,
'max_features': 'sqrt',
'max_depth': 5}
build_model (rf_random .best_estimator_ ,X ,y )
Training Log Loss: : 3.342469601327119
Testing Log Loss: : 4.1193256140011885
Training accuracy: : 0.9032258064516129
Testing accuracy: : 0.8807339449541285
[[38 3]
[10 58]]
Cross validation Log Loss: 4.831973690775467
plt .barh (y = X .columns ,width = rf_random .best_estimator_ .feature_importances_ * 100 )
<BarContainer object of 7 artists>
4. GradientBoosting Classifier
# Baseline gradient-boosting classifier with default hyper-parameters.
gb = GradientBoostingClassifier()
build_model(gb, X, y)
Training Log Loss: : 1.3529087490857206
Testing Log Loss: : 3.4855865975808085
Training accuracy: : 0.9608294930875576
Testing accuracy: : 0.8990825688073395
[[38 3]
[ 8 60]]
Cross validation Log Loss: 4.516827815637886
{'ccp_alpha': 0.0,
'criterion': 'friedman_mse',
'init': None,
'learning_rate': 0.1,
'loss': 'deviance',
'max_depth': 3,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_impurity_split': None,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_iter_no_change': None,
'presort': 'deprecated',
'random_state': None,
'subsample': 1.0,
'tol': 0.0001,
'validation_fraction': 0.1,
'verbose': 0,
'warm_start': False}
# Randomised search space for gradient boosting.
# As with the forest search, min_samples_split must be an int >= 2 (or a
# float fraction); 1 and None raise at fit time and waste candidates.
gb_params = {
    "learning_rate": [0.01, 0.015, 0.025, 0.05, 0.1],
    "max_depth": [3, 5, 7, 9, 12, 14],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["log2", "sqrt", None],
    "n_estimators": [120, 300, 500, 800, 1200],
}
gb_random = RandomizedSearchCV(estimator=gb,
                               param_distributions=gb_params,
                               n_iter=25, cv=10, verbose=2,
                               random_state=4, n_jobs=-1)
Fitting 10 folds for each of 25 candidates, totalling 250 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 33 tasks | elapsed: 35.9s
[Parallel(n_jobs=-1)]: Done 154 tasks | elapsed: 2.6min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 3.3min finished
RandomizedSearchCV(cv=10, error_score=nan,
estimator=GradientBoostingClassifier(ccp_alpha=0.0,
criterion='friedman_mse',
init=None,
learning_rate=0.1,
loss='deviance',
max_depth=3,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_it...
param_distributions={'learning_rate': [0.01, 0.015, 0.025,
0.05, 0.1],
'max_depth': [3, 5, 7, 9, 12, 14],
'max_features': ['log2', 'sqrt', None],
'min_samples_leaf': [1, 2, 5, 10],
'min_samples_split': [1, 2, 5, 10, 15,
None],
'n_estimators': [120, 300, 500, 800,
1200],
'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]},
pre_dispatch='2*n_jobs', random_state=4, refit=True,
return_train_score=False, scoring=None, verbose=2)
{'subsample': 0.7,
'n_estimators': 300,
'min_samples_split': 2,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 3,
'learning_rate': 0.1}
build_model (gb_random .best_estimator_ ,X ,y )
Training Log Loss: : 0.15916671056970505
Testing Log Loss: : 4.11934762126991
Training accuracy: : 0.9953917050691244
Testing accuracy: : 0.8807339449541285
[[35 6]
[ 7 61]]
Cross validation Log Loss: 5.087831178555844
# Ensemble the two tuned tree models by majority vote (default hard voting).
vote_clf = VotingClassifier([
    ("randomForest", rf_random.best_estimator_),
    ("GradientBoost", gb_random.best_estimator_),
])
build_model(vote_clf, X, y)
Training Log Loss: : 3.103714929496583
Testing Log Loss: : 4.753057294665327
Training accuracy: : 0.9101382488479263
Testing accuracy: : 0.8623853211009175
[[39 2]
[13 55]]
Cross validation Log Loss: 4.707530750705729
# Summary chart: mean 10-fold CV log loss for every candidate model
# (values copied from the runs above; lower is better).
models = ["Logistic", "SVM", "RandomForest", "GradientBoosting", "VotingClassifier"]
cross_vals = [
    6.67873178982795,
    5.090145364653574,
    5.2785444787755615,
    4.516827815637886,
    4.707530750705729,
]
plt.figure(figsize=(12, 9))
plt.bar(x=models, height=cross_vals)
plt.title("Log Loss of cross validation")
plt.show()