This repository has been archived by the owner on Apr 13, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
shap.py
131 lines (98 loc) · 2.97 KB
/
shap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""shap.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1lk6Yw-m8BdFxiHYU-cx-UALEeZ1zKSFn
# Setup
"""
# NOTE: a bare `pip install shap` line is a shell command, not Python, and makes
# this file fail with a SyntaxError before anything runs (presumably it was a
# notebook shell cell in the original Colab notebook). Install the dependency
# from a shell before running this script:
#     pip install shap
# sys
import sys
import warnings
# numpy
import numpy
import numpy as np
from numpy import loadtxt
# pandas
import pandas
import pandas as pd
from pandas.plotting import scatter_matrix
# matplotlib
import matplotlib.pyplot as plt
import matplotlib
# scikit-learn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#XGBoost
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
#shap
import shap
# Location of the CSV that pandas downloads and parses into `dataset`.
url = 'https://raw.githubusercontent.com/dcmesh/diabetes-ml-models/master/diabetes.csv'
dataset = pd.read_csv(url)

"""# Exploratory Analysis"""
# When this file runs as a script, the value of a bare expression statement is
# discarded, so every summary is printed explicitly.
print(dataset.describe())
print(dataset.shape)
# peek
print(dataset.head(20))
# per-column flag: does the column contain any missing value?
print(dataset.isnull().any())
# outcome distribution
print(dataset.groupby('Outcome').size())
def _show_spaced(gap):
    """Set both the vertical and horizontal subplot spacing to *gap*, then show the figure."""
    plt.subplots_adjust(hspace=gap, wspace=gap)
    plt.show()


# box and whisker plots, drawn as subplots on a 3x3 layout with unshared axes
dataset.plot(
    kind='box',
    subplots=True,
    layout=(3, 3),
    figsize=(12, 12),
    sharex=False,
    sharey=False,
)
_show_spaced(0.3)

# histograms
dataset.hist(figsize=(12, 12))
_show_spaced(0.3)

# scatter plot matrix (larger canvas, slightly wider spacing)
scatter_matrix(dataset, figsize=(18, 18))
_show_spaced(0.4)
"""# XGBoost Model"""
# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore every metric printed below, is reproducible between runs.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
print(len(train_data), 'train examples')
print(len(test_data), 'test examples')

# The first eight columns are used as features, the last column as the target.
x_train = train_data.iloc[:, :8]
y_train = train_data.iloc[:, -1]
x_test = test_data.iloc[:, :8]
y_test = test_data.iloc[:, -1]

# Fit a single default-parameter classifier. The script previously trained two
# identical models on the same data; `model` is kept as an alias so both names
# remain available.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
model = xgb

# make predictions for test data; predict() yields class labels directly, so
# no rounding step is needed before scoring.
y_pred = xgb.predict(x_test)
predictions = y_pred

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
"""# Shap"""
import shap

# load JS visualization code to notebook
shap.initjs()

# Wrap the fitted classifier in a tree explainer and compute SHAP values for
# the training features.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(x_train)

# Visualize prediction explanation, relative to the explainer's expected value.
# NOTE(review): the object returned by force_plot is not captured or saved;
# presumably it only renders when this runs as a notebook cell — confirm.
shap.force_plot(
    explainer.expected_value,
    shap_values=shap_values,
    features=x_train,
)

# Shap summary
shap.summary_plot(shap_values, features=x_train)