# shap.py

# -*- coding: utf-8 -*-
"""shap.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1lk6Yw-m8BdFxiHYU-cx-UALEeZ1zKSFn

# Setup
"""

# Dependency: the `shap` package must be installed before this file is run.
#   - in a notebook cell:  !pip install shap
#   - from a shell:        pip install shap
# A bare `pip install shap` line is a shell command, not Python, and makes the
# whole file fail to parse, so it is kept here as a comment only.

# sys
import sys
import warnings

# numpy
import numpy
import numpy as np
from numpy import loadtxt

# pandas
import pandas
import pandas as pd
from pandas.plotting import scatter_matrix

# matplotlib
import matplotlib.pyplot as plt
import matplotlib

# scikit-learn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#XGBoost
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

#shap
import shap

# Location of the CSV that the rest of the script analyses.
url = 'https://raw.githubusercontent.com/dcmesh/diabetes-ml-models/master/diabetes.csv'

# Read the remote CSV straight into a DataFrame (requires network access).
dataset = pd.read_csv(filepath_or_buffer=url)

"""# Exploratory Analysis"""

# A notebook echoes the value of a bare expression, but a plain script
# evaluates it and throws the result away, so each summary is printed
# explicitly (matching the outcome-distribution print below).

# descriptive statistics for each column
print(dataset.describe())

# (number of rows, number of columns)
print(dataset.shape)

# peek at the first 20 rows
print(dataset.head(20))

# per column: does it contain any missing value?
print(dataset.isnull().any())

# outcome distribution: number of rows for each value of 'Outcome'
print(dataset.groupby('Outcome').size())

# Box-and-whisker plot for every column: one subplot each on a 3x3 grid,
# with axes that are not shared between subplots.
dataset.plot(
    kind='box',
    subplots=True,
    layout=(3, 3),
    figsize=(12, 12),
    sharex=False,
    sharey=False,
)
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.show()

# Histogram of every column.
dataset.hist(figsize=(12, 12))
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.show()

# Pairwise scatter-plot matrix; larger figure and wider gaps between panels
# than the plots above.
scatter_matrix(frame=dataset, figsize=(18, 18))
plt.subplots_adjust(wspace=0.4, hspace=0.4)
plt.show()

"""# XGBoost Model"""

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible, so the metrics and SHAP plots computed further down do not
# change from one run to the next.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
print(len(train_data), 'train examples')
print(len(test_data), 'test examples')

# Features are the first 8 columns and the label is the last column.
# NOTE(review): assumes the CSV has exactly 8 feature columns followed by the
# label (presumably 'Outcome') -- confirm against the data file.
x_train = train_data.iloc[:, :8]
y_train = train_data.iloc[:, -1]
x_test = test_data.iloc[:, :8]
y_test = test_data.iloc[:, -1]

# Fit an XGBoost classifier (library-default hyperparameters) on the training data.
model = XGBClassifier()
model.fit(x_train, y_train)

# Predict class labels for the held-out rows. `predict` already returns
# labels, so no rounding step is needed before scoring.
y_pred = model.predict(x_test)
predictions = y_pred

# Evaluate predictions: overall accuracy, then the confusion matrix and the
# per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100.0:.2f}%")
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# The SHAP section refers to the classifier as `xgb`. The script previously
# built and fitted a second, identically configured XGBClassifier on the same
# data under that name; reuse the fitted model instead of training twice.
xgb = model

"""# Shap"""

import shap

# Load the JavaScript that SHAP's interactive plots need inside a notebook.
shap.initjs()

# Build a tree explainer for the fitted classifier and compute SHAP values
# for every training row.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(x_train)

# Force plot of the explanations across the training rows.
# NOTE(review): the return value is discarded here; presumably it is only
# rendered when a notebook displays it as the cell's result -- confirm, and
# save or display it explicitly if this file is run as a plain script.
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=x_train,
)

# Summary plot of the SHAP values over the training features.
shap.summary_plot(shap_values, features=x_train)