# Mount Google Drive into the Colab runtime so files stored there
# (model pickle, CSV exports) are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# importing all the required basic modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#!pip install shap
# importing few other required modules
import shap
import xgboost
# Load the previously trained tree model (XGBoost, given the import above).
# FIX: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import the standalone `joblib` package instead.
import joblib

clf_load = joblib.load('pima.pickle.dat')
# Build a TreeExplainer for the loaded tree-based model; SHAP values are
# computed later, per dataset.
explainer = shap.TreeExplainer(clf_load)
# load JS visualization code to notebook
shap.initjs()
# False-positive and false-negative sample sets exported earlier.
# NOTE(review): assumes both CSVs carry the model's features starting at
# column 3 — confirm against the export script.
fp_data = pd.read_csv("fp.csv")
fn_data = pd.read_csv("fn.csv")
fp_data.head()
# Drop the first three columns (presumably ids/labels — TODO confirm) and
# keep only the feature columns for the explainer.
data = fp_data.iloc[:,3:]
data.head()
# compute SHAP values for every false-positive sample
shap_values = explainer.shap_values(data)
# Beeswarm summary: distribution of each feature's SHAP values over all rows.
shap.summary_plot(shap_values, data)
# Bar summary: mean |SHAP| per feature, i.e. global importance ranking.
shap.summary_plot(shap_values, data, plot_type="bar")
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], data.iloc[0,:])
shap.initjs()
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values, data)
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("mc_url_parts_std_prob", shap_values, data)
# Sort feature indexes by importance in the model — sum of SHAP value
# magnitudes over this dataset, most important first.
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))
# Dependence plots for the 20 most important features.
# FIX: the loop body was not indented under the `for` (IndentationError in
# a .py file) and the old comment said "three" while the loop ran 20.
# Slicing instead of range(20) also avoids an IndexError when the data
# has fewer than 20 features.
for ind in top_inds[:20]:
    shap.dependence_plot(ind, shap_values, data)
fn_data.head()
# Same slicing as the false-positive set: keep only feature columns.
data = fn_data.iloc[:,3:]
data.head()
# compute SHAP values for every false-negative sample
shap_values = explainer.shap_values(data)
# Beeswarm summary: distribution of each feature's SHAP values over all rows.
shap.summary_plot(shap_values, data)
# Bar summary: mean |SHAP| per feature, i.e. global importance ranking.
shap.summary_plot(shap_values, data, plot_type="bar")
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], data.iloc[0,:])
shap.initjs()
# visualize the training set predictions
# NOTE(review): whole-set force plot left disabled by the author.
#shap.force_plot(explainer.expected_value, shap_values, data)
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("domain_length", shap_values, data)
# Sort feature indexes by importance in the model — sum of SHAP value
# magnitudes over this dataset, most important first.
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))
# Dependence plots for the 20 most important features.
# FIX: the loop body was not indented under the `for` (IndentationError in
# a .py file) and the old comment said "three" while the loop ran 20.
# Slicing instead of range(20) also avoids an IndexError when the data
# has fewer than 20 features.
for ind in top_inds[:20]:
    shap.dependence_plot(ind, shap_values, data)