In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the later cells open 'pima.pickle.dat', 'fp.csv' and 'fn.csv' by
# relative path rather than under /content/drive — confirm the working directory.
from google.colab import drive
drive.mount('/content/drive')
In [2]:
# importing all the required basic modules

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
In [3]:
# Uncomment on a fresh runtime where shap is not yet installed
# (consider `%pip install shap==<pinned version>` so the kernel's environment is targeted):
#!pip install shap
In [4]:
# Modules needed to load the saved model and explain it.

import shap
import xgboost  # presumably required so the saved model can be unpickled — TODO confirm

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so importing it fails on current installs. Use the standalone `joblib`
# package and fall back to the vendored copy only on very old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
In [5]:
# Restore the previously saved classifier from disk.
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
clf_load = joblib.load('pima.pickle.dat')
In [6]:
# Build a tree-model SHAP explainer around the loaded classifier.
# No SHAP values are computed here; that happens when explainer.shap_values(...) is called.
explainer = shap.TreeExplainer(clf_load)
In [7]:
# Load SHAP's JavaScript visualization code into the notebook; the interactive
# force plots report "Javascript library not loaded" without it.
shap.initjs()
In [8]:
# Read the two CSV exports analysed below: fp.csv first, then fn.csv.
# (Presumably the model's false positives / false negatives — confirm with the producer of these files.)
fp_data = pd.read_csv(filepath_or_buffer="fp.csv")
fn_data = pd.read_csv(filepath_or_buffer="fn.csv")
In [9]:
# Preview the first rows of the frame read from fp.csv.
fp_data.head()
Out[9]:
Unnamed: 0 url cert_extendedKeyUsage_other cert_extendedKeyUsage_TLS_Web_Client_Authentication cert_keyUsage_other cert_keyUsage_Key_Encipherment cert_keyUsage_Digital_Signature cert_muti_domain_cert cert_wildcard_domain cert_validty_period_days ... edu.cu dp.ua yt com.pa cool police.uk others web mail webmail
0 25378 development-software.goalkeeping-development.com False True False True True False False 90 ... False False False False False False False False False False
1 91985 sitescrack.bid False True False False True True True 190 ... False False False False False False False False False False
2 196067 vinogradnik-dashevskih.com False True False True True False False 90 ... False False False False False False False False False False
3 279980 inkubator-teknologi.com False True False True True False False 90 ... False False False False False False False False False False
4 341944 happybirthdaywishes-images.com False True False True True False False 90 ... False False False False False False False False False False

5 rows × 552 columns

In [10]:
# Keep every column from position 3 onward as the input for the explainer.
# NOTE(review): according to the fp_data preview, the three columns dropped here are
# 'Unnamed: 0', 'url' and 'cert_extendedKeyUsage_other'. The last one looks like a
# feature rather than an identifier — confirm the model was trained without it.
data = fp_data.iloc[:,3:]
In [11]:
# Preview the sliced frame (columns from position 3 onward of fp_data).
data.head()
Out[11]:
cert_extendedKeyUsage_TLS_Web_Client_Authentication cert_keyUsage_other cert_keyUsage_Key_Encipherment cert_keyUsage_Digital_Signature cert_muti_domain_cert cert_wildcard_domain cert_validty_period_days cert_num_domains cert_num_distinct_domains cert_age ... edu.cu dp.ua yt com.pa cool police.uk others web mail webmail
0 True False True True False False 90 1 1 309 ... False False False False False False False False False False
1 True False False True True True 190 83 42 328 ... False False False False False False False False False False
2 True False True True False False 90 2 1 325 ... False False False False False False False False False False
3 True False True True False False 90 6 1 319 ... False False False False False False False False False False
4 True False True True False False 90 3 1 181 ... False False False False False False False False False False

5 rows × 549 columns

In [12]:
# Compute SHAP values for the rows of `data` using the tree explainer built earlier.
shap_values = explainer.shap_values(data)
In [13]:
# SHAP summary plot (default plot type) for the rows of `data`.
shap.summary_plot(shap_values, features=data)
In [14]:
# SHAP summary plot for the rows of `data`, drawn with plot_type="bar".
shap.summary_plot(shap_values, features=data, plot_type="bar")
In [15]:
# Re-load the JS visualization code in this cell before drawing the interactive plot.
shap.initjs()

# Force plot explaining the first row of `data`: the explainer's expected value plus
# that row's SHAP values (pass matplotlib=True to render without JavaScript).
shap.force_plot(explainer.expected_value, shap_values[0,:], data.iloc[0,:])
Out[15]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [16]:
# Re-load the JS visualization code in this cell before drawing the interactive plot.
shap.initjs()

# Force plot over all rows of `data` at once.
# NOTE(review): the original comment called this "the training set", but `data` at
# this point is the column slice of the frame read from fp.csv.
shap.force_plot(explainer.expected_value, shap_values, data)
Out[16]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [17]:
# SHAP dependence plot for a single named feature, "mc_url_parts_std_prob",
# across all rows of `data`.
shap.dependence_plot("mc_url_parts_std_prob", shap_values, data)
In [18]:
# Rank feature indices by global importance: the sum of absolute SHAP values
# over every row of `data`, largest first.
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))

# Draw a dependence plot for each of the 20 most important features (the old
# comment said "three", but the loop always covered 20). Iterating over a slice
# instead of indexing with range(20) also avoids an IndexError when the frame
# has fewer than 20 feature columns.
for feature_idx in top_inds[:20]:
    shap.dependence_plot(feature_idx, shap_values, data)
In [19]:
# Preview the first rows of the frame read from fn.csv.
fn_data.head()
Out[19]:
Unnamed: 0 url cert_extendedKeyUsage_other cert_extendedKeyUsage_TLS_Web_Client_Authentication cert_keyUsage_other cert_keyUsage_Key_Encipherment cert_keyUsage_Digital_Signature cert_muti_domain_cert cert_wildcard_domain cert_validty_period_days ... edu.cu dp.ua yt com.pa cool police.uk others web mail webmail
0 938 trycoba.uk False True False True True False False 90 ... False False False False False False False False False False
1 1754 pitchinside.saversites.com False True False True True False False 90 ... False False False False False False False False False False
2 4718 webdisk.personalmasterynow.com False True False True True True False 90 ... False False False False False False False True False False
3 9366 runners-cache-5.gitlab.com False True False True True False False 365 ... False False False False False False False False False False
4 9444 webdisk.techkesho.com False True False True True False False 90 ... False False False False False False False True False False

5 rows × 552 columns

In [20]:
# Rebind `data` to the fn.csv frame, keeping every column from position 3 onward.
# From here on `data` no longer refers to the fp.csv slice.
# NOTE(review): according to the fn_data preview, the three columns dropped here are
# 'Unnamed: 0', 'url' and 'cert_extendedKeyUsage_other'. The last one looks like a
# feature rather than an identifier — confirm the model was trained without it.
data = fn_data.iloc[:,3:]
In [21]:
# Preview the sliced frame (columns from position 3 onward of fn_data).
data.head()
Out[21]:
cert_extendedKeyUsage_TLS_Web_Client_Authentication cert_keyUsage_other cert_keyUsage_Key_Encipherment cert_keyUsage_Digital_Signature cert_muti_domain_cert cert_wildcard_domain cert_validty_period_days cert_num_domains cert_num_distinct_domains cert_age ... edu.cu dp.ua yt com.pa cool police.uk others web mail webmail
0 True False True True False False 90 2 1 262 ... False False False False False False False False False False
1 True False True True False False 90 15 1 187 ... False False False False False False False False False False
2 True False True True True False 90 8 2 233 ... False False False False False False False True False False
3 True False True True False False 365 2 1 213 ... False False False False False False False False False False
4 True False True True False False 90 6 1 369 ... False False False False False False False True False False

5 rows × 549 columns

In [22]:
# Compute SHAP values for the rows of the current `data` frame; this overwrites
# the `shap_values` computed earlier in the notebook.
shap_values = explainer.shap_values(data)
In [23]:
# SHAP summary plot (default plot type) for the rows of `data`.
shap.summary_plot(shap_values, features=data)
In [24]:
# SHAP summary plot for the rows of `data`, drawn with plot_type="bar".
shap.summary_plot(shap_values, features=data, plot_type="bar")
In [25]:
# Re-load the JS visualization code in this cell before drawing the interactive plot.
shap.initjs()

# Force plot explaining the first row of `data`: the explainer's expected value plus
# that row's SHAP values (pass matplotlib=True to render without JavaScript).
shap.force_plot(explainer.expected_value, shap_values[0,:], data.iloc[0,:])
Out[25]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [26]:
shap.initjs()

# Multi-row force plot over all rows of `data`, currently disabled.
# NOTE(review): the reason it was commented out is not recorded; re-enable it or
# delete this cell. (`data` here is the fn.csv slice, not a training set.)
#shap.force_plot(explainer.expected_value, shap_values, data)
In [30]:
# SHAP dependence plot for a single named feature, "domain_length",
# across all rows of `data`.
shap.dependence_plot("domain_length", shap_values, data)
In [28]:
# Rank feature indices by global importance: the sum of absolute SHAP values
# over every row of `data`, largest first.
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))

# Draw a dependence plot for each of the 20 most important features (the old
# comment said "three", but the loop always covered 20). Iterating over a slice
# instead of indexing with range(20) also avoids an IndexError when the frame
# has fewer than 20 feature columns.
for feature_idx in top_inds[:20]:
    shap.dependence_plot(feature_idx, shap_values, data)
In [ ]: