import numpy as np
from sklearn.cluster import KMeans
from shap.common import DenseData

def kmeans(X, k, round_values=True):
    """ Summarize a dataset with k mean samples weighted by the number of data points they each represent.
    """
    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    if round_values:
        # snap each cluster center to the nearest real data value in every column,
        # so the summary points are valid (e.g. integer-coded) feature values
        for i in range(k):
            for j in range(X.shape[1]):
                ind = np.argmin(np.abs(X[:,j] - kmeans.cluster_centers_[i,j]))
                kmeans.cluster_centers_[i,j] = X[ind,j]
    return DenseData(kmeans.cluster_centers_, group_names, None, 1.0*np.bincount(kmeans.labels_))
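A brief usage sketch for the helper above: summarize a background dataset with kmeans before handing it to KernelExplainer. The toy data and model below are illustrative assumptions, not part of the original module.

import numpy as np
import shap

X_train = np.random.RandomState(0).randn(200, 4)   # assumed: any (n_samples, n_features) array
model = lambda X: X[:, 0] - 2.0 * X[:, 1]          # assumed: any f(X) -> (n_samples,) function

background = shap.kmeans(X_train, 10)              # 10 weighted representative samples
explainer = shap.KernelExplainer(model, background)
shap_values = explainer.shap_values(X_train[:3])   # attributions for the first 3 rows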
class KernelExplainer(Explainer):
"""Uses the Kernel SHAP method to explain the output of any function.
Kernel SHAP is a method that uses a special weighted linear regression
to compute the importance of each feature. The computed importance values
are Shapley values from game theory and also coefficients from a local linear
regression.
Parameters
----------
model : function or iml.Model
User supplied function that takes a matrix of samples (# samples x # features) and
computes the output of the model for those samples. The output can be a vector
(# samples) or a matrix (# samples x # model outputs).
data : numpy.array or pandas.DataFrame or shap.common.DenseData or any scipy.sparse matrix
    The background dataset to use for integrating out features.
"""
def custom_record_gradient(op_name, inputs, attrs, results, name):
    """ Overrides tensorflow.python.eager.backprop._record_gradient.

    To keep ResourceGather ops from being pruned from the gradient backprop process, we
    temporarily "lie" about their integer input type. We then reset the type directly
    afterwards back to what it was (an integer type).
    """
    reset_input = False
    if op_name == "ResourceGather" and inputs[1].dtype == tf.int32:
        inputs[1].__dict__["_dtype"] = tf.float32
        reset_input = True
    out = tf_backprop._record_gradient(op_name, inputs, attrs, results, name)
    if reset_input:
        inputs[1].__dict__["_dtype"] = tf.int32
    return out
class TFDeepExplainer(Explainer):
"""
Using tf.gradients to implement the backpropagation was
inspired by the gradient-based implementation approach proposed by Ancona et al., ICLR 2018. Note
that this package does not currently use the reveal-cancel rule for ReLU units proposed in DeepLIFT.
"""
    def __init__(self, model, data, session=None, learning_phase_flags=None):
        """ An explainer object for a deep model using a given background dataset.

        Note that the complexity of the method scales linearly with the number of background data
        samples. Passing the entire training dataset as `data` will give very accurate expected
        values, but be unreasonably expensive. The standard error of the expectation estimates
        scales roughly as 1/sqrt(N) for N background data samples, so 100 samples will give a good
        estimate, and 1000 samples a very good estimate of the expected values.
        """

    def shap_values(self, X, nsamples=200, ranked_outputs=None, output_rank_order="max", rseed=None):
        """ Return SHAP value estimates for the model applied to the data X.

        Parameters
        ----------
        rseed : None or int
            Seeding the randomness in shap value computation (background example choice,
            interpolation between current and background example, smoothing).

        Returns
        -------
        For a model with a single output this returns a tensor of SHAP values with the same shape
        as X. For a model with multiple outputs this returns a list of SHAP value tensors, each of
        which is the same shape as X. If ranked_outputs is None then this list of tensors matches
        the number of model outputs. If ranked_outputs is a positive integer a pair is returned
        (shap_values, indexes), where shap_values is a list of tensors with a length of
        ranked_outputs, and indexes is a matrix that tells for each sample which output indexes
        were chosen as "top".
        """
        return self.explainer.shap_values(X, nsamples, ranked_outputs, output_rank_order, rseed)
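To make the ranked_outputs pair concrete, a hedged sketch using GradientExplainer; the tiny model and random data are illustrative assumptions.

import numpy as np
import shap
import tensorflow as tf

# a tiny 3-output model on random data, purely for illustration
X = np.random.RandomState(0).randn(200, 8).astype(np.float32)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(3, activation="softmax"),
])

explainer = shap.GradientExplainer(model, X)
shap_values, indexes = explainer.shap_values(X[:4], ranked_outputs=2)
# shap_values: list of 2 arrays (top-2 outputs per sample), each shaped like X[:4];
# indexes: a (4, 2) matrix naming which outputs ranked "top" for each sample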
class _TFGradientExplainer(Explainer):

    def __init__(self, model, data, session=None, batch_size=50, local_smoothing=0):

        # try and import keras and tensorflow
        global tf, keras
        if tf is None:
            import tensorflow as tf
            if LooseVersion(tf.__version__) < LooseVersion("1.4.0"):
                warnings.warn("Your TensorFlow version is older than 1.4.0 and not supported.")
        if keras is None:
            try:
                import keras
                if LooseVersion(keras.__version__) < LooseVersion("2.1.0"):
                    warnings.warn("Your Keras version is older than 2.1.0 and not supported.")
            except ImportError:
                pass
try:
    import pyspark
except ImportError as e:
    record_import_error("pyspark", "PySpark could not be imported!", e)
output_transform_codes = {
"identity": 0,
"logistic": 1,
"logistic_nlogloss": 2,
"squared_loss": 3
}
feature_perturbation_codes = {
"interventional": 0,
"tree_path_dependent": 1,
"global_path_dependent": 2
}
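These two dictionaries translate the user-facing string options into the integer codes passed down to the compiled Tree SHAP routines. A minimal sketch of how such options are typically validated before encoding; the helper below is hypothetical, not part of shap's API:

def _encode_option(value, codes, option_name):
    # hypothetical validation helper, for illustration only
    if value not in codes:
        raise ValueError("unknown %s %r, expected one of %s" % (option_name, value, sorted(codes)))
    return codes[value]

assert _encode_option("interventional", feature_perturbation_codes, "feature_perturbation") == 0
assert _encode_option("logistic", output_transform_codes, "output_transform") == 1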
class TreeExplainer(Explainer):
"""Uses Tree SHAP algorithms to explain the output of ensemble tree models.
Tree SHAP is a fast and exact method to estimate SHAP values for tree models and ensembles of trees,
under several different possible assumptions about feature dependence. It depends on fast C++
implementations either inside an external model package or in the local compiled C extension.
Parameters
----------
model : model object
The tree-based machine learning model that we want to explain. XGBoost, LightGBM, CatBoost, PySpark
and most tree-based scikit-learn models are supported.
data : numpy.array or pandas.DataFrame
The background dataset to use for integrating out features. This argument is optional when
feature_perturbation="tree_path_dependent", since in that case we can use the number of training
samples that went down each tree path as our background dataset (this is recorded in the model object).
"""
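A short usage sketch covering both feature_perturbation settings; the xgboost model and toy data are illustrative assumptions.

import numpy as np
import shap
import xgboost

X = np.random.RandomState(0).randn(500, 6)
y = X[:, 0] + X[:, 1] ** 2
model = xgboost.XGBRegressor(n_estimators=50).fit(X, y)

# path-dependent: no background data needed, coverage is read from the model itself
sv_path = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent").shap_values(X[:10])

# interventional: integrates out features against an explicit background dataset
sv_int = shap.TreeExplainer(model, data=X[:100], feature_perturbation="interventional").shap_values(X[:10])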
import numpy as np
import warnings
from .explainer import Explainer
from distutils.version import LooseVersion
keras = None
tf = None
torch = None
class GradientExplainer(Explainer):
""" Explains a model using expected gradients (an extension of integrated gradients).
Expected gradients is an extension of the integrated gradients method (Sundararajan et al. 2017), a
feature attribution method designed for differentiable models based on an extension of Shapley
values to infinite player games (Aumann-Shapley values). Integrated gradients values are a bit
different from SHAP values, and require a single reference value to integrate from. As an adaptation
to make them approximate SHAP values, expected gradients reformulates the integral as an expectation
and combines that expectation with sampling reference values from the background dataset. This leads
to a single combined expectation of gradients that converges to attributions that sum to the
difference between the expected model output and the current output.
"""
    def __init__(self, model, data, session=None, batch_size=50, local_smoothing=0):
        """ An explainer object for a differentiable model using a given background dataset.

        Note that the complexity of the method scales linearly with the number of background data
        samples. Passing the entire training dataset as `data` will give very accurate expected
        values, but be unreasonably expensive.
        """

        # result assembly at the end of shap_values: collect each ranked output's
        # attributions, then return them in the shape described in the Returns section
        output_phis.append(phis[0] if not self.multi_input else phis)
        if not self.multi_output:
            return output_phis[0]
        elif ranked_outputs is not None:
            return output_phis, model_output_ranks
        else:
            return output_phis
    def run(self, out, model_inputs, X):
        # map each model input tensor to its corresponding data
        feed_dict = dict(zip(model_inputs, X))
        # make sure any keras dropout/batch-norm layers run in test mode
        if self.keras_phase_placeholder is not None:
            feed_dict[self.keras_phase_placeholder] = 0
        return self.session.run(out, feed_dict)
class _PyTorchGradientExplainer(Explainer):

    def __init__(self, model, data, batch_size=50, local_smoothing=0):

        # try and import pytorch
        global torch
        if torch is None:
            import torch
            if LooseVersion(torch.__version__) < LooseVersion("0.4"):
                warnings.warn("Your PyTorch version is older than 0.4 and not supported.")

        # check if we have multiple inputs, normalizing a single input to a one-element list
        self.multi_input = isinstance(data, list)
        if not self.multi_input:
            data = [data]
import numpy as np
import scipy as sp
import warnings
from tqdm.autonotebook import tqdm
from .explainer import Explainer
class LinearExplainer(Explainer):
""" Computes SHAP values for a linear model, optionally accounting for inter-feature correlations.
This computes the SHAP values for a linear model and can account for the correlations among
the input features. Assuming features are independent leads to interventional SHAP values which
for a linear model are coef[i] * (x[i] - X.mean(0)[i]) for the ith feature. If instead we account
for correlations then we prevent any problems arising from collinearity and share credit among
correlated features. Accounting for correlations can be computationally challenging, but
LinearExplainer uses sampling to estimate a transform that can then be applied to explain
any prediction of the model.
Parameters
----------
model : (coef, intercept) or sklearn.linear_model.*
User supplied linear model either as a parameter pair or a sklearn object.
data : (mean, cov), numpy.array, pandas.DataFrame, iml.DenseData or scipy.csr_matrix
    The background dataset to use for integrating out features.
"""
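A short check of the interventional formula quoted above, coef[i] * (x[i] - X.mean(0)[i]); the toy data is an illustrative assumption.

import numpy as np
import shap
from sklearn.linear_model import LinearRegression

X = np.random.RandomState(0).randn(1000, 3)
y = X @ np.array([2.0, -1.0, 0.5])
model = LinearRegression().fit(X, y)

explainer = shap.LinearExplainer(model, X)   # features treated as independent by default
shap_values = explainer.shap_values(X[:1])
manual = model.coef_ * (X[0] - X.mean(0))    # coef[i] * (x[i] - X.mean(0)[i])
assert np.allclose(shap_values[0], manual, atol=1e-6)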
from ..explainer import Explainer
import numpy as np
class CoefficentExplainer(Explainer):
""" Simply returns the model coefficents as the feature attributions.
This is only for benchmark comparisons and does not approximate SHAP values in a
meaningful way.
"""
    def __init__(self, model):
        assert hasattr(model, "coef_"), "The passed model does not have a coef_ attribute!"
        self.model = model

    def attributions(self, X):
        # every sample gets the same attribution vector: the model's coefficients
        return np.tile(self.model.coef_, (X.shape[0], 1))
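A small usage sketch of the class above with a scikit-learn model (illustrative data):

import numpy as np
from sklearn.linear_model import LinearRegression

X = np.random.RandomState(0).randn(100, 3)
y = X @ np.array([1.0, 2.0, 3.0])
model = LinearRegression().fit(X, y)

explainer = CoefficentExplainer(model)
attr = explainer.attributions(X[:5])   # shape (5, 3): identical rows of coefficients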
from .deep_pytorch import PyTorchDeepExplainer
from .deep_tf import TFDeepExplainer
from shap.explainers.explainer import Explainer
class DeepExplainer(Explainer):
""" Meant to approximate SHAP values for deep learning models.
This is an enhanced version of the DeepLIFT algorithm (Deep SHAP) where, similar to Kernel SHAP, we
approximate the conditional expectations of SHAP values using a selection of background samples.
Lundberg and Lee, NIPS 2017 showed that the per node attribution rules in DeepLIFT (Shrikumar,
Greenside, and Kundaje, arXiv 2017) can be chosen to approximate Shapley values. By integrating
over many background samples DeepExplainer estimates approximate SHAP values such that they sum
up to the difference between the expected model output on the passed background samples and the
current model output (f(x) - E[f(x)]).
"""
    def __init__(self, model, data, session=None, learning_phase_flags=None):
        """ An explainer object for a differentiable model using a given background dataset.

        Note that the complexity of the method scales linearly with the number of background data
        samples. Passing the entire training dataset as `data` will give very accurate expected
        values, but be unreasonably expensive.
        """
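A hedged sketch of the additivity property stated above (attributions summing to f(x) - E[f(x)]); the tiny Keras model and random data are illustrative assumptions.

import numpy as np
import shap
import tensorflow as tf

# a tiny single-output model on random data, purely for illustration
X = np.random.RandomState(0).randn(300, 10).astype(np.float32)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation="relu", input_shape=(10,)),
    tf.keras.layers.Dense(1),
])

explainer = shap.DeepExplainer(model, X[:100])
shap_values = explainer.shap_values(X[:8])

# per-sample attribution sums should approximately equal f(x) - E[f(x)]
sums = np.array(shap_values).reshape(8, -1).sum(axis=1) + np.ravel(explainer.expected_value)[0]
outputs = model.predict(X[:8]).ravel()   # compare against `sums`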
import numpy as np
import multiprocessing
import sys
from .explainer import Explainer
try:
    import xgboost
except ImportError:
    pass
except:
    print("xgboost is installed...but failed to load!")
    pass
class MimicExplainer(Explainer):
"""Fits a mimic model to the original model and then explains predictions using the mimic model.
Tree SHAP allows for very fast SHAP value explanations of flexible gradient boosted decision
tree (GBDT) models. Since GBDT models are so flexible we can train them to mimic any black-box
model and then use Tree SHAP to explain the mimic. This won't work well for images, but for
any type of problem that GBDTs do reasonably well on, they should also be able to learn how to
explain black-box models on the data. This mimic explainer also allows you to use a linear model,
but keep in mind that will not do as well at explaining typical non-linear black-box models. In
the future we could include other mimic model types given enough demand/help. Finally, we would
like to note that this explainer is vaguely inspired by https://arxiv.org/abs/1802.07814, where
they learn an explainer that can be applied to any input.
Parameters
----------
model : function or iml.Model
"""
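A hedged sketch of the mimic recipe described above: fit a GBDT on the black-box model's predictions, then explain the mimic with Tree SHAP. The black-box function and data are illustrative assumptions, and this shows the general idea rather than MimicExplainer's exact implementation.

import numpy as np
import shap
import xgboost

X = np.random.RandomState(0).randn(2000, 5)               # assumed data
blackbox_predict = lambda X: np.tanh(X[:, 0]) * X[:, 1]   # assumed black-box model

# fit a GBDT mimic on the black-box outputs, then explain it quickly with Tree SHAP
mimic = xgboost.XGBRegressor(n_estimators=200, max_depth=4).fit(X, blackbox_predict(X))
shap_values = shap.TreeExplainer(mimic).shap_values(X[:10])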