Main Functions
lime.data.frame
Inputs
- `x`: The training data used to fit the model to be explained.
- `model`: The model to explain.
- `preprocess = NULL`: Function to transform a character vector to the format expected by the model.
- `bin_continuous = TRUE`: If set to `TRUE`, the continuous variables will be binned when making the explanations. If they are not binned, then perturbations will be obtained either by simulating from a kernel density estimate or from a normal distribution, depending on how `use_density` is set.
- `n_bins = 4`: The number of bins to use if `bin_continuous = TRUE`.
- `quantile_bins = TRUE`: Should quantile bins be used if `bin_continuous = TRUE`? (Otherwise, equally spaced bins will be used.)
- `use_density = TRUE`: This option is only considered if `bin_continuous = FALSE`. In that situation, if `use_density = TRUE`, the continuous data will be sampled using kernel density estimation. Otherwise, the continuous features are assumed to follow a normal distribution, and samples are drawn from a normal distribution with mean and standard deviation set to the sample mean and standard deviation of the feature.
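To make the `use_density` distinction concrete, here is a minimal sketch of the two sampling strategies for a single continuous feature (illustrative only, not lime's internal code; the feature vector and sample sizes are made up):

```r
# Hypothetical continuous feature
set.seed(1)
feature <- rnorm(100, mean = 10, sd = 2)

# use_density = TRUE: sample from a kernel density estimate of the feature
d <- density(feature)
perturb_kde <- sample(d$x, size = 5, replace = TRUE, prob = d$y)

# use_density = FALSE: sample from a normal distribution with the
# feature's sample mean and standard deviation
perturb_norm <- rnorm(5, mean = mean(feature), sd = sd(feature))
```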
Procedure
If `preprocess` is `NULL`, then set it to the identity function. Either way, check to make sure that it is a function.

```r
if (is.null(preprocess)) preprocess <- function(x) x
assert_that(is.function(preprocess))
```
Create an explainer object.

```r
explainer <- c(as.list(environment()), list(...))
explainer$x <- NULL
```
Determine the type of each of the features (e.g. integer, numeric, etc.). If any of the variables are constant, a warning is issued. If a feature type is unknown, an error is thrown.

```r
explainer$feature_type <- setNames(sapply(x, function(f) {
  if (is.integer(f)) {
    if (length(unique(f)) == 1) "constant" else "integer"
  } else if (is.numeric(f)) {
    if (length(unique(f)) == 1) "constant" else "numeric"
  } else if (is.character(f)) {
    "character"
  } else if (is.factor(f)) {
    "factor"
  } else if (is.logical(f)) {
    "logical"
  } else if (inherits(f, "Date") || inherits(f, "POSIXt")) {
    "date_time"
  } else {
    stop("Unknown feature type", call. = FALSE)
  }
}), names(x))

if (any(explainer$feature_type == "constant")) {
  warning("Data contains numeric columns with zero variance", call. = FALSE)
}
```
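As an illustration, applying this logic to a hypothetical toy data frame would classify the columns as follows:

```r
df <- data.frame(a = 1:3,                      # integer
                 b = c(1.5, 2.5, 3.5),         # numeric
                 c = factor(c("x", "y", "x")), # factor
                 d = c(TRUE, FALSE, TRUE),     # logical
                 e = c(5, 5, 5))               # numeric with zero variance
# The resulting feature types would be:
#         a          b          c          d          e
# "integer"  "numeric"   "factor"  "logical" "constant"
# and column e would trigger the zero-variance warning.
```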
Create bins for numeric and integer type variables. If `quantile_bins = TRUE`, then `n_bins` bins are created using quantiles. Otherwise, `n_bins` equally spaced bins are created. If `quantile_bins = TRUE` but fewer than 3 unique cut points remain after removing duplicated quantiles, then a warning is produced ("does not contain enough variance to use quantile binning. Using standard binning instead.") and `n_bins` equally spaced bins are returned.

```r
explainer$bin_cuts <- setNames(lapply(seq_along(x), function(i) {
  if (explainer$feature_type[i] %in% c("numeric", "integer")) {
    if (quantile_bins) {
      bins <- quantile(x[[i]], seq(0, 1, length.out = n_bins + 1), na.rm = TRUE)
      bins <- bins[!duplicated(bins)]
      if (length(bins) < 3) {
        warning(names(x)[i], " does not contain enough variance to use quantile binning. Using standard binning instead.",
                call. = FALSE)
        d_range <- range(x[[i]], na.rm = TRUE)
        bins <- seq(d_range[1], d_range[2], length.out = n_bins + 1)
      }
      bins
    } else {
      d_range <- range(x[[i]], na.rm = TRUE)
      seq(d_range[1], d_range[2], length.out = n_bins + 1)
    }
  }
}), names(x))
```
- Determine the “distribution” for each feature. The method used depends on the type of the variable. Note that in the `switch()` call below, `integer` falls through to the `numeric` case, and `character` and `logical` fall through to the `factor` case (an empty `switch()` alternative matches the next non-empty one; see the small demonstration after the code).
  - Integer and numeric: If `bin_continuous = TRUE`, determine the proportion of observations in each bin. Otherwise, if `use_density = TRUE`, a kernel density approximation is used to estimate the distribution; if not, the mean and standard deviation of the feature are computed.
  - Character, logical, and factor: Determine the proportion of observations in each category.
  - Date/time: `NA` is stored (the unnamed default of the `switch()`).

```r
explainer$feature_distribution <- setNames(lapply(seq_along(x), function(i) {
  switch(explainer$feature_type[i],
    integer = ,
    numeric = if (bin_continuous) {
      table(cut(x[[i]], unique(explainer$bin_cuts[[i]]), labels = FALSE,
                include.lowest = TRUE)) / nrow(x)
    } else if (use_density) {
      density(x[[i]])
    } else {
      c(mean = mean(x[[i]], na.rm = TRUE), sd = sd(x[[i]], na.rm = TRUE))
    },
    character = ,
    logical = ,
    factor = table(x[[i]]) / nrow(x),
    NA)
}), names(x))
```
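The fall-through behavior can be verified with a small standalone example:

```r
# An empty switch() alternative matches the next non-empty one
pick <- function(type) {
  switch(type,
         integer = ,
         numeric = "treated as a continuous feature",
         character = ,
         logical = ,
         factor = "treated as a categorical feature",
         NA)                       # unnamed default (e.g. for "date_time")
}
pick("integer")    # "treated as a continuous feature"
pick("logical")    # "treated as a categorical feature"
pick("date_time")  # NA
```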
Assign the class attributes to the explainer object, and return it.

```r
structure(explainer, class = c("data_frame_explainer", "explainer", "list"))
```
Function
```r
# Print the function (and check to see if there have been any changes)
func_check("lime.data.frame", "0.5.0")
## A single object matching 'lime.data.frame' was found
## It was found in the following places
## registered S3 method for lime from namespace lime
## namespace:lime
## with value
##
## function (x, model, preprocess = NULL, bin_continuous = TRUE,
## n_bins = 4, quantile_bins = TRUE, use_density = TRUE, ...)
## {
## if (is.null(preprocess))
## preprocess <- function(x) x
## assert_that(is.function(preprocess))
## explainer <- c(as.list(environment()), list(...))
## explainer$x <- NULL
## explainer$feature_type <- setNames(sapply(x, function(f) {
## if (is.integer(f)) {
## if (length(unique(f)) == 1)
## "constant"
## else "integer"
## }
## else if (is.numeric(f)) {
## if (length(unique(f)) == 1)
## "constant"
## else "numeric"
## }
## else if (is.character(f)) {
## "character"
## }
## else if (is.factor(f)) {
## "factor"
## }
## else if (is.logical(f)) {
## "logical"
## }
## else if (inherits(f, "Date") || inherits(f, "POSIXt")) {
## "date_time"
## }
## else {
## stop("Unknown feature type", call. = FALSE)
## }
## }), names(x))
## if (any(explainer$feature_type == "constant")) {
## warning("Data contains numeric columns with zero variance",
## call. = FALSE)
## }
## explainer$bin_cuts <- setNames(lapply(seq_along(x), function(i) {
## if (explainer$feature_type[i] %in% c("numeric", "integer")) {
## if (quantile_bins) {
## bins <- quantile(x[[i]], seq(0, 1, length.out = n_bins +
## 1), na.rm = TRUE)
## bins <- bins[!duplicated(bins)]
## if (length(bins) < 3) {
## warning(names(x)[i], " does not contain enough variance to use quantile binning. Using standard binning instead.",
## call. = FALSE)
## d_range <- range(x[[i]], na.rm = TRUE)
## bins <- seq(d_range[1], d_range[2], length.out = n_bins +
## 1)
## }
## bins
## }
## else {
## d_range <- range(x[[i]], na.rm = TRUE)
## seq(d_range[1], d_range[2], length.out = n_bins +
## 1)
## }
## }
## }), names(x))
## explainer$feature_distribution <- setNames(lapply(seq_along(x),
## function(i) {
## switch(explainer$feature_type[i], integer = , numeric = if (bin_continuous) {
## table(cut(x[[i]], unique(explainer$bin_cuts[[i]]),
## labels = FALSE, include.lowest = TRUE))/nrow(x)
## } else if (use_density) {
## density(x[[i]])
## } else {
## c(mean = mean(x[[i]], na.rm = TRUE), sd = sd(x[[i]],
## na.rm = TRUE))
## }, character = , logical = , factor = table(x[[i]])/nrow(x),
## NA)
## }), names(x))
## structure(explainer, class = c("data_frame_explainer", "explainer",
## "list"))
## }
## <bytecode: 0x7f863cfa6ab0>
## <environment: namespace:lime>
```
explain.data.frame
Inputs
- `x`: The new observations to explain (with the same format as used when creating the explainer).
- `explainer`: The object output from the `lime` function.
- `labels = NULL`: The specific labels (classes) to explain in case the model is a classifier. For classifiers, either this or `n_labels` must be given.
- `n_labels = NULL`: The number of labels to explain. If this is given for classifiers, the top `n_labels` classes will be explained.
- `n_features`: The number of features to use for each explanation (i.e. the number of features to select during feature selection).
- `n_permutations = 5000`: The number of perturbations generated for each observation to be explained.
- `feature_select = 'auto'`: The feature selection method for choosing the number of features specified:
  - `auto`: If `n_features` \(\le 6\), `forward_selection` is used. Otherwise, `highest_weights` is used.
  - `none`: Use all features for the explanation. Not advised unless you have very few features.
  - `forward_selection`: Add one feature at a time until `n_features` is reached, based on the quality of a ridge regression model.
  - `highest_weights`: The `n_features` with the highest absolute weights in a ridge regression fit of the complex model outcome are chosen.
  - `lasso_path`: Fit a lasso model and choose the `n_features` whose lars paths are the last to converge to zero.
  - `tree`: Fit a tree to select `n_features` (which needs to be a power of 2). This requires a recent version of XGBoost.
- `dist_fun = 'gower'`: The distance function to use for calculating the distance from the observation to the permutations. If `dist_fun = 'gower'` (the default), `gower::gower_dist()` will be used. Otherwise, the value is forwarded to `stats::dist()`.
- `kernel_width = NULL`: The width of the exponential kernel used to convert the distance to a similarity when `dist_fun != 'gower'`.
- `gower_pow = 1`: A modifier for the Gower distance. The calculated distance will be raised to the power of this value.
Procedure
Check to make sure that `explainer` is an object output from the `lime` function.

```r
assertthat::assert_that(is.data_frame_explainer(explainer))
```
Determine the type of the model contained within the explainer object (using the `model_type` function), and determine the type of output that should be produced by `explain` based on the model type (using the `output_type` function).

```r
m_type <- model_type(explainer)
o_type <- output_type(explainer)
```
If the model type is regression, check that both `labels` and `n_labels` are `NULL`; if not, issue a warning saying that `labels` and `n_labels` are ignored when explaining regression models. In either case, `n_labels` is then set to 1 and `labels` to `NULL`.

```r
if (m_type == "regression") {
  if (!is.null(labels) || !is.null(n_labels)) {
    warning("\"labels\" and \"n_labels\" arguments are ignored when explaining regression models")
  }
  n_labels <- 1
  labels <- NULL
}
```
Check to make sure that exactly one of `labels` and `n_labels` is specified and the other is `NULL`. If not, an error is thrown saying that you must choose between one or the other.

```r
assert_that(is.null(labels) + is.null(n_labels) == 1,
            msg = "You need to choose between labels and n_labels parameters.")
```
Return an error if `n_features` or `n_permutations` is not a count (a single positive integer).

```r
assert_that(is.count(n_features))
assert_that(is.count(n_permutations))
```
If the kernel width has not been specified, then compute it as 0.75 times the square root of the number of columns of `x`. Then create a kernel function, using `exp_kernel`, that will be used to weight the perturbations based on the specified kernel width. The formula used by `exp_kernel` is \[f(x, w)=\sqrt{\exp\left(\frac{-x^2}{w^2}\right)}\] where \(w\) is the kernel width.

```r
if (is.null(kernel_width)) {
  kernel_width <- sqrt(ncol(x)) * 0.75
}
kernel <- exp_kernel(kernel_width)
```
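A minimal sketch of a kernel of this form (the function name `make_kernel` is illustrative; lime builds an equivalent closure with `exp_kernel`):

```r
make_kernel <- function(width) {
  function(d) sqrt(exp(-(d^2) / (width^2)))
}
k <- make_kernel(sqrt(9) * 0.75)  # e.g. a data frame with 9 columns
k(0)     # 1: a zero distance maps to full weight
k(2.25)  # smaller weight as the distance grows
```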
Make use of the `permute_cases` method for data frames in the lime package. This is the function that creates the perturbations: `n_permutations` perturbed cases are created for each observation in `x`.

```r
case_perm <- permute_cases(x, n_permutations, explainer$feature_distribution,
                           explainer$bin_continuous, explainer$bin_cuts,
                           explainer$use_density)
```
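For a single numeric feature with `bin_continuous = TRUE`, the idea behind the perturbations can be sketched as follows (hypothetical bin cuts and proportions; not lime's exact code): a bin is drawn according to the training bin proportions, and a value is then drawn uniformly within that bin.

```r
bin_cuts  <- c(0, 1, 2, 3, 4)       # cut points learned from the training data
bin_probs <- c(0.1, 0.4, 0.3, 0.2)  # proportion of training data in each bin
bin <- sample(seq_along(bin_probs), size = 5000, replace = TRUE, prob = bin_probs)
perturbed <- runif(5000, min = bin_cuts[bin], max = bin_cuts[bin + 1])
```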
Obtain predictions for all of the permuted data points from the original model using the `predict_model` function, then use the `set_labels` function to attach the class labels to the predictions.

```r
case_res <- predict_model(explainer$model, explainer$preprocess(case_perm),
                          type = o_type, ...)
case_res <- set_labels(case_res, explainer$model)
```
Create indices for each of the perturbations and divide them into groups based on which observation to be explained they correspond to.

```r
case_ind <- split(seq_len(nrow(case_perm)), rep(seq_len(nrow(x)), each = n_permutations))
```
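A small standalone example of this indexing scheme, with 2 observations and 3 permutations each:

```r
split(seq_len(6), rep(seq_len(2), each = 3))
## $`1`
## [1] 1 2 3
##
## $`2`
## [1] 4 5 6
```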
Fill in the results object, looping over the observations to be explained.

```r
res <- lapply(seq_along(case_ind), function(ind) {
```
Create a vector of indices for one of the observations.

```r
i <- case_ind[[ind]]
```
If the distance function is specified to be “gower”, then compute the distance between the case of interest and each observation in the permuted dataset. These distances are raised to the power of `gower_pow` and then subtracted from one, so that permutations closer to the case of interest receive a higher “weight”.

```r
if (dist_fun == "gower") {
  sim <- 1 - (gower_dist(case_perm[i[1], , drop = FALSE],
                         case_perm[i, , drop = FALSE]))^gower_pow
}
```
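A toy example of the Gower similarity computation (assumes the gower package; data made up for illustration):

```r
library(gower)
df <- data.frame(a = c(1, 1, 10), b = factor(c("x", "x", "y")))
# Distance from the first row to every row (first element is 0 by construction)
d <- gower_dist(df[1, , drop = FALSE], df)
d        # 0 0 1
1 - d^1  # similarities with gower_pow = 1: identical rows get weight 1
```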
Turn the factor and character variables into numeric variables and, if `bin_continuous = TRUE`, encode each continuous value by whether the permutation falls into the same bin as the case being explained. This uses the `numerify` function.

```r
perms <- numerify(case_perm[i, ], explainer$feature_type,
                  explainer$bin_continuous, explainer$bin_cuts)
```
If the distance function is not set to “gower”, then the distance between the case of interest and the permutations is computed with `stats::dist()` and converted to a similarity using the kernel function. This also makes use of the `feature_scale` function.

```r
if (dist_fun != "gower") {
  sim <- kernel(c(0, dist(feature_scale(perms, explainer$feature_distribution,
                                        explainer$feature_type,
                                        explainer$bin_continuous),
                          method = dist_fun)[seq_len(n_permutations - 1)]))
}
```
Perform feature selection and fit a regression model to the permutations, using the selected variables, via the `model_permutations` function.

```r
res <- model_permutations(as.matrix(perms), case_res[i, , drop = FALSE], sim,
                          labels, n_labels, n_features, feature_select)
```
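The core idea of this step can be sketched with a weighted linear fit on made-up values (lime's `model_permutations` applies the chosen feature selection method first and fits a ridge-penalized model; everything below is illustrative):

```r
toy <- data.frame(f1 = c(1, 0, 1, 0),
                  f2 = c(1, 1, 0, 0),
                  y  = c(0.9, 0.4, 0.7, 0.1))  # hypothetical model output
toy_sim <- c(1.0, 0.8, 0.75, 0.5)              # similarity weights
fit <- lm(y ~ f1 + f2, data = toy, weights = toy_sim)
coef(fit)  # analogous to the feature weights reported in the explanation
```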
Finish filling in the rest of the results object with information about the case, its predictions, and the feature bins. This makes use of the `describe_feature` function.

```r
res$feature_value <- unlist(case_perm[i[1], res$feature])
res$feature_desc <- describe_feature(res$feature, case_perm[i[1], ],
                                     explainer$feature_type,
                                     explainer$bin_continuous,
                                     explainer$bin_cuts)
guess <- which.max(abs(case_res[i[1], ]))
res$case <- rownames(x)[ind]
res$label_prob <- unname(as.matrix(case_res[i[1], ]))[match(res$label,
                                                            colnames(case_res))]
res$data <- list(as.list(case_perm[i[1], ]))
res$prediction <- list(as.list(case_res[i[1], ]))
res$model_type <- m_type
res
```
Combine the per-case results, reorder the columns, and, for regression models, drop the classification-specific columns.

```r
res <- do.call(rbind, res)
res <- res[, c('model_type', 'case', 'label', 'label_prob', 'model_r2',
               'model_intercept', 'model_prediction', 'feature',
               'feature_value', 'feature_weight', 'feature_desc',
               'data', 'prediction')]
if (m_type == 'regression') {
  res$label <- NULL
  res$label_prob <- NULL
  res$prediction <- unlist(res$prediction)
}
```
Return the results object as a tibble.

```r
as_tibble(res)
```
Function
```r
# Print the function (and check to see if there have been any changes)
func_check("explain.data.frame", "0.5.0")
## A single object matching 'explain.data.frame' was found
## It was found in the following places
## namespace:lime
## with value
##
## function (x, explainer, labels = NULL, n_labels = NULL, n_features,
## n_permutations = 5000, feature_select = "auto", dist_fun = "gower",
## kernel_width = NULL, gower_pow = 1, ...)
## {
## assert_that(is.data_frame_explainer(explainer))
## m_type <- model_type(explainer)
## o_type <- output_type(explainer)
## if (m_type == "regression") {
## if (!is.null(labels) || !is.null(n_labels)) {
## warning("\"labels\" and \"n_labels\" arguments are ignored when explaining regression models")
## }
## n_labels <- 1
## labels <- NULL
## }
## assert_that(is.null(labels) + is.null(n_labels) == 1, msg = "You need to choose between labels and n_labels parameters.")
## assert_that(is.count(n_features))
## assert_that(is.count(n_permutations))
## if (is.null(kernel_width)) {
## kernel_width <- sqrt(ncol(x)) * 0.75
## }
## kernel <- exp_kernel(kernel_width)
## case_perm <- permute_cases(x, n_permutations, explainer$feature_distribution,
## explainer$bin_continuous, explainer$bin_cuts, explainer$use_density)
## case_res <- predict_model(explainer$model, explainer$preprocess(case_perm),
## type = o_type, ...)
## case_res <- set_labels(case_res, explainer$model)
## case_ind <- split(seq_len(nrow(case_perm)), rep(seq_len(nrow(x)),
## each = n_permutations))
## res <- lapply(seq_along(case_ind), function(ind) {
## i <- case_ind[[ind]]
## if (dist_fun == "gower") {
## sim <- 1 - (gower_dist(case_perm[i[1], , drop = FALSE],
## case_perm[i, , drop = FALSE]))^gower_pow
## }
## perms <- numerify(case_perm[i, ], explainer$feature_type,
## explainer$bin_continuous, explainer$bin_cuts)
## if (dist_fun != "gower") {
## sim <- kernel(c(0, dist(feature_scale(perms, explainer$feature_distribution,
## explainer$feature_type, explainer$bin_continuous),
## method = dist_fun)[seq_len(n_permutations - 1)]))
## }
## res <- model_permutations(as.matrix(perms), case_res[i,
## , drop = FALSE], sim, labels, n_labels, n_features,
## feature_select)
## res$feature_value <- unlist(case_perm[i[1], res$feature])
## res$feature_desc <- describe_feature(res$feature, case_perm[i[1],
## ], explainer$feature_type, explainer$bin_continuous,
## explainer$bin_cuts)
## guess <- which.max(abs(case_res[i[1], ]))
## res$case <- rownames(x)[ind]
## res$label_prob <- unname(as.matrix(case_res[i[1], ]))[match(res$label,
## colnames(case_res))]
## res$data <- list(as.list(case_perm[i[1], ]))
## res$prediction <- list(as.list(case_res[i[1], ]))
## res$model_type <- m_type
## res
## })
## res <- do.call(rbind, res)
## res <- res[, c("model_type", "case", "label", "label_prob",
## "model_r2", "model_intercept", "model_prediction", "feature",
## "feature_value", "feature_weight", "feature_desc", "data",
## "prediction")]
## if (m_type == "regression") {
## res$label <- NULL
## res$label_prob <- NULL
## res$prediction <- unlist(res$prediction)
## }
## as_tibble(res)
## }
## <bytecode: 0x7f861f92fa50>
## <environment: namespace:lime>
```
Implementation
```r
# Attach the packages used below
library(randomForest)
library(dplyr)
library(lime)
library(assertthat)
library(gower)

x <- hamby224_test %>%
  arrange(case) %>%
  select(rf_features) %>%
  na.omit()

explainer <- lime:::lime.data.frame(x = hamby173and252_train %>%
                                      select(rf_features),
                                    y = hamby173and252_train %>%
                                      select(samesource),
                                    model = as_classifier(bulletr::rtrees))
```
```r
labels = TRUE
n_labels = NULL
n_features = 3
n_permutations = 5000
feature_select = 'auto'
dist_fun = "gower"
kernel_width = NULL
gower_pow = 1
```
```r
assertthat::assert_that(lime:::is.data_frame_explainer(explainer))
## [1] TRUE

#m_type <- lime:::model_type(explainer)
o_type <- lime:::output_type(explainer)

assert_that(is.null(labels) + is.null(n_labels) == 1,
            msg = "You need to choose between labels and n_labels parameters.")
## [1] TRUE

assert_that(is.count(n_features))
## [1] TRUE

assert_that(is.count(n_permutations))
## [1] TRUE

if (is.null(kernel_width)) {
  kernel_width <- sqrt(ncol(x)) * 0.75
}
kernel <- lime:::exp_kernel(kernel_width)
```
```r
case_perm <- lime:::permute_cases.data.frame(x,
                                             n_permutations,
                                             explainer$feature_distribution,
                                             explainer$bin_continuous,
                                             explainer$bin_cuts,
                                             explainer$use_density)
head(case_perm)
##          ccf  rough_cor           D        sd_D    matches mismatches
## 1 0.27042215  0.2704221 0.001396367 0.002090586  0.5901742   7.616146
## 2 0.28724882 -0.1611515 1.013499988 3.019210854  2.9464410   9.556052
## 3 0.30790540  0.4078556 2.811572048 2.010342687  1.2441031   9.978211
## 4 0.04436213  0.9178336 2.110390058 6.498977479  1.6576612  25.502945
## 5 0.27896222 -0.1015974 3.398931097 1.996186637  3.0527158   1.367676
## 6 0.25113254 -0.1527809 4.492328696 4.553291926 16.8560805  32.680021
##          cms   non_cms sum_peaks
## 1 0.59017419  3.808073  1.463209
## 2 7.52621104  4.334388 20.818953
## 3 1.32867089 32.760953  3.568589
## 4 0.85718517  5.956968 23.968347
## 5 0.18317852 18.112314 12.565722
## 6 0.07182718  1.122142  4.000728
```
```r
case_res <- lime:::predict_model(explainer$model,
                                 explainer$preprocess(case_perm),
                                 type = o_type)
case_res <- lime:::set_labels(case_res, explainer$model)
head(case_res)
##       FALSE      TRUE
## 1 1.0000000 0.0000000
## 2 0.8066667 0.1933333
## 3 0.6633333 0.3366667
## 4 0.7766667 0.2233333
## 5 0.6833333 0.3166667
## 6 0.4833333 0.5166667
```
```r
case_ind <- split(seq_len(nrow(case_perm)), rep(seq_len(nrow(x)), each = n_permutations))
case_ind[[1]][1:10]
## [1] 1 2 3 4 5 6 7 8 9 10
case_ind[[2]][1:10]
## [1] 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010
case_ind[[3]][1:10]
## [1] 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010
str(seq_along(case_ind))
## int [1:364] 1 2 3 4 5 6 7 8 9 10 ...
i <- case_ind[[1]]
str(i)
## int [1:5000] 1 2 3 4 5 6 7 8 9 10 ...
```
```r
if (dist_fun == "gower") {
  sim <- 1 - (gower_dist(case_perm[i[1], , drop = FALSE],
                         case_perm[i, , drop = FALSE]))^gower_pow
}
str(sim)
## num [1:5000] 1 0.765 0.82 0.653 0.795 ...
```
```r
perms <- lime:::numerify(case_perm[i, ],
                         explainer$feature_type,
                         explainer$bin_continuous,
                         explainer$bin_cuts)
head(perms)
##   ccf rough_cor D sd_D matches mismatches cms non_cms sum_peaks
## 1   1         1 1    1       1          1   1       1         1
## 2   0         0 1    0       0          0   0       1         0
## 3   0         1 0    1       0          0   0       0         0
## 4   0         1 0    0       0          0   0       0         0
## 5   0         0 0    1       0          1   1       0         0
## 6   1         0 0    0       0          0   1       0         0
```
```r
# error?
# res <- lime:::model_permutations(as.matrix(perms),
#                                  case_res[i, , drop = FALSE],
#                                  sim,
#                                  labels,
#                                  n_labels,
#                                  n_features,
#                                  feature_select)
```
Comment on Faithfulness
> …our choice of \(G\) (sparse linear models) means that if the underlying model is highly non-linear even in the locality of the prediction, there may not be a faithful explanation. However, we can estimate the faithfulness of the explanation on \(\mathcal{Z}\), and present this information to the user. This estimate of faithfulness can also be used for selecting an appropriate family of explanations from a set of multiple interpretable model classes, thus adapting to the given dataset and the classifier. We leave such exploration for future work, as linear explanations work quite well for multiple black-box models in our experiments.