# Fit a linear model
<- lm(Volume ~ Girth, data = trees)
lm_model1
# Use the resid_panel function from ggResidpanel to create a panel of diagnostics plots
::resid_panel(lm_model1) ggResidpanel
Software
Below are descriptions of R packages that I’ve developed.
ggResidpanel
ggResidpanel is an R package Katie Rey and I developed for creating panels of diagnostic plots for residuals from a model using ggplot2. It also allows for the creation of interactive versions of the plots using plotly. The package ic available on CRAN. An overview of the package can be found at this website. The source code can be found in the GitHub repository.
Here is an example panel of diagnostic plots created using ggResidpanel.
limeaid
limeaid is an R package for assessing explanations created using the R package lime. The current implementation was developed to be used with classification models with a binary response and continuous features. The package can be installed from GitHub, and additional information on limeaid is available on the GitHub repository.
Example of a feature heatmap depicting LIME explanation across various tuning parameters for a random forest fit to the iris data.
library(limeaid)
library(randomForest)
# Iris training and testing
<- iris[1:5, 1:4]
iris_test <- iris[-(1:5), 1:4]
iris_train <- iris[[5]][-(1:5)]
iris_lab
# Fit a random forest model to the iris training data
set.seed(20200334)
<- randomForest(Species ~ .,
rf data = cbind(iris_train,
Species = iris_lab))
# Run apply_lime on the iris data
<- apply_lime(
lime_applied train = iris_train,
test = iris_test,
model = rf,
label = "virginica",
n_features = 2,
sim_method = c('quantile_bins',
'equal_bins',
'kernel_density'),
nbins = 2:4,
gower_pow = c(1, 5),
return_perms = TRUE,
seed = 20200334
)
# Extract the explanations from the apply_lime output
<- lime_applied$explain
explanations
# Create a heatmap of the features chosen
plot_feature_heatmap(explanations)
redres
redres is an R package developed to help with diagnosing linear mixed models fit using the function lmer from the lme4 package. It is meant to supplement the lme4
package. redres was created by me, Kellie McClernon, Jing Zhao, Yudi Zhang, and Yonghui Huo as a project for STAT 585 (taught by Dr. Heike Hoffman). The package is currently only available on GitHub, but we hope to update it and add it to CRAN. More information can be found on the package website and GitHub repository.
Here is an example using redres to plot the raw conditional residuals from an lmer model.
library(redres)
library(lme4)
# Fits a linear mixed effects model
<- lmer(height ~ rep + treatment*variety + (1|rep:treatment) + (1|rep:treatment:variety),
m data = paprika)
# Use redres to create a residual plot using the conditional residuals
plot_redres(m)
TreeTracer
TreeTracer is an R package for creating trace plots of trees from random forests fit using the randomForest R package. Trace plots are useful tools for visually comparing trees from a random forest. See Urbanek (2008) for additional information about trace plots. The source code can be found in the GitHub repository.
Here is an example panel of diagnostic plots created using ggResidpanel.
# Load the Palmer penguins data
<- na.omit(palmerpenguins::penguins)
penguins
# Select the features for training the model
<-
penguin_features |>
penguins ::select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
dplyr
# Fit a random forest
set.seed(71)
<-
penguin_rf ::randomForest(
randomForest~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g,
species data = penguins,
ntree = 50
)
# Trace plots of trees in the forest
::trace_plot(
TreeTracerrf = penguin_rf,
train = penguin_features,
tree_ids = 1:penguin_rf$ntree,
alpha = 0.4
)
veesa
VEESA is an R package for explainable machine learning with functional data. The package implements the VEESA pipeline for using elastic shape analysis to model functional data and computes feature importance to explain predictions. VEESA can be downloaded from the GitHub repository.
Here is an example applying the VEESA pipeline.
# Simulate data
= veesa::simulate_functions(M = 100, N = 75, seed = 20211130)
sim_data
# Separate data into train/test
set.seed(20211130)
= unique(sim_data$id)
id = length(id) * 0.25
M_test = sample(x = id, size = M_test, replace = F)
id_test = sim_data |> dplyr::mutate(data = ifelse(id %in% id_test, "test", "train"))
sim_data
# Plot simulated functions colored by covariate
library(ggplot2)
= wesanderson::wes_palette("Zissou1", 5, type = "continuous")
color_pal <-
plot_sim |>
sim_data ggplot(aes(
x = t,
y = y,
color = x1,
group = id
+
)) geom_line(alpha = 0.75) +
scale_color_gradientn(colours = color_pal) +
theme_bw() +
labs(title = "Effect of x1")
# Prepare matrices from the data frames
<- function(df, train_test) {
prep_matrix |>
df ::filter(data == train_test) |>
dplyr::select(id, t, y) |>
dplyr::ungroup() |>
dplyr::pivot_wider(id_cols = t,
tidyrnames_from = id,
values_from = y) |>
::select(-t) |>
dplyras.matrix()
}= prep_matrix(df = sim_data, train_test = "train")
sim_train_matrix = prep_matrix(df = sim_data, train_test = "test")
sim_test_matrix
# Create a vector of times
= sim_data$t |> unique()
times
# Prepare train/test data (joint functional PCs)
<-
train_transformed_jfpca ::prep_training_data(
veesaf = sim_train_matrix,
time = times,
fpca_method = "jfpca",
optim_method = "DPo"
)<-
test_transformed_jfpca ::prep_testing_data(
veesaf = sim_test_matrix,
time = times,
train_prep = train_transformed_jfpca,
optim_method = "DPo"
)
# Response variable
<-
x1_train |>
sim_data ::filter(data == "train") |>
dplyr::select(id, x1) |>
dplyr::distinct() |>
dplyr::pull(x1)
dplyr
# Create data frame with PCs and response
<-
rf_jfpca_df $fpca_res$coef |>
train_transformed_jfpcadata.frame() |>
::rename_all(.funs = function(x) stringr::str_replace(x, "X", "pc")) |>
dplyr::mutate(x1 = x1_train) |>
dplyr::select(x1, everything())
dplyr
# Fit random forest
set.seed(20211130)
= randomForest::randomForest(x1 ~ ., data = rf_jfpca_df)
rf_jfpca
# Compute PFI
set.seed(20211130)
= veesa::compute_pfi(x = rf_jfpca_df |> dplyr::select(-x1), y = rf_jfpca_df$x1, f = rf_jfpca, K = 10, metric = "nmse")
pfi_jfpca
# PFI results
<-
plot_pfi data.frame(pfi = pfi_jfpca$pfi) |>
::mutate(pc = 1:dplyr::n()) |>
dplyrggplot(aes(x = pc, y = pfi)) +
geom_point() +
geom_segment(aes(yend = 0, xend = pc)) +
theme_bw() +
labs(title = "PFI from Joint fPCA")
# Identify the top PC
<-
top_pc_jfpca data.frame(pfi = pfi_jfpca$pfi) |>
::mutate(pc = 1:dplyr::n()) |>
dplyr::arrange(desc(pfi)) |>
dplyr::slice(1) |>
dplyr::pull(pc)
dplyr
# Principal directions of top PC for interpretation
<-
plot_pcdirs ::plot_pc_directions(
veesafpcs = top_pc_jfpca,
fdasrvf = train_transformed_jfpca$fpca_res,
fpca_method = "jfpca",
nrow = 2
+
) scale_color_manual(values = sort(color_pal, decreasing = T)) +
labs(title = "Top PC for Joint fPCA")
# Join plots
library(patchwork)
/ (plot_sim + plot_pcdirs) plot_pfi