# ------------------------------------
# Author: Andreas Alfons
# Erasmus University Rotterdam
# ------------------------------------
## Load packages
# robustHD provides robust screening (corHuber, partialOrder) and
# robust regularized regression (sparseLTS, rlars); pense provides
# penalized elastic net S-estimators.
library("robustHD")
library("pense")
## Prepare the data
# Use the NCI-60 cancer cell panel shipped with robustHD; loading it
# makes the matrices 'protein' and 'gene' available in the workspace.
data("nci60")
# Open the help page describing how the NCI-60 data is stored
# (help lookup only displays in an interactive session).
?nci60
# Response variable: the protein expression with the largest
# (i.e. most variable, robustly measured) MAD.
mad_per_protein <- apply(protein, 2, mad)
y <- protein[, which.max(mad_per_protein)]
# Screen the 100 predictors most correlated with the response,
# using the robust correlation estimator corHuber().
?corHuber
robust_cor <- apply(gene, 2, corHuber, y)
# partialOrder() finds the top 100 positions without having to
# sort all >22000 gene expressions completely.
top_genes <- partialOrder(abs(robust_cor), 100, decreasing = TRUE)
X <- gene[, top_genes]
# Even after screening, the data set is still high-dimensional.
dim(X)
## Sparse least trimmed squares
# Define a grid for the penalty parameter lambda: all values need
# to be >0 in the high-dimensional setting (lambda = 0 would leave
# the trimmed least-squares problem unidentifiable when p > n)
lambda <- seq(0.01, 0.5, length.out = 10)
# We can also *estimate* the smallest lambda that will set all
# coefficients to 0
lambda0(X, y)
# Function sparseLTS() selects the optimal lambda and then fits
# the final model with that optimal lambda; here model selection
# uses the BIC, and the seed makes the subsampling reproducible
?sparseLTS
fit_BIC <- sparseLTS(X, y, lambda = lambda, mode = "fraction",
crit = "BIC", seed = 20210507)
# Check the BIC for the different values of the penalty parameter
# lambda with critPlot():
critPlot(fit_BIC)
# Here, we see a limitation of using the BIC in the high-dimensional
# setting: For very small lambda, we get very close to an exact fit,
# causing this sharp decrease in BIC for lambda = 0.01. Hence the
# BIC is unstable in high dimensions for small values of lambda!
# Since the BIC is unreliable here, select lambda by estimated
# prediction error (crit = "PE") from a simple 5-fold CV instead.
cv_folds <- foldControl(K = 5, R = 1)
fit_sLTS <- sparseLTS(X, y, lambda = lambda, mode = "fraction",
                      crit = "PE", splits = cv_folds,
                      seed = 20210507)
# Slower to compute than the BIC, but the selection of the penalty
# parameter is stable across the lambda grid:
critPlot(fit_sLTS)
# Inspect the final model fit:
fit_sLTS
# 17 genes are selected into the model -- far easier for a medical
# researcher to interpret than the 100 initially screened genes
# (let alone the >22000 original genes).
# Always inspect a regression diagnostic plot of the final fit:
diagnosticPlot(fit_sLTS, which = "rdiag")
# Quite a few cancer cells are flagged as outlying!
# If computation time is an issue, parallel computing can speed
# things up -- try that as an exercise!
## Robust least angle regression
# The lasso and least angle regression are closely related in their
# classical (non-robust) forms, but that relationship does NOT carry
# over to their respective robustifications!
# rlars() selects the optimal submodel along the sequence and then
# fits the final model; use 5-fold CV again for comparability.
?rlars
fit_rlars <- rlars(X, y, sMax = 30, crit = "PE",
                   splits = foldControl(K = 5, R = 1),
                   seed = 20210507)
# The call above gives warnings; fix that by supplying control
# arguments to lmrob(), which is used to estimate the submodels
# along the sequence.
lmrob_args <- list(maxit.scale = 500,
                   cov = ".vcov.w")
fit_rlars <- rlars(X, y, sMax = 30, crit = "PE",
                   splits = foldControl(K = 5, R = 1),
                   regArgs = lmrob_args,
                   seed = 20210507)
# Estimated prediction errors for the different submodels along
# the sequence:
critPlot(fit_rlars)
# Inspect the final model fit:
fit_rlars
# Even fewer genes are selected here, namely 12.
# Again, we should look at a regression diagnostic plot of the
# final model fit:
diagnosticPlot(fit_rlars, which = "rdiag")
## Penalized elastic net S-estimator
# Setting alpha = 1 reduces the elastic net penalty to the lasso,
# which keeps the comparison with the methods above fair.
# Cross-validation is performed up front with pense_cv().
?pense_cv
# pense_cv() uses R's RNG directly, hence set.seed() here instead
# of a seed argument.
set.seed(20210507)
fit_pense <- pense_cv(X, y, alpha = 1, nlambda = 10,
                      cv_k = 5, cv_repl = 1,
                      fit_all = "min")
# Estimated prediction errors for the different submodels along
# the lambda sequence:
plot(fit_pense, what = "cv")
# Inspect the final model fit:
fit_pense
# 16 genes are selected, so overall the number of selected genes
# is quite similar across the different methods.