# ------------------------------------
# Author: Andreas Alfons
#         Erasmus University Rotterdam
# ------------------------------------

## Load packages
library("robustHD")
library("pense")

## Prepare the data
# We use the NCI-60 cancer cell panel, which ships with robustHD and
# provides the objects 'protein' and 'gene' after loading
data("nci60")
# See the help page for how the NCI-60 cancer cell data is stored
?nci60

# Define the response variable: the protein expression with the
# largest MAD (a robust measure of variability)
MADs <- apply(protein, 2, mad)
y <- protein[, which.max(MADs)]

# Screen the d=100 most correlated predictor variables: use robust
# correlations as implemented in corHuber()
?corHuber
correlations <- apply(gene, 2, corHuber, y)
# Function partialOrder() avoids sorting all >22000 gene expressions
# when only the top 100 are needed
keep <- partialOrder(abs(correlations), 100, decreasing = TRUE)
X <- gene[, keep]
# We still have a high-dimensional data set (n << d)
dim(X)

## Sparse least trimmed squares
# Define a grid for the penalty parameter lambda: all values need
# to be >0 in the high-dimensional setting
lambda <- seq(0.01, 0.5, length.out = 10)
# We can also *estimate* the smallest lambda that will set all
# coefficients to 0 (printed for reference, not stored)
lambda0(X, y)

# Function sparseLTS() selects the optimal lambda and then fits
# the final model with that optimal lambda; the seed makes the
# subsampling reproducible
?sparseLTS
fit_BIC <- sparseLTS(X, y, lambda = lambda, mode = "fraction", crit = "BIC", seed = 20210507)
# Check the BIC for the different values of the penalty parameter
# lambda with critPlot():
critPlot(fit_BIC)
# Here, we see a limitation of using the BIC in the high-dimensional
# setting: For very small lambda, we get very close to an exact fit,
# causing this sharp decrease in BIC for lambda = 0.01. Hence the
# BIC is unstable in high dimensions for small values of lambda!
# Let's use cross-validation instead: here we use a simple 5-fold CV
# (crit = "PE" selects lambda by estimated prediction error)
fit_sLTS <- sparseLTS(X, y, lambda = lambda, mode = "fraction", crit = "PE", splits = foldControl(K = 5, R = 1), seed = 20210507)
# It takes a lot longer to compute, but the selection of the penalty
# parameter is stable:
critPlot(fit_sLTS)
# Let's see what the final model fit looks like:
fit_sLTS
# We have selected 17 genes into the model, which is a whole lot
# easier to interpret for a medical researcher than the 100
# initially screened genes (let alone the >22000 original genes).

# We should always look at a regression diagnostic plot of the
# final model fit:
diagnosticPlot(fit_sLTS, which = "rdiag")
# There are quite a few outlying cancer cells!

# If computation time is an issue, we can use parallel computing
# to speed things up. Try that as an exercise!

## Robust least angle regression
# There is a close relationship for the non-robust versions of
# lasso and least angle regression, but the same does not hold
# for their different robustifications!

# Function rlars() selects the optimal submodel along the sequence
# and then fits the final model. Let's use 5-fold cross-validation
# again for comparability with sparseLTS() above.
?rlars
fit_rlars <- rlars(X, y, sMax = 30, crit = "PE", splits = foldControl(K = 5, R = 1), seed = 20210507)
# This gave warnings, so let's fix that by supplying control
# arguments to lmrob(), which is used to estimate the submodels
# along the sequence.
fit_rlars <- rlars(X, y, sMax = 30, crit = "PE", splits = foldControl(K = 5, R = 1), regArgs = list(maxit.scale = 500, cov = ".vcov.w"), seed = 20210507)
# Let's look at the estimated prediction errors for the
# different submodels along the sequence.
critPlot(fit_rlars)
# Let's see what the final model fit looks like:
fit_rlars
# We have even fewer genes that are selected, namely 12.
# Of course, we should again look at a regression diagnostic
# plot of the final model fit:
diagnosticPlot(fit_rlars, which = "rdiag")

## Penalized elastic net S-estimator
# Let's set alpha = 1 so that we get the lasso penalty for
# compatibility with the methods above. We first need to use
# function pense_cv() to perform cross-validation.
?pense_cv
# pense_cv() has no seed argument, so set the RNG seed beforehand
# for reproducible CV folds
set.seed(20210507)
fit_pense <- pense_cv(X, y, alpha = 1, nlambda = 10, cv_k = 5, cv_repl = 1, fit_all = "min")
# Let's look at the estimated prediction errors for the
# different submodels along the sequence.
plot(fit_pense, what = "cv")
# Let's see what the final model fit looks like:
fit_pense
# We have selected 16 genes. So overall the number of genes
# is not too different for the different methods.