# ------------------------------------
# Author: Andreas Alfons
#         Erasmus University Rotterdam
# ------------------------------------

## Load packages
library("robustHD")
library("pense")

## Prepare the data
# We use the NCI-60 cancer cell panel, which ships with robustHD and
# provides the objects 'protein' and 'gene' after loading
data("nci60")
# See the help page for how the NCI-60 cancer cell data is stored
?nci60

# Define the response variable: the protein expression with the
# largest MAD (a robust measure of variability)
MADs <- apply(protein, 2, mad)
y <- protein[, which.max(MADs)]

# Screen the d=100 most correlated predictor variables: use robust
# correlations as implemented in corHuber()
?corHuber
correlations <- apply(gene, 2, corHuber, y)
# Function partialOrder() avoids sorting all >22000 gene expressions
# when only the top 100 are needed
keep <- partialOrder(abs(correlations), 100, decreasing = TRUE)
X <- gene[, keep]
# We still have a high-dimensional data set (n << d)
dim(X)

## Sparse least trimmed squares
# Define a grid for the penalty parameter lambda: all values need
# to be >0 in the high-dimensional setting
lambda <- seq(0.01, 0.5, length.out = 10)
# We can also *estimate* the smallest lambda that will set all
# coefficients to 0 (printed for reference, not stored)
lambda0(X, y)

# Function sparseLTS() selects the optimal lambda and then fits
# the final model with that optimal lambda; the seed makes the
# subsampling reproducible
?sparseLTS
fit_BIC <- sparseLTS(X, y, lambda = lambda, mode = "fraction", crit = "BIC", seed = 20210507)
# Check the BIC for the different values of the penalty parameter
# lambda with critPlot():
critPlot(fit_BIC)
# Here, we see a limitation of using the BIC in the high-dimensional
# setting: For very small lambda, we get very close to an exact fit,
# causing this sharp decrease in BIC for lambda = 0.01. Hence the
# BIC is unstable in high dimensions for small values of lambda!
# Let's use cross-validation instead: here we use a simple 5-fold CV
# (crit = "PE" selects lambda by estimated prediction error)
fit_sLTS <- sparseLTS(X, y, lambda = lambda, mode = "fraction", crit = "PE", splits = foldControl(K = 5, R = 1), seed = 20210507)
# It takes a lot longer to compute, but the selection of the penalty
# parameter is stable:
critPlot(fit_sLTS)
# Let's see what the final model fit looks like:
fit_sLTS
# We have selected 17 genes into the model, which is a whole lot
# easier to interpret for a medical researcher than the 100
# initially screened genes (let alone the >22000 original genes).

# We should always look at a regression diagnostic plot of the
# final model fit:
diagnosticPlot(fit_sLTS, which = "rdiag")
# There are quite a few outlying cancer cells!

# If computation time is an issue, we can use parallel computing
# to speed things up. Try that as an exercise!

## Robust least angle regression
# There is a close relationship for the non-robust versions of
# lasso and least angle regression, but the same does not hold
# for their different robustifications!

# Function rlars() selects the optimal submodel along the sequence
# and then fits the final model. Let's use 5-fold cross-validation
# again for comparability with sparseLTS() above.
?rlars
fit_rlars <- rlars(X, y, sMax = 30, crit = "PE", splits = foldControl(K = 5, R = 1), seed = 20210507)
# This gave warnings, so let's fix that by supplying control
# arguments to lmrob(), which is used to estimate the submodels
# along the sequence.
fit_rlars <- rlars(X, y, sMax = 30, crit = "PE", splits = foldControl(K = 5, R = 1), regArgs = list(maxit.scale = 500, cov = ".vcov.w"), seed = 20210507)
# Let's look at the estimated prediction errors for the
# different submodels along the sequence.
critPlot(fit_rlars)
# Let's see what the final model fit looks like:
fit_rlars
# We have even fewer genes that are selected, namely 12.
# Of course, we should again look at a regression diagnostic
# plot of the final model fit:
diagnosticPlot(fit_rlars, which = "rdiag")

## Penalized elastic net S-estimator
# Let's set alpha = 1 so that we get the lasso penalty for
# compatibility with the methods above. We first need to use
# function pense_cv() to perform cross-validation.
?pense_cv
# pense_cv() has no seed argument, so set the RNG seed beforehand
# for reproducible CV folds
set.seed(20210507)
fit_pense <- pense_cv(X, y, alpha = 1, nlambda = 10, cv_k = 5, cv_repl = 1, fit_all = "min")
# Let's look at the estimated prediction errors for the
# different submodels along the sequence.
plot(fit_pense, what = "cv")
# Let's see what the final model fit looks like:
fit_pense
# We have selected 16 genes. So overall the number of genes
# is not too different for the different methods.