We need the following packages:

library(tidyverse)
library(leaps)
library(glmnet)
library(mlbench)
library(doMC)
library(caret)
library(class)
library(reshape2)
library(plotROC)
library(kernlab)
library(bestglm)

Best subset selection

Exercise 1

We consider the Ozone data set from the mlbench package:

data("Ozone")
summary(Ozone)
##        V1            V2      V3           V4              V5      
##  1      : 31   1      : 12   1:52   Min.   : 1.00   Min.   :5320  
##  3      : 31   2      : 12   2:52   1st Qu.: 5.00   1st Qu.:5700  
##  5      : 31   3      : 12   3:52   Median : 9.00   Median :5770  
##  7      : 31   4      : 12   4:53   Mean   :11.53   Mean   :5753  
##  8      : 31   5      : 12   5:53   3rd Qu.:16.00   3rd Qu.:5830  
##  10     : 31   6      : 12   6:52   Max.   :38.00   Max.   :5950  
##  (Other):180   (Other):294   7:52   NA's   :5       NA's   :12    
##        V6               V7              V8              V9       
##  Min.   : 0.000   Min.   :19.00   Min.   :25.00   Min.   :27.68  
##  1st Qu.: 3.000   1st Qu.:49.00   1st Qu.:51.00   1st Qu.:49.73  
##  Median : 5.000   Median :65.00   Median :62.00   Median :57.02  
##  Mean   : 4.869   Mean   :58.48   Mean   :61.91   Mean   :56.85  
##  3rd Qu.: 6.000   3rd Qu.:73.00   3rd Qu.:72.00   3rd Qu.:66.11  
##  Max.   :11.000   Max.   :93.00   Max.   :93.00   Max.   :82.58  
##                   NA's   :15      NA's   :2       NA's   :139    
##       V10            V11             V12             V13       
##  Min.   : 111   Min.   :-69.0   Min.   :27.50   Min.   :  0.0  
##  1st Qu.: 890   1st Qu.:-10.0   1st Qu.:51.26   1st Qu.: 70.0  
##  Median :2125   Median : 24.0   Median :62.24   Median :110.0  
##  Mean   :2591   Mean   : 17.8   Mean   :60.93   Mean   :123.3  
##  3rd Qu.:5000   3rd Qu.: 45.0   3rd Qu.:70.52   3rd Qu.:150.0  
##  Max.   :5000   Max.   :107.0   Max.   :91.76   Max.   :500.0  
##  NA's   :15     NA's   :1       NA's   :14

We first delete the individuals with missing data, as well as the first three variables, which contain the date of the observations:

donnees <- Ozone %>% na.omit()
donnees <- donnees[,-c(1:3)]
summary(donnees)
##        V4              V5             V6               V7       
##  Min.   : 1.00   Min.   :5320   Min.   : 0.000   Min.   :19.00  
##  1st Qu.: 5.00   1st Qu.:5690   1st Qu.: 3.000   1st Qu.:46.00  
##  Median : 9.00   Median :5760   Median : 5.000   Median :64.00  
##  Mean   :11.37   Mean   :5746   Mean   : 4.867   Mean   :57.61  
##  3rd Qu.:16.00   3rd Qu.:5830   3rd Qu.: 6.000   3rd Qu.:73.00  
##  Max.   :38.00   Max.   :5950   Max.   :11.000   Max.   :93.00  
##        V8              V9             V10            V11        
##  Min.   :25.00   Min.   :27.68   Min.   : 111   Min.   :-69.00  
##  1st Qu.:51.50   1st Qu.:49.64   1st Qu.: 869   1st Qu.:-14.00  
##  Median :61.00   Median :56.48   Median :2083   Median : 18.00  
##  Mean   :61.11   Mean   :56.54   Mean   :2602   Mean   : 14.43  
##  3rd Qu.:71.00   3rd Qu.:66.20   3rd Qu.:5000   3rd Qu.: 43.00  
##  Max.   :93.00   Max.   :82.58   Max.   :5000   Max.   :107.00  
##       V12             V13       
##  Min.   :27.50   Min.   :  0.0  
##  1st Qu.:51.26   1st Qu.: 60.0  
##  Median :60.98   Median :100.0  
##  Mean   :60.69   Mean   :122.2  
##  3rd Qu.:70.88   3rd Qu.:150.0  
##  Max.   :90.68   Max.   :350.0

We then split the data into:

  • a training set of size 150;
  • a test set of size 53.
set.seed(1234)
perm <- sample(nrow(donnees))
ind.train <- perm[1:150]
ind.test <- perm[-c(1:150)]
train <- donnees[ind.train,]
test <- donnees[ind.test,]

The problem is to explain the Daily maximum one-hour-average ozone reading (V4) by the other variables. We first consider the linear model

linear.model <- lm(V4~.,data=train)
summary(linear.model)
## 
## Call:
## lm(formula = V4 ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8226 -2.7434 -0.6538  2.9229 14.2906 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 18.7498441 43.3205526   0.433  0.66581   
## V5          -0.0055941  0.0081420  -0.687  0.49318   
## V6           0.1749939  0.1921661   0.911  0.36405   
## V7           0.0867529  0.0270191   3.211  0.00164 **
## V8           0.1466358  0.0769024   1.907  0.05860 . 
## V9           0.4107676  0.1463569   2.807  0.00572 **
## V10         -0.0012717  0.0004349  -2.924  0.00403 **
## V11         -0.0052064  0.0164637  -0.316  0.75230   
## V12         -0.1574278  0.1330782  -1.183  0.23883   
## V13         -0.0051887  0.0059843  -0.867  0.38739   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.307 on 140 degrees of freedom
## Multiple R-squared:  0.7274, Adjusted R-squared:  0.7098 
## F-statistic:  41.5 on 9 and 140 DF,  p-value: < 2.2e-16

It seems that some variables are not necessary in the model.

  1. Explain the output of the following command
mod.sel <- regsubsets(V4~.,data=train)
summary(mod.sel)
## Subset selection object
## Call: regsubsets.formula(V4 ~ ., data = train)
## 9 Variables  (and intercept)
##     Forced in Forced out
## V5      FALSE      FALSE
## V6      FALSE      FALSE
## V7      FALSE      FALSE
## V8      FALSE      FALSE
## V9      FALSE      FALSE
## V10     FALSE      FALSE
## V11     FALSE      FALSE
## V12     FALSE      FALSE
## V13     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          V5  V6  V7  V8  V9  V10 V11 V12 V13
## 1  ( 1 ) " " " " " " "*" " " " " " " " " " "
## 2  ( 1 ) " " " " "*" " " "*" " " " " " " " "
## 3  ( 1 ) " " " " "*" " " "*" "*" " " " " " "
## 4  ( 1 ) " " " " "*" "*" "*" "*" " " " " " "
## 5  ( 1 ) " " " " "*" "*" "*" "*" " " "*" " "
## 6  ( 1 ) " " "*" "*" "*" "*" "*" " " "*" " "
## 7  ( 1 ) " " "*" "*" "*" "*" "*" " " "*" "*"
## 8  ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*"

We obtain the best models with \(1,2,\dots,8\) variables according to the \(R^2\).
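The \(R^2\) values of these best models (one per size) can also be extracted from the summary object, for instance:

round(summary(mod.sel)$rsq,3)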

  2. Select the models that optimize Mallows’s \(C_p\) and the \(BIC\) criteria.
plot(mod.sel,scale="bic")

plot(mod.sel,scale="Cp")

BIC selects \[V_4=\beta_0+\beta_1V_7+\beta_2V_9+\beta_3V_{10}+\varepsilon\] while Mallows’s \(C_p\) selects \[V_4=\beta_0+\beta_1V_5+\beta_2V_7+\beta_3V_8+\beta_4V_9+\beta_5V_{10}+\varepsilon.\] Thus,

mod.bic <- lm(V4~V7+V9+V10,data=train)
mod.cp <- lm(V4~V5+V7+V8+V9+V10,data=train)
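Instead of reading the plots, we can also identify these models programmatically from the summary object (res.sel, id.bic and id.cp are just intermediate names introduced here):

res.sel <- summary(mod.sel)
id.bic <- which.min(res.sel$bic)   # size of the best model for BIC
id.cp <- which.min(res.sel$cp)     # size of the best model for Cp
coef(mod.sel,id.bic)
coef(mod.sel,id.cp)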
  3. We consider the quadratic risk for the three models: \[E[(Y-\widehat m(X))^2].\] This risk is estimated with the test set according to \[\frac{1}{n_{test}}\sum_{i\in test}(Y_i-\widehat m(X_i))^2.\] Compute the estimated risks for the three linear models:
prev <- data.frame(Y=test$V4,lin=predict(linear.model,newdata=test),BIC=predict(mod.bic,newdata=test),Cp=predict(mod.cp,newdata=test))
prev %>% summarize(Err_lin=mean((Y-lin)^2),Err_BIC=mean((Y-BIC)^2),Err_Cp=mean((Y-Cp)^2))

The \(C_p\) model has the smallest estimated quadratic error.

Exercise 2 (variable selection)

We consider the model \[Y=\beta_0+\beta_1X_1+\dots+\beta_pX_p+\varepsilon\] where \(X_1,\dots,X_p\) are independent variables with distribution \(\mathcal N(0,1)\) and \(\varepsilon\) has distribution \(\mathcal N(0,0.5^2)\) (\(X=(X_1,\dots,X_p)\) and \(\varepsilon\) are independent).

We assume that \(p=105\) and that

  • \(\beta_0=0\), \(\beta_1=\dots=\beta_5=1\)
  • \(\beta_6=\dots=\beta_{105}=0\). So here, only variables \(X_1,\dots,X_5\) are relevant to explain \(Y\).
  1. Generate \(n=1000\) observations \((x_1,y_1),\dots,(x_n,y_n)\) according to this model. Put these observations in a data.frame df with dimension \(1000\times 106\).
n <- 1000
p <- 105
set.seed(1234)
X.mat <- matrix(rnorm(n*p),ncol=p)
eps <- rnorm(n,mean = 0,sd=0.5)
df <- data.frame(X.mat,eps)
df <- df %>% mutate(Y=X1+X2+X3+X4+X5+eps) %>% select(-eps)
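As a quick check, df should have 1000 rows and 106 columns (the 105 predictors plus \(Y\)):

dim(df)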
  2. Fit a linear model (lm function) with df and print the estimates of \(\beta_0,\dots,\beta_{105}\).
mod.full <- lm(Y~.,data=df)
summary(mod.full)
## 
## Call:
## lm(formula = Y ~ ., data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.67736 -0.33472  0.00032  0.30319  1.45542 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0130727  0.0166020  -0.787  0.43124    
## X1           0.9846185  0.0165621  59.450  < 2e-16 ***
## X2           0.9962524  0.0166838  59.714  < 2e-16 ***
## X3           1.0185854  0.0162804  62.565  < 2e-16 ***
## X4           1.0069154  0.0164305  61.283  < 2e-16 ***
## X5           1.0075293  0.0171871  58.621  < 2e-16 ***
## X6           0.0162413  0.0167867   0.968  0.33355    
## X7          -0.0380549  0.0165760  -2.296  0.02192 *  
## X8          -0.0138780  0.0163996  -0.846  0.39765    
## X9           0.0078099  0.0168906   0.462  0.64392    
## X10         -0.0169844  0.0172212  -0.986  0.32428    
## X11         -0.0219250  0.0165394  -1.326  0.18530    
## X12          0.0035766  0.0164080   0.218  0.82750    
## X13         -0.0173715  0.0165630  -1.049  0.29455    
## X14          0.0015961  0.0161666   0.099  0.92138    
## X15          0.0030420  0.0165937   0.183  0.85459    
## X16          0.0123968  0.0167407   0.741  0.45918    
## X17         -0.0140972  0.0160033  -0.881  0.37861    
## X18          0.0047780  0.0157696   0.303  0.76197    
## X19          0.0089718  0.0160524   0.559  0.57636    
## X20          0.0120790  0.0166445   0.726  0.46821    
## X21         -0.0052803  0.0168198  -0.314  0.75364    
## X22          0.0055084  0.0169444   0.325  0.74519    
## X23         -0.0119207  0.0165276  -0.721  0.47094    
## X24         -0.0279466  0.0168081  -1.663  0.09673 .  
## X25          0.0067445  0.0161983   0.416  0.67724    
## X26          0.0115625  0.0164784   0.702  0.48306    
## X27         -0.0031926  0.0162123  -0.197  0.84393    
## X28         -0.0158039  0.0167512  -0.943  0.34571    
## X29         -0.0393262  0.0158546  -2.480  0.01331 *  
## X30         -0.0131434  0.0171851  -0.765  0.44458    
## X31         -0.0187328  0.0162188  -1.155  0.24840    
## X32         -0.0442897  0.0164643  -2.690  0.00728 ** 
## X33         -0.0335800  0.0160712  -2.089  0.03695 *  
## X34          0.0087326  0.0158260   0.552  0.58123    
## X35          0.0089432  0.0169050   0.529  0.59692    
## X36          0.0031839  0.0163345   0.195  0.84550    
## X37         -0.0044967  0.0170532  -0.264  0.79208    
## X38         -0.0108821  0.0158859  -0.685  0.49351    
## X39         -0.0135691  0.0166509  -0.815  0.41534    
## X40         -0.0060755  0.0165664  -0.367  0.71390    
## X41         -0.0066924  0.0162678  -0.411  0.68089    
## X42         -0.0117674  0.0155517  -0.757  0.44945    
## X43         -0.0198761  0.0165470  -1.201  0.22999    
## X44          0.0036242  0.0165999   0.218  0.82723    
## X45          0.0105387  0.0161889   0.651  0.51522    
## X46         -0.0404653  0.0165887  -2.439  0.01491 *  
## X47         -0.0300850  0.0168022  -1.791  0.07371 .  
## X48          0.0124983  0.0163363   0.765  0.44444    
## X49          0.0044487  0.0166952   0.266  0.78994    
## X50          0.0240242  0.0164747   1.458  0.14512    
## X51         -0.0048970  0.0163609  -0.299  0.76477    
## X52          0.0307387  0.0166920   1.842  0.06588 .  
## X53         -0.0051005  0.0169236  -0.301  0.76319    
## X54          0.0151990  0.0163935   0.927  0.35411    
## X55         -0.0354083  0.0167166  -2.118  0.03444 *  
## X56         -0.0222455  0.0160800  -1.383  0.16688    
## X57          0.0014254  0.0168241   0.085  0.93250    
## X58          0.0073491  0.0164866   0.446  0.65588    
## X59         -0.0160027  0.0160585  -0.997  0.31926    
## X60          0.0160967  0.0173806   0.926  0.35463    
## X61         -0.0166780  0.0168870  -0.988  0.32360    
## X62          0.0141593  0.0167828   0.844  0.39908    
## X63          0.0002043  0.0164197   0.012  0.99008    
## X64          0.0275355  0.0160570   1.715  0.08672 .  
## X65          0.0048604  0.0163827   0.297  0.76678    
## X66          0.0168259  0.0161561   1.041  0.29795    
## X67         -0.0121348  0.0169109  -0.718  0.47321    
## X68         -0.0003431  0.0161142  -0.021  0.98302    
## X69          0.0419336  0.0160549   2.612  0.00916 ** 
## X70          0.0238172  0.0160990   1.479  0.13938    
## X71          0.0119947  0.0168301   0.713  0.47622    
## X72          0.0030503  0.0168465   0.181  0.85636    
## X73         -0.0104551  0.0160765  -0.650  0.51564    
## X74         -0.0410657  0.0171878  -2.389  0.01709 *  
## X75         -0.0170613  0.0160844  -1.061  0.28910    
## X76          0.0257386  0.0166885   1.542  0.12336    
## X77          0.0101337  0.0168627   0.601  0.54802    
## X78         -0.0168289  0.0163639  -1.028  0.30403    
## X79         -0.0074790  0.0162544  -0.460  0.64554    
## X80          0.0133683  0.0167342   0.799  0.42458    
## X81         -0.0058840  0.0174241  -0.338  0.73568    
## X82          0.0253755  0.0166387   1.525  0.12759    
## X83          0.0288952  0.0164084   1.761  0.07858 .  
## X84         -0.0296538  0.0161642  -1.835  0.06691 .  
## X85          0.0122871  0.0168487   0.729  0.46603    
## X86          0.0169482  0.0161737   1.048  0.29497    
## X87         -0.0026240  0.0166393  -0.158  0.87473    
## X88         -0.0172478  0.0170835  -1.010  0.31295    
## X89         -0.0396334  0.0167748  -2.363  0.01836 *  
## X90         -0.0181130  0.0160121  -1.131  0.25827    
## X91         -0.0075442  0.0163882  -0.460  0.64538    
## X92         -0.0238149  0.0162008  -1.470  0.14192    
## X93          0.0270102  0.0166067   1.626  0.10420    
## X94         -0.0104033  0.0158777  -0.655  0.51250    
## X95         -0.0134528  0.0168760  -0.797  0.42557    
## X96         -0.0209762  0.0164563  -1.275  0.20276    
## X97          0.0199452  0.0160869   1.240  0.21536    
## X98          0.0308105  0.0167190   1.843  0.06568 .  
## X99         -0.0049707  0.0167238  -0.297  0.76636    
## X100         0.0077319  0.0161819   0.478  0.63290    
## X101         0.0116617  0.0166005   0.702  0.48255    
## X102         0.0002618  0.0171176   0.015  0.98780    
## X103         0.0199205  0.0167189   1.191  0.23378    
## X104         0.0143133  0.0165195   0.866  0.38647    
## X105        -0.0166893  0.0169788  -0.983  0.32590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4932 on 894 degrees of freedom
## Multiple R-squared:  0.9592, Adjusted R-squared:  0.9544 
## F-statistic: 199.9 on 105 and 894 DF,  p-value: < 2.2e-16
  3. We propose a variable selection procedure with a backward selection approach using the BIC criterion. You can use nvmax=30 in regsubsets.

To obtain the selected variables, we can look at the following graph:

m.back1 <- regsubsets(Y~.,data=df,method="backward",nvmax=30)
plot(m.back1,scale="bic")

We can also do this automatically with:

a <- summary(m.back1)
number <- order(a$bic)[1]
var.sel <- a$which[number,][-1]
var.sel1 <- names(var.sel)[var.sel] %>% paste(collapse="+")
form.back <- formula(paste("Y~",var.sel1,sep=""))
mod.back <- lm(form.back,data=df)
summary(mod.back)
## 
## Call:
## lm(formula = form.back, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.63923 -0.34301  0.00179  0.32041  1.45661 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.002413   0.015749  -0.153  0.87828    
## X1           0.992339   0.015807  62.777  < 2e-16 ***
## X2           0.991358   0.016097  61.588  < 2e-16 ***
## X3           1.010115   0.015562  64.907  < 2e-16 ***
## X4           1.006043   0.015830  63.552  < 2e-16 ***
## X5           1.008520   0.016242  62.093  < 2e-16 ***
## X29         -0.043358   0.015158  -2.860  0.00432 ** 
## X69          0.042714   0.015292   2.793  0.00532 ** 
## X74         -0.043792   0.016118  -2.717  0.00670 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4969 on 991 degrees of freedom
## Multiple R-squared:  0.954,  Adjusted R-squared:  0.9537 
## F-statistic:  2571 on 8 and 991 DF,  p-value: < 2.2e-16

We have selected a model with 8 variables (the 5 relevant variables and 3 noise variables).

  4. Generate a test dataset (df.test) \((x_{n+1},y_{n+1}),\dots,(x_{n+m},y_{n+m})\) with \(m=500\) according to the same model. Compute the estimated mean square error \[\frac{1}{m}\sum_{i\in test}(\widehat y_i-y_i)^2\] for the two models (full and backward).
m <- 500
p <- 105
set.seed(12345)
X.mat <- matrix(rnorm(m*p),ncol=p)
eps <- rnorm(m,mean = 0,sd=0.5)
df.test <- data.frame(X.mat,eps)
df.test <- df.test %>% mutate(Y=X1+X2+X3+X4+X5+eps) %>% select(-eps)
p.full <- predict(mod.full,newdata=df.test)
p.step <- predict(mod.back,newdata=df.test)
pred.df <- data.frame(full=p.full,step=p.step,obs=df.test$Y)
pred.df %>% summarise_at(1:2,~mean((obs-.)^2))

We observe that the selected model has a smaller MSE than the full model, so this model is better for this criterion. We have seen that this is due to the variance of the least squares estimates: when there are noisy variables in a model, the variance of the least squares estimates increases, and these estimates are thus less accurate. To circumvent this problem, we have to select variables (as we do here with the regsubsets function) or to use regularized methods such as lasso and ridge.
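This can be checked directly on the two fits above: the standard errors of the coefficients of the relevant variables are slightly larger in the full model than in the selected one. For instance, for \(X_1\):

summary(mod.full)$coefficients["X1","Std. Error"]
summary(mod.back)$coefficients["X1","Std. Error"]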

Penalized regression

Exercise 3 (Lasso and ridge for ozone data)

We consider the same problem as in Exercise 1. We propose to fit ridge and lasso estimates and to compare these models with those of Exercise 1.

The glmnet function requires the \(X\) matrix and the \(Y\) vector as inputs (we cannot use a formula). To compute the \(X\) matrix, we use the model.matrix function:

train.X <- model.matrix(V4~.,data=train)
test.X <- model.matrix(V4~.,data=test)
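Note that model.matrix adds an "(Intercept)" column. Since glmnet fits its own intercept, a common variant (not used in the sequel) is to drop this column:

train.X.noint <- model.matrix(V4~.,data=train)[,-1]   # variant: X matrix without the intercept column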
  1. Draw the coefficient paths for ridge and lasso.
mod.R <- glmnet(train.X,train$V4,alpha=0)
mod.L <- glmnet(train.X,train$V4,alpha=1)

plot(mod.R)

plot(mod.L)

plot(mod.R,xvar="lambda")

plot(mod.L,xvar="lambda")

  2. Select the shrinkage parameter for lasso regression with cv.glmnet.
lassoCV <- cv.glmnet(train.X,train$V4,alpha=1)
plot(lassoCV)

lassoCV$lambda.min
## [1] 0.1356943
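cv.glmnet also returns lambda.1se, the largest value of \(\lambda\) whose cross-validation error is within one standard error of the minimum; it can be used as a more parsimonious alternative:

lassoCV$lambda.1se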
  3. Do the same for ridge regression. Explain the problem and solve it.
ridgeCV <- cv.glmnet(train.X,train$V4,alpha=0)
plot(ridgeCV)

ridgeCV$lambda
##   [1] 6153.5728042 5606.9063567 5108.8042497 4654.9521610 4241.4190410
##   [6] 3864.6230635 3521.3006022 3208.4779621 2923.4456231 2663.7347716
##  [11] 2427.0959162 2211.4794046 2015.0176696 1836.0090536 1672.9030696
##  [16] 1524.2869718 1388.8735184 1265.4898231 1153.0671951 1050.6318836
##  [21]  957.2966428  872.2530475  794.7644909  724.1598041  659.8274430
##  [26]  601.2101916  547.8003352  499.1352633  454.7934622  414.3908645
##  [31]  377.5775222  344.0345758  313.4714924  285.6235490  260.2495401
##  [36]  237.1296883  216.0637403  196.8692332  179.3799131  163.4442960
##  [41]  148.9243552  135.6943260  123.6396161  112.6558134  102.6477814
##  [46]   93.5288354   85.2199915   77.6492823   70.7511341   64.4657983
##  [51]   58.7388345   53.5206383   48.7660123   44.4337742   40.4864002
##  [56]   36.8897000   33.6125207   30.6264770   27.9057052   25.4266393
##  [61]   23.1678067   21.1096426   19.2343201   17.5255962   15.9686705
##  [66]   14.5500578   13.2574708   12.0797136   11.0065852   10.0287905
##  [71]    9.1378604    8.3260780    7.5864122    6.9124563    6.2983727
##  [76]    5.7388426    5.2290197    4.7644880    4.3412240    3.9555616
##  [81]    3.6041604    3.2839767    2.9922373    2.7264152    2.4842080
##  [86]    2.2635178    2.0624331    1.8792122    1.7122682    1.5601550
##  [91]    1.4215551    1.2952681    1.1802001    1.0753543    0.9798228
##  [96]    0.8927781    0.8134661    0.7412000    0.6753539    0.6153573
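We can check that the selected \(\lambda\) lies at the boundary of this default grid:

ridgeCV$lambda.min==min(ridgeCV$lambda)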

Here the selected parameter corresponds to the smallest value of the grid. We need to change the grid:

ridgeCV <- cv.glmnet(train.X,train$V4,alpha=0,lambda=exp(seq(-4,4,length=100)))
plot(ridgeCV)

It’s better!

ridgeCV$lambda.min
## [1] 0.9604013
  4. Estimate the quadratic error for the selected ridge and lasso models. Compare this error with the three other models:
prev1 <- prev %>% mutate(ridge=as.vector(predict(ridgeCV,newx=test.X,s="lambda.min")),lasso=as.vector(predict(lassoCV,newx=test.X,s="lambda.min")))
prev1 %>% summarize(Err_lin=mean((Y-lin)^2),Err_BIC=mean((Y-BIC)^2),Err_Cp=mean((Y-Cp)^2),Err_ridge=mean((Y-ridge)^2),Err_lasso=mean((Y-lasso)^2))
prev1 %>% gather(key="Method",value="Pred",-Y) %>% group_by(Method) %>%
  summarize(Err=mean((Pred-Y)^2)) %>% arrange(Err)
  5. Conclude.

Ridge and lasso are not interesting for this dataset. This is not surprising since we are not in a high-dimensional setting (only 9 input variables).

Exercise 4

We consider the spam dataset from the kernlab package:

data(spam)

We split the data into a training dataset of size 3000 and a test dataset of size 1601.

set.seed(1234)
perm <- sample(nrow(spam))
train <- spam[perm[1:3000],]
test <- spam[perm[3001:4601],]
train.X <- model.matrix(type~.,data=train)
test.X <- model.matrix(type~.,data=test)
  1. Fit a logistic model on the train dataset with all the variables.
full.logit <- glm(type~.,data=train,family="binomial")
  2. We propose a variable selection procedure with a backward selection approach using the BIC criterion (this approach will be described in detail in other lectures). You just have to use the step function with the direction="backward" and k=log(nrow(train)) options. We call it mod.back (it may take a few minutes).
mod.back <- step(full.logit,direction="backward",k=log(nrow(train)),trace=0)
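We can count the number of predictors retained by the backward search:

length(coef(mod.back))-1   # intercept excluded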
  3. Fit a logistic lasso model on the training data (select the shrinkage parameter with cv.glmnet).
set.seed(1234)
cv.lasso <- cv.glmnet(train.X,train[,58],family="binomial",alpha=1)
plot(cv.lasso)

  4. Fit a logistic ridge model on the training data (select the shrinkage parameter with cv.glmnet).
set.seed(1234)
cv.ridge <- cv.glmnet(train.X,train[,58],family="binomial",alpha=0)
plot(cv.ridge)

We have to change the grid: default values are too large.

set.seed(1234)
cv.ridge <- cv.glmnet(train.X,train[,58],family="binomial",alpha=0,lambda=exp(seq(-15,-2,length=100)))
plot(cv.ridge)

  5. Fit a nearest neighbor rule on the training data (use caret to select the number of nearest neighbors by 10-fold cross-validation).
set.seed(1234)
ctrl1 <- trainControl(method="cv",number=10)
grid.k <- data.frame(k=seq(1,20,by=1))
registerDoMC(cores = 5)
sel.k <- train(type~.,data=train,method="knn",trControl=ctrl1,tuneGrid=grid.k)
plot(sel.k)

We choose the 1-NN rule.
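The selected value of \(k\) can also be read from the caret object:

sel.k$bestTune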

  6. Compare the 5 methods using the error probability (estimated on the test dataset).
prev.full <- predict(full.logit,newdata=test,type="response") %>% round() %>% as.factor()
prev.full <- fct_recode(prev.full,nonspam="0",spam="1")
prev.back <- predict(mod.back,newdata=test,type="response") %>% round() %>% as.factor()
prev.back <- fct_recode(prev.back,nonspam="0",spam="1")

prev.lasso <- predict(cv.lasso,newx=test.X,type="class",s="lambda.min")
prev.ridge <- predict(cv.ridge,newx=test.X,type="class",s="lambda.min")
prev.knn <- knn(train.X,test.X,train$type,k=1)
prev <- data.frame(full=prev.full,back=prev.back,lasso=as.vector(prev.lasso),ridge=as.vector(prev.ridge),knn=prev.knn,Y=test$type)
prev %>% summarise_at(vars(1:5),~(mean((.!=Y)^2))) 
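Beyond the global error rate, a confusion matrix gives more detail, for instance for the lasso predictions:

table(Predicted=prev$lasso,Observed=prev$Y)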
  7. Compare the 5 methods using ROC curves and AUC (estimated on the test dataset).
prev.full <- predict(full.logit,newdata=test,type="response")
prev.back <- predict(mod.back,newdata=test,type="response")
prev.lasso <- predict(cv.lasso,newx=test.X,type="response") %>% as.vector()
prev.ridge <- predict(cv.ridge,newx=test.X,type="response") %>% as.vector()
prev.knn <- knn(train.X,test.X,train$type,k=1,prob=TRUE)
score.knn <- attributes(prev.knn)$prob
score <- data.frame(full=prev.full,back=prev.back,lasso=prev.lasso,ridge=prev.ridge,knn=score.knn,Y=test$type)
score1 <- score %>% gather(key="Method",value="score",-Y)
score1 %>% group_by(Method) %>% summarize(AUC=pROC::auc(Y,score)) %>% arrange(desc(AUC))
ggplot(score1)+aes(d=Y,m=score,color=Method)+geom_roc()+theme_classic()