We need the following packages:
library(tidyverse)
library(plotROC)
library(pROC)
library(shiny)
library(caret)
library(class)
library(KernSmooth) #package for locpoly
We consider the following data set:
n <- 1000
seuil <- 0.25 # threshold ("seuil" in French); the cut-points below are written explicitly
set.seed(1234)
X1 <- runif(n)
set.seed(5678)
X2 <- runif(n)
set.seed(9012)
R1 <- X1<=0.25
R2 <- (X1>0.25 & X2>=0.75)
R3 <- (X1>0.25 & X2<0.75)
Y <- rep(0,n)
Y[R1] <- rbinom(sum(R1),1,0.25)
Y[R2] <- rbinom(sum(R2),1,0.25)
Y[R3] <- rbinom(sum(R3),1,0.75)
my_data <- data.frame(X1,X2,Y)
my_data$Y <- as.factor(my_data$Y)
The problem is to explain \(Y\) by \(X_1\) and \(X_2\).
We can observe that \(X\) has a uniform distribution on the square \([0,1]^2\) and that \(Y|X=x\) has a Bernoulli distribution with parameter
\[p(x)=\left\{ \begin{array}{ll} 0.25 & \text{if }x_1\leq 0.25\\ 0.25 & \text{if }x_1>0.25\text{ and }x_2\geq 0.75\\ 0.75 & \text{if }x_1>0.25\text{ and }x_2<0.75. \end{array}\right.\]
We deduce that the Bayes rule is given by
\[g^\star(x)=\left\{ \begin{array}{ll} 0 & \text{if }x_1\leq 0.25 \text{ or if }(x_1>0.25\text{ and }x_2\geq 0.75)\\ 1 & \text{otherwise.} \end{array}\right.\]
Since \(\min(p(x),1-p(x))=0.25\) for every \(x\), the Bayes error equals \(L^\star=0.25\).
ggplot(my_data)+aes(x=X1,y=X2,color=Y)+geom_point()+theme_classic()
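As a quick sanity check (a sketch, not required by the exercise), we can apply the Bayes rule to the simulated sample: its empirical error should be close to \(L^\star=0.25\).
g_star <- ifelse(my_data$X1<=0.25 | (my_data$X1>0.25 & my_data$X2>=0.75),0,1)
mean(g_star!=my_data$Y) # the factor Y is compared through its labels "0"/"1"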
We now consider three elementary classification rules:
g1 <- as.numeric(my_data$X1>my_data$X2)
g2 <- as.numeric(my_data$X2<0.5)
g3 <- as.numeric(my_data$X1>0.25)
prev <- data.frame(g1=g1,g2=g2,g3=g3,Y=my_data$Y)
For each rule \(g_j\) we compute the empirical error \[\frac{1}{n}\sum_{i=1}^n1_{g_j(X_i)\neq Y_i}.\]
prev %>% summarise_at(1:3,~mean(Y!=.))
We now consider the 1-nearest-neighbor (1NN) rule \[\widehat g(x)=Y(x),\] where \(Y(x)\) stands for the label of the nearest neighbor of \(x\) among \(\{X_1,\dots,X_n\}\). We split the data into a training set (750 observations) and a test set (250 observations):
set.seed(1234)
perm <- sample(nrow(my_data))
train <- my_data %>% slice(perm[1:750])
test <- my_data %>% slice(perm[751:1000])
We denote by \(\widehat g_1\) the 1NN rule defined on the training set. We estimate its error probability by \[\frac{1}{n_{test}}\sum_{i\in test}1_{\widehat g_1(X_i)\neq Y_i}.\]
pred_1NN <- knn(train[,1:2],test[,1:2],train$Y,k=1)
df <- data.frame(obs=test$Y,pred=pred_1NN)
df %>% summarize(Err=mean(obs!=pred))
This estimated error can be compared with the Bayes error \(L^\star=0.25\).
We consider three scores \(S_1,S_2\) and \(S_3\) for 100 individuals. We also have at hand the label of each individual in the vector \(Y\). Scores and labels are gathered in the following data frame:
set.seed(1234)
S1 <- runif(100)
S2 <- runif(100)
S3 <- S1
S3[sample(100,25)] <- runif(25)
Y <- rep(0,100)
Y[S1>0.5] <- 1
df <- data.frame(S1,S2,S3,Y=as.factor(Y))
df1 <- df %>% gather(key="Score",value="value",-Y)
ggplot(df1)+aes(x=value,y=Score,color=Y)+geom_point()+theme_classic()
plot(roc(df$Y,df$S1))
plot(roc(df$Y,df$S2),add=TRUE,col="red")
plot(roc(df$Y,df$S3),add=TRUE,col="blue")
auc(df$Y,df$S1)
## Area under the curve: 1
auc(df$Y,df$S2)
## Area under the curve: 0.5519
auc(df$Y,df$S3)
## Area under the curve: 0.9111
These values are expected: by construction \(Y=1\) exactly when \(S_1>0.5\), so \(S_1\) separates the two classes perfectly (AUC = 1); \(S_2\) is independent of \(Y\) (AUC close to 0.5); \(S_3\) coincides with \(S_1\) except for 25 resampled individuals (AUC close to, but below, 1).
df1 <- df %>% gather(key="Score",value="value",-Y)
df1$Y <- df1$Y %>% as.numeric()-1
ggplot(df1)+aes(d=Y,m=value,color=Score)+geom_roc()+theme_classic()
df1 %>% group_by(Score) %>% summarize(AUC=auc(Y,value)) %>% arrange(desc(AUC))
We consider the same dataset as in exercise 1. The goal is to compute the ROC curve for two logistic models.
logit1 <- glm(Y~.,data=train,family=binomial)
We consider the score function \(S_1(x)=\beta_0+\beta_1x_1+\beta_2x_2\). Compute the score of each individual in the test dataset (use predict).
prev1 <- predict(logit1,newdata=test,type="response")
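Since the logistic link is strictly increasing, the linear scores obtained with type="link" rank the observations exactly as the predicted probabilities do; a quick check (a sketch):
score1 <- predict(logit1,newdata=test,type="link") # linear predictor beta0+beta1*x1+beta2*x2
auc(test$Y,prev1)
auc(test$Y,score1) # same AUC: the ROC curve is invariant under increasing transformations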
logit2 <- glm(Y~X1+X2+I(X1^2)+I(X2^2),data=train,family=binomial)
prev2 <- predict(logit2,newdata=test,type="response")
df <- data.frame(S1=prev1,S2=prev2,obs=as.numeric(test$Y)-1)
With the pROC package:
plot(roc(df$obs,df$S1))
plot(roc(df$obs,df$S2),add=TRUE,col="red")
With geom_roc (from the plotROC package):
ggplot(df)+aes(d=obs,m=S1)+geom_roc()+theme_classic()
We have to use the gather function to visualize the two ROC curves on the same graph.
df1 <- df %>% gather(key="Score",value="value",-obs)
ggplot(df1)+aes(d=obs,m=value,color=Score)+geom_roc()+theme_classic()
We compute the AUC for the two scores:
df1 %>% group_by(Score) %>% summarize(AUC=auc(obs,value)) %>%
arrange(desc(AUC))
We consider the following sample \((X_1,Y_1),\dots,(X_n,Y_n)\) generated according to the model
\[Y_i=\sin(X_i)+\varepsilon_i,\quad i=1,\dots,n\] where \(X_i\sim\mathcal U_{[-2\pi,2\pi]}\) and \(\varepsilon_i\sim\mathcal N(0,0.2^2)\)
n <- 500
set.seed(1234)
X <- runif(n,-2*pi,2*pi)
set.seed(5678)
eps <- rnorm(n,0,0.2)
Y <- sin(X)+eps
df <- data.frame(X=X,Y=Y)
x <- seq(-2*pi,2*pi,by=0.01)
df1 <- data.frame(x=x,y=sin(x))
g1 <- ggplot(df)+aes(x=X,y=Y)+geom_point(size=1)+
geom_line(data=df1,aes(x=x,y=y),color="black",size=1)+theme_classic()
g1
The following commands allow us to draw the kernel estimate with bandwidth \(h_1=0.5\):
set.seed(1234)
perm <- sample(500,300)
train <- df %>% slice(perm)
test <- df %>% slice(-perm)
h1 <- 0.5
fx1 <- locpoly(train$X,train$Y,bandwidth=h1)
df2 <- data.frame(x=fx1$x,y=fx1$y)
g1+geom_line(data=df2,aes(x=x,y=y),color="red",size=1)
We now compare an oversmoothing bandwidth (\(h_2=3\)) and an undersmoothing one (\(h_3=0.01\)):
h2 <- 3;h3 <- 0.01
fx2 <- locpoly(train$X,train$Y,bandwidth=h2)
fx3 <- locpoly(train$X,train$Y,bandwidth=h3)
df3 <- data.frame(x=fx1$x,H1=fx1$y,H2=fx2$y,H3=fx3$y) %>%
gather(key="Band",value="value",-x)
g1+geom_line(data=df3,aes(x=x,y=value,color=Band),size=1)
To predict at the test points we use ksmooth (beware that its default kernel is the box kernel and that its bandwidth is scaled differently from locpoly's):
test.ord <- test %>% arrange(X)
prev1 <- ksmooth(train$X,train$Y,bandwidth=h1,x.points=test.ord$X)
prev2 <- ksmooth(train$X,train$Y,bandwidth=h2,x.points=test.ord$X)
prev3 <- ksmooth(train$X,train$Y,bandwidth=h3,x.points=test.ord$X)
df5 <- data.frame(H1=prev1$y,H2=prev2$y,H3=prev3$y,obs=test.ord$Y) %>%
summarise_all(funs(mean((.-obs)^2,na.rm = TRUE))) %>% select(-obs)
#or (better, funs() being deprecated in recent dplyr)
data.frame(H1=prev1$y,H2=prev2$y,H3=prev3$y,obs=test.ord$Y) %>%
summarise_at(vars(H1:H3),~mean((.-obs)^2,na.rm = TRUE))
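The bandwidth minimizing the estimated test error can then be read off directly (a sketch, reusing the same computation):
mse <- data.frame(H1=prev1$y,H2=prev2$y,H3=prev3$y,obs=test.ord$Y) %>%
summarise_at(vars(H1:H3),~mean((.-obs)^2,na.rm = TRUE))
names(which.min(unlist(mse))) # bandwidth (H1, H2 or H3) with smallest test MSE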
We again consider the dataset of exercise 1.
dim(my_data)
## [1] 1000 3
set.seed(123)
perm <- sample(nrow(my_data))
train <- my_data %>% slice(perm[1:750])
test <- my_data %>% slice(perm[751:1000])
dim(train)
## [1] 750 3
dim(test)
## [1] 250 3
The goal is to find the best integer \(k\) for the \(k\)-nearest neighbor rule.
knn3 <- knn(train[,1:2],test[,1:2],cl=train$Y,k=3)
mean(knn3!=test$Y)
## [1] 0.4
k.cand <- seq(1,450,by=2)
error <- rep(0,length(k.cand))
set.seed(123)
for (i in 1:length(k.cand)){
prev <- knn(train[,1:2],test[,1:2],cl=train$Y,k=k.cand[i])
error[i] <- mean(prev!=test$Y)
}
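The same computation can be written more compactly with sapply (a sketch, equivalent to the loop above):
set.seed(123)
error <- sapply(k.cand,function(k){
prev <- knn(train[,1:2],test[,1:2],cl=train$Y,k=k)
mean(prev!=test$Y)
})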
kopt <- k.cand[which.min(error)]
kopt
## [1] 31
df <- data.frame(k=k.cand,Error=error)
ggplot(data=df)+aes(x=k,y=Error)+geom_line()+theme_classic()
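The selected rule can then be refitted and evaluated on the test set (a sketch; the value coincides with min(error) up to the random tie-breaking in knn):
prev.opt <- knn(train[,1:2],test[,1:2],cl=train$Y,k=kopt)
mean(prev.opt!=test$Y)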
Run the shiny web application in the file overfitting_app.R. Explain the results.
We propose to use the caret package to select the best \(k\). See http://topepo.github.io/caret/index.html for a presentation of the package. Explain the outputs of the following commands.
ctrl1 <- trainControl(method="LGOCV",number=1)
grid.k <- data.frame(k=seq(1,450,by=2))
sel.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl1,tuneGrid=grid.k)
sel.k
## k-Nearest Neighbors
##
## 1000 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (1 reps, 75%)
## Summary of sample sizes: 751
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.6104418 0.2206447
## 3 0.6425703 0.2875422
## 5 0.6706827 0.3415672
## 7 0.6666667 0.3298421
## 9 0.6746988 0.3451836
## 11 0.6706827 0.3391378
## 13 0.6907631 0.3798156
## 15 0.6907631 0.3790524
## 17 0.6947791 0.3874935
## 19 0.7068273 0.4105840
## 21 0.7028112 0.4036121
## 23 0.6947791 0.3874935
## 25 0.7028112 0.4036121
## 27 0.7068273 0.4105840
## 29 0.7068273 0.4098568
## 31 0.7068273 0.4098568
## 33 0.7028112 0.4021415
## 35 0.6987952 0.3936885
## 37 0.6987952 0.3936885
## 39 0.6987952 0.3936885
## 41 0.7028112 0.4014034
## 43 0.7028112 0.4014034
## 45 0.7028112 0.4014034
## 47 0.6987952 0.3944356
## 49 0.6987952 0.3944356
## 51 0.6987952 0.3936885
## 53 0.7028112 0.4021415
## 55 0.6947791 0.3859831
## 57 0.6987952 0.3936885
## 59 0.6907631 0.3767513
## 61 0.6947791 0.3852251
## 63 0.6907631 0.3775202
## 65 0.6907631 0.3775202
## 67 0.6867470 0.3690468
## 69 0.6867470 0.3690468
## 71 0.6827309 0.3605630
## 73 0.6907631 0.3775202
## 75 0.6867470 0.3690468
## 77 0.6827309 0.3605630
## 79 0.6827309 0.3605630
## 81 0.6867470 0.3690468
## 83 0.6867470 0.3690468
## 85 0.6907631 0.3775202
## 87 0.6827309 0.3605630
## 89 0.6867470 0.3682670
## 91 0.6907631 0.3775202
## 93 0.6907631 0.3767513
## 95 0.6947791 0.3852251
## 97 0.6907631 0.3767513
## 99 0.6987952 0.3929396
## 101 0.6987952 0.3929396
## 103 0.6987952 0.3929396
## 105 0.6947791 0.3852251
## 107 0.6947791 0.3844653
## 109 0.7028112 0.4006635
## 111 0.7028112 0.4014034
## 113 0.6987952 0.3929396
## 115 0.7028112 0.4006635
## 117 0.7028112 0.4006635
## 119 0.7108434 0.4168618
## 121 0.7108434 0.4168618
## 123 0.7108434 0.4168618
## 125 0.7108434 0.4168618
## 127 0.7108434 0.4168618
## 129 0.7068273 0.4091278
## 131 0.7028112 0.4006635
## 133 0.7108434 0.4161402
## 135 0.7108434 0.4161402
## 137 0.7108434 0.4161402
## 139 0.7068273 0.4076645
## 141 0.7108434 0.4161402
## 143 0.6987952 0.3914361
## 145 0.7108434 0.4161402
## 147 0.7148594 0.4238929
## 149 0.7068273 0.4069301
## 151 0.7108434 0.4146915
## 153 0.7148594 0.4231786
## 155 0.7108434 0.4161402
## 157 0.7148594 0.4231786
## 159 0.7108434 0.4146915
## 161 0.7148594 0.4224625
## 163 0.7068273 0.4054558
## 165 0.7068273 0.4054558
## 167 0.7028112 0.3976857
## 169 0.6947791 0.3814069
## 171 0.6947791 0.3814069
## 173 0.6947791 0.3814069
## 175 0.6827309 0.3557884
## 177 0.6827309 0.3549857
## 179 0.6907631 0.3713152
## 181 0.6867470 0.3627535
## 183 0.6867470 0.3611604
## 185 0.6907631 0.3689563
## 187 0.6947791 0.3775410
## 189 0.6867470 0.3603610
## 191 0.6827309 0.3517548
## 193 0.6827309 0.3517548
## 195 0.6907631 0.3681661
## 197 0.6867470 0.3587559
## 199 0.6867470 0.3587559
## 201 0.6867470 0.3587559
## 203 0.6787149 0.3414876
## 205 0.6706827 0.3250248
## 207 0.6706827 0.3250248
## 209 0.6746988 0.3336747
## 211 0.6706827 0.3258716
## 213 0.6787149 0.3423138
## 215 0.6787149 0.3423138
## 217 0.6787149 0.3423138
## 219 0.6787149 0.3414876
## 221 0.6787149 0.3414876
## 223 0.6787149 0.3414876
## 225 0.6787149 0.3406593
## 227 0.6746988 0.3311557
## 229 0.6746988 0.3311557
## 231 0.6666667 0.3146410
## 233 0.6586345 0.2963529
## 235 0.6626506 0.3041916
## 237 0.6546185 0.2876248
## 239 0.6546185 0.2867231
## 241 0.6506024 0.2779722
## 243 0.6506024 0.2788855
## 245 0.6506024 0.2788855
## 247 0.6465863 0.2692102
## 249 0.6506024 0.2779722
## 251 0.6506024 0.2779722
## 253 0.6465863 0.2692102
## 255 0.6425703 0.2604372
## 257 0.6506024 0.2779722
## 259 0.6345382 0.2438178
## 261 0.6265060 0.2262171
## 263 0.6265060 0.2262171
## 265 0.6265060 0.2262171
## 267 0.6265060 0.2252334
## 269 0.6265060 0.2252334
## 271 0.6265060 0.2242471
## 273 0.6305221 0.2330767
## 275 0.6265060 0.2242471
## 277 0.6305221 0.2320998
## 279 0.6305221 0.2320998
## 281 0.6224900 0.2144056
## 283 0.6305221 0.2311204
## 285 0.6265060 0.2222670
## 287 0.6265060 0.2222670
## 289 0.6224900 0.2134023
## 291 0.6224900 0.2134023
## 293 0.6224900 0.2134023
## 295 0.6224900 0.2134023
## 297 0.6224900 0.2134023
## 299 0.6224900 0.2134023
## 301 0.6224900 0.2123965
## 303 0.6224900 0.2113881
## 305 0.6184739 0.2024881
## 307 0.6184739 0.2024881
## 309 0.6144578 0.1946092
## 311 0.6104418 0.1856984
## 313 0.6104418 0.1856984
## 315 0.6144578 0.1935767
## 317 0.6184739 0.2024881
## 319 0.6184739 0.2024881
## 321 0.6144578 0.1935767
## 323 0.6104418 0.1846538
## 325 0.6144578 0.1935767
## 327 0.6144578 0.1935767
## 329 0.6104418 0.1846538
## 331 0.6104418 0.1846538
## 333 0.6104418 0.1846538
## 335 0.6064257 0.1767762
## 337 0.6104418 0.1846538
## 339 0.6104418 0.1846538
## 341 0.6064257 0.1757195
## 343 0.6064257 0.1757195
## 345 0.6064257 0.1757195
## 347 0.6064257 0.1757195
## 349 0.6064257 0.1757195
## 351 0.6064257 0.1757195
## 353 0.6024096 0.1678426
## 355 0.6024096 0.1678426
## 357 0.6024096 0.1678426
## 359 0.6024096 0.1678426
## 361 0.6064257 0.1757195
## 363 0.6064257 0.1757195
## 365 0.6024096 0.1667737
## 367 0.6024096 0.1667737
## 369 0.6024096 0.1667737
## 371 0.6024096 0.1667737
## 373 0.6024096 0.1667737
## 375 0.6024096 0.1667737
## 377 0.5983936 0.1578164
## 379 0.6024096 0.1657021
## 381 0.6024096 0.1657021
## 383 0.6024096 0.1657021
## 385 0.6024096 0.1657021
## 387 0.6024096 0.1657021
## 389 0.6024096 0.1657021
## 391 0.5983936 0.1578164
## 393 0.5983936 0.1578164
## 395 0.6024096 0.1657021
## 397 0.5983936 0.1567326
## 399 0.5983936 0.1567326
## 401 0.6024096 0.1657021
## 403 0.6024096 0.1657021
## 405 0.5983936 0.1567326
## 407 0.5983936 0.1567326
## 409 0.5983936 0.1567326
## 411 0.6024096 0.1657021
## 413 0.6024096 0.1646277
## 415 0.6024096 0.1646277
## 417 0.6024096 0.1646277
## 419 0.6024096 0.1646277
## 421 0.6024096 0.1646277
## 423 0.6024096 0.1646277
## 425 0.5983936 0.1556460
## 427 0.5983936 0.1556460
## 429 0.6024096 0.1635506
## 431 0.6024096 0.1635506
## 433 0.6024096 0.1635506
## 435 0.6024096 0.1635506
## 437 0.6024096 0.1635506
## 439 0.6024096 0.1635506
## 441 0.6024096 0.1635506
## 443 0.6024096 0.1635506
## 445 0.6024096 0.1635506
## 447 0.6024096 0.1635506
## 449 0.6024096 0.1635506
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 161.
plot(sel.k)
We estimate the accuracy \(P(\widehat g_k(X)=Y)\) (column Accuracy) of the \(k\)-NN rules by hold-out validation (75% of the observations in the training set, 25% in the test set).
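The selected value of \(k\) and its hold-out performance can be extracted directly from the fitted object (a sketch):
sel.k$bestTune # k maximizing the hold-out accuracy
getTrainPerf(sel.k) # accuracy and kappa of the selected model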
We can also specify the training and test sets explicitly:
ctrl11 <- trainControl(method="LGOCV",number=1,index=list(perm[1:750]))
sel.k1 <- train(Y~.,data=my_data,method="knn",trControl=ctrl11,tuneGrid=grid.k)
sel.k1
## k-Nearest Neighbors
##
## 1000 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (1 reps, 75%)
## Summary of sample sizes: 750
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.616 0.2326384
## 3 0.600 0.1898898
## 5 0.680 0.3475140
## 7 0.680 0.3508180
## 9 0.688 0.3638262
## 11 0.700 0.3888128
## 13 0.716 0.4194414
## 15 0.720 0.4271311
## 17 0.700 0.3877751
## 19 0.720 0.4290748
## 21 0.720 0.4310053
## 23 0.720 0.4310053
## 25 0.720 0.4310053
## 27 0.712 0.4147483
## 29 0.720 0.4300417
## 31 0.732 0.4540061
## 33 0.724 0.4367531
## 35 0.728 0.4453869
## 37 0.720 0.4300417
## 39 0.720 0.4310053
## 41 0.720 0.4310053
## 43 0.724 0.4386593
## 45 0.720 0.4300417
## 47 0.720 0.4300417
## 49 0.716 0.4223885
## 51 0.716 0.4233643
## 53 0.724 0.4386593
## 55 0.724 0.4386593
## 57 0.728 0.4481953
## 59 0.724 0.4396076
## 61 0.724 0.4396076
## 63 0.724 0.4396076
## 65 0.724 0.4396076
## 67 0.716 0.4233643
## 69 0.720 0.4310053
## 71 0.720 0.4310053
## 73 0.712 0.4137572
## 75 0.716 0.4223885
## 77 0.720 0.4310053
## 79 0.716 0.4223885
## 81 0.716 0.4223885
## 83 0.716 0.4223885
## 85 0.716 0.4223885
## 87 0.720 0.4310053
## 89 0.720 0.4310053
## 91 0.716 0.4223885
## 93 0.720 0.4300417
## 95 0.712 0.4137572
## 97 0.704 0.3984914
## 99 0.704 0.3984914
## 101 0.704 0.3984914
## 103 0.704 0.3984914
## 105 0.716 0.4214095
## 107 0.720 0.4290748
## 109 0.720 0.4290748
## 111 0.720 0.4290748
## 113 0.712 0.4137572
## 115 0.712 0.4127626
## 117 0.720 0.4300417
## 119 0.708 0.4071210
## 121 0.708 0.4071210
## 123 0.708 0.4061178
## 125 0.700 0.3908778
## 127 0.704 0.3984914
## 129 0.708 0.4061178
## 131 0.712 0.4137572
## 133 0.704 0.3984914
## 135 0.704 0.3984914
## 137 0.704 0.3984914
## 139 0.708 0.4061178
## 141 0.700 0.3888128
## 143 0.700 0.3877751
## 145 0.708 0.4041011
## 147 0.712 0.4127626
## 149 0.712 0.4127626
## 151 0.716 0.4223885
## 153 0.708 0.4051112
## 155 0.704 0.3964505
## 157 0.700 0.3877751
## 159 0.692 0.3714491
## 161 0.688 0.3627451
## 163 0.688 0.3616603
## 165 0.688 0.3627451
## 167 0.684 0.3540263
## 169 0.684 0.3540263
## 171 0.680 0.3452927
## 173 0.684 0.3540263
## 175 0.684 0.3540263
## 177 0.676 0.3365441
## 179 0.680 0.3452927
## 181 0.684 0.3529258
## 183 0.680 0.3441763
## 185 0.672 0.3266325
## 187 0.680 0.3441763
## 189 0.676 0.3342758
## 191 0.664 0.3090287
## 193 0.656 0.2925770
## 195 0.672 0.3243243
## 197 0.660 0.2990038
## 199 0.660 0.2990038
## 201 0.660 0.2990038
## 203 0.664 0.3066561
## 205 0.656 0.2901479
## 207 0.652 0.2812768
## 209 0.652 0.2800397
## 211 0.652 0.2800397
## 213 0.656 0.2889271
## 215 0.648 0.2711370
## 217 0.652 0.2787983
## 219 0.648 0.2711370
## 221 0.652 0.2775526
## 223 0.652 0.2787983
## 225 0.652 0.2775526
## 227 0.648 0.2686170
## 229 0.648 0.2686170
## 231 0.652 0.2763027
## 233 0.656 0.2840016
## 235 0.656 0.2840016
## 237 0.656 0.2840016
## 239 0.652 0.2750483
## 241 0.652 0.2750483
## 243 0.652 0.2750483
## 245 0.656 0.2827595
## 247 0.664 0.2982222
## 249 0.672 0.3137386
## 251 0.668 0.3059736
## 253 0.668 0.3047645
## 255 0.668 0.3035511
## 257 0.668 0.3035511
## 259 0.672 0.3125419
## 261 0.672 0.3125419
## 263 0.660 0.2855222
## 265 0.660 0.2855222
## 267 0.664 0.2933100
## 269 0.660 0.2855222
## 271 0.664 0.2933100
## 273 0.656 0.2752157
## 275 0.656 0.2752157
## 277 0.648 0.2570579
## 279 0.636 0.2297014
## 281 0.636 0.2283427
## 283 0.624 0.2021999
## 285 0.620 0.1930003
## 287 0.624 0.2007890
## 289 0.620 0.1915719
## 291 0.620 0.1915719
## 293 0.624 0.2007890
## 295 0.624 0.2007890
## 297 0.624 0.2007890
## 299 0.624 0.2007890
## 301 0.624 0.2007890
## 303 0.620 0.1915719
## 305 0.620 0.1915719
## 307 0.624 0.2007890
## 309 0.624 0.2007890
## 311 0.624 0.2007890
## 313 0.624 0.2007890
## 315 0.624 0.2007890
## 317 0.624 0.2007890
## 319 0.624 0.2007890
## 321 0.624 0.2007890
## 323 0.620 0.1915719
## 325 0.624 0.2007890
## 327 0.620 0.1915719
## 329 0.620 0.1915719
## 331 0.620 0.1915719
## 333 0.620 0.1915719
## 335 0.620 0.1915719
## 337 0.620 0.1915719
## 339 0.616 0.1823385
## 341 0.616 0.1823385
## 343 0.616 0.1823385
## 345 0.616 0.1823385
## 347 0.616 0.1823385
## 349 0.612 0.1730887
## 351 0.612 0.1730887
## 353 0.612 0.1730887
## 355 0.612 0.1730887
## 357 0.612 0.1730887
## 359 0.616 0.1823385
## 361 0.616 0.1823385
## 363 0.616 0.1823385
## 365 0.620 0.1915719
## 367 0.616 0.1823385
## 369 0.612 0.1730887
## 371 0.616 0.1823385
## 373 0.616 0.1823385
## 375 0.612 0.1730887
## 377 0.612 0.1730887
## 379 0.608 0.1638225
## 381 0.612 0.1730887
## 383 0.616 0.1823385
## 385 0.612 0.1730887
## 387 0.612 0.1730887
## 389 0.612 0.1730887
## 391 0.612 0.1730887
## 393 0.612 0.1730887
## 395 0.612 0.1730887
## 397 0.608 0.1638225
## 399 0.604 0.1545399
## 401 0.600 0.1452407
## 403 0.608 0.1638225
## 405 0.604 0.1545399
## 407 0.604 0.1545399
## 409 0.608 0.1638225
## 411 0.608 0.1638225
## 413 0.608 0.1638225
## 415 0.608 0.1638225
## 417 0.600 0.1452407
## 419 0.596 0.1359250
## 421 0.596 0.1374599
## 423 0.596 0.1374599
## 425 0.600 0.1467577
## 427 0.600 0.1467577
## 429 0.596 0.1374599
## 431 0.600 0.1467577
## 433 0.600 0.1467577
## 435 0.604 0.1545399
## 437 0.600 0.1467577
## 439 0.600 0.1467577
## 441 0.604 0.1545399
## 443 0.604 0.1545399
## 445 0.608 0.1638225
## 447 0.600 0.1467577
## 449 0.600 0.1467577
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 31.
We obtain the same value \(k=31\) as in question 2.
ctrl2 <- trainControl(method="LGOCV",number=1,p=1/2)
sel2.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl2,tuneGrid=grid.k)
#or
ctrl3 <- trainControl(method="LGOCV",number=1,index=list(1:500))
sel3.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl3,tuneGrid=grid.k)
plot(sel3.k)
We just have to change the method argument in trainControl:
ctrl4 <- trainControl(method="cv",number=10)
sel4.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl4,tuneGrid=grid.k)
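As before, the cross-validated accuracy curve and the selected \(k\) can be inspected (a sketch):
plot(sel4.k) # CV accuracy as a function of k
sel4.k$bestTune # k selected by 10-fold cross-validation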
Remark: We can use the doMC package to parallelize cross-validation:
library(doMC)
detectCores()
## [1] 8
ctrl5 <- trainControl(method="cv",number=50)
registerDoMC(cores = 1)
system.time(sel5.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl5,tuneGrid=grid.k))
## user system elapsed
## 77.371 0.223 77.753
registerDoMC(cores = 5)
system.time(sel5.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl5,tuneGrid=grid.k))
## user system elapsed
## 115.254 1.457 26.707
Calculations are parallelized over 5 cores. This clearly reduces the elapsed time (provided the machine has several cores).
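Note that doMC relies on forking and is therefore not available on Windows; a portable alternative (a sketch of the same idea) is the doParallel package:
library(doParallel)
cl <- makeCluster(5) # 5 worker processes
registerDoParallel(cl)
system.time(sel5.k <- train(Y~.,data=my_data,method="knn",trControl=ctrl5,tuneGrid=grid.k))
stopCluster(cl)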
data1 <- my_data
names(data1)[3] <- c("Class")
levels(data1$Class) <- c("G0","G1")
ctrl11 <- trainControl(method="LGOCV",number=1,index=list(1:750),classProbs=TRUE,summary=twoClassSummary,p=0.66)
aa <- train(Class~.,data=data1,method="knn",trControl=ctrl11,metric="ROC",tuneGrid=grid.k)
aa
## k-Nearest Neighbors
##
## 1000 samples
## 2 predictor
## 2 classes: 'G0', 'G1'
##
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (1 reps, 66%)
## Summary of sample sizes: 750
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.5908043 0.5409836 0.6406250
## 3 0.6620453 0.5983607 0.6562500
## 5 0.7029329 0.6229508 0.7031250
## 7 0.7084721 0.6475410 0.7578125
## 9 0.6990907 0.6311475 0.7578125
## 11 0.6978099 0.6557377 0.7500000
## 13 0.7022605 0.6475410 0.7656250
## 15 0.7257300 0.6475410 0.7578125
## 17 0.7295082 0.6557377 0.7578125
## 19 0.7283555 0.6885246 0.7421875
## 21 0.7144275 0.6639344 0.7578125
## 23 0.7163806 0.6803279 0.7500000
## 25 0.7209273 0.6639344 0.7656250
## 27 0.7230405 0.6639344 0.7500000
## 29 0.7196785 0.6557377 0.7656250
## 31 0.7170210 0.6557377 0.7578125
## 33 0.7157723 0.6721311 0.7500000
## 35 0.7166368 0.6639344 0.7578125
## 37 0.7229124 0.6639344 0.7656250
## 39 0.7278432 0.6639344 0.7421875
## 41 0.7267546 0.6721311 0.7500000
## 43 0.7305008 0.6721311 0.7578125
## 45 0.7281634 0.6721311 0.7656250
## 47 0.7323899 0.6803279 0.7578125
## 49 0.7317495 0.6803279 0.7500000
## 51 0.7272989 0.6803279 0.7656250
## 53 0.7280353 0.6803279 0.7734375
## 55 0.7289959 0.6803279 0.7734375
## 57 0.7311091 0.6803279 0.7734375
## 59 0.7339267 0.6803279 0.7812500
## 61 0.7341509 0.6721311 0.7812500
## 63 0.7340228 0.6803279 0.7812500
## 65 0.7377049 0.6803279 0.7812500
## 67 0.7386335 0.6721311 0.7734375
## 69 0.7400102 0.6721311 0.7734375
## 71 0.7388256 0.6639344 0.7734375
## 73 0.7387935 0.6639344 0.7734375
## 75 0.7405546 0.6721311 0.7890625
## 77 0.7417392 0.6721311 0.7890625
## 79 0.7384093 0.6557377 0.7812500
## 81 0.7377690 0.6639344 0.7734375
## 83 0.7383133 0.6639344 0.7812500
## 85 0.7395940 0.6639344 0.7812500
## 87 0.7402984 0.6639344 0.7812500
## 89 0.7423156 0.6475410 0.7968750
## 91 0.7410028 0.6639344 0.7968750
## 93 0.7438204 0.6639344 0.8046875
## 95 0.7467661 0.6639344 0.8046875
## 97 0.7464780 0.6639344 0.8046875
## 99 0.7461258 0.6475410 0.8046875
## 101 0.7437564 0.6557377 0.8046875
## 103 0.7454854 0.6475410 0.8046875
## 105 0.7451332 0.6475410 0.8046875
## 107 0.7461578 0.6475410 0.8046875
## 109 0.7450692 0.6557377 0.7968750
## 111 0.7487193 0.6475410 0.7968750
## 113 0.7498079 0.6475410 0.7968750
## 115 0.7464460 0.6475410 0.7890625
## 117 0.7481429 0.6475410 0.7890625
## 119 0.7468942 0.6475410 0.7890625
## 121 0.7452293 0.6475410 0.7812500
## 123 0.7406506 0.6393443 0.7890625
## 125 0.7390497 0.6393443 0.7968750
## 127 0.7359439 0.6475410 0.7968750
## 129 0.7351114 0.6475410 0.7968750
## 131 0.7344070 0.6475410 0.7968750
## 133 0.7358478 0.6393443 0.7968750
## 135 0.7346952 0.6475410 0.7812500
## 137 0.7375768 0.6311475 0.7812500
## 139 0.7364562 0.6311475 0.7812500
## 141 0.7379290 0.6311475 0.7968750
## 143 0.7366803 0.6229508 0.7890625
## 145 0.7356878 0.6229508 0.7890625
## 147 0.7331263 0.6311475 0.7812500
## 149 0.7346311 0.6147541 0.7812500
## 151 0.7350794 0.6147541 0.7890625
## 153 0.7355277 0.6065574 0.7890625
## 155 0.7369685 0.6065574 0.7968750
## 157 0.7372246 0.6147541 0.7812500
## 159 0.7367764 0.6229508 0.7968750
## 161 0.7358478 0.6229508 0.7968750
## 163 0.7356237 0.6147541 0.8125000
## 165 0.7338307 0.6147541 0.7968750
## 167 0.7352395 0.5819672 0.8203125
## 169 0.7354316 0.5819672 0.8203125
## 171 0.7338627 0.5655738 0.8125000
## 173 0.7339588 0.5655738 0.8203125
## 175 0.7334785 0.5655738 0.8203125
## 177 0.7308210 0.5409836 0.8125000
## 179 0.7293481 0.5327869 0.8203125
## 181 0.7282275 0.5245902 0.8125000
## 183 0.7266265 0.5327869 0.8203125
## 185 0.7282595 0.5245902 0.8125000
## 187 0.7265625 0.5245902 0.8203125
## 189 0.7230405 0.5163934 0.8203125
## 191 0.7220159 0.5163934 0.8203125
## 193 0.7199347 0.5081967 0.8203125
## 195 0.7188140 0.5000000 0.8046875
## 197 0.7191022 0.4836066 0.8125000
## 199 0.7198386 0.4754098 0.8125000
## 201 0.7209593 0.4836066 0.8125000
## 203 0.7206071 0.4836066 0.8125000
## 205 0.7212474 0.4754098 0.8203125
## 207 0.7213435 0.4754098 0.8125000
## 209 0.7186219 0.4754098 0.8125000
## 211 0.7183338 0.4672131 0.8046875
## 213 0.7179816 0.4426230 0.8046875
## 215 0.7156122 0.4344262 0.8046875
## 217 0.7160284 0.4344262 0.8046875
## 219 0.7161885 0.4180328 0.8046875
## 221 0.7160284 0.4098361 0.8125000
## 223 0.7150999 0.4180328 0.8125000
## 225 0.7143955 0.4016393 0.8125000
## 227 0.7139793 0.4016393 0.8125000
## 229 0.7134029 0.4016393 0.8125000
## 231 0.7153881 0.3934426 0.8125000
## 233 0.7144915 0.3852459 0.8125000
## 235 0.7151639 0.3852459 0.8125000
## 237 0.7137871 0.3934426 0.8046875
## 239 0.7143315 0.4016393 0.8046875
## 241 0.7128586 0.3852459 0.8046875
## 243 0.7126665 0.3934426 0.8046875
## 245 0.7133709 0.4016393 0.8125000
## 247 0.7123143 0.4098361 0.8125000
## 249 0.7115459 0.4016393 0.8125000
## 251 0.7125064 0.3852459 0.8203125
## 253 0.7123143 0.3852459 0.8281250
## 255 0.7132748 0.3852459 0.8281250
## 257 0.7117700 0.3770492 0.8281250
## 259 0.7119301 0.3524590 0.8281250
## 261 0.7110015 0.3606557 0.8281250
## 263 0.7107454 0.3524590 0.8281250
## 265 0.7082159 0.3524590 0.8359375
## 267 0.7093366 0.3524590 0.8359375
## 269 0.7080238 0.3524590 0.8281250
## 271 0.7094326 0.3524590 0.8281250
## 273 0.7074475 0.3524590 0.8281250
## 275 0.7061027 0.3524590 0.8203125
## 277 0.7056865 0.3442623 0.8203125
## 279 0.7071273 0.3442623 0.8281250
## 281 0.7078317 0.3524590 0.8281250
## 283 0.7082480 0.3442623 0.8281250
## 285 0.7086002 0.3442623 0.8281250
## 287 0.7090804 0.3278689 0.8281250
## 289 0.7076076 0.3278689 0.8359375
## 291 0.7072554 0.3196721 0.8359375
## 293 0.7084401 0.3114754 0.8359375
## 295 0.7065830 0.3114754 0.8359375
## 297 0.7071913 0.3114754 0.8359375
## 299 0.7067111 0.3114754 0.8359375
## 301 0.7070633 0.3114754 0.8359375
## 303 0.7079598 0.3114754 0.8359375
## 305 0.7065190 0.3114754 0.8359375
## 307 0.7068712 0.3114754 0.8359375
## 309 0.7071593 0.3114754 0.8437500
## 311 0.7076716 0.3114754 0.8437500
## 313 0.7076716 0.3114754 0.8437500
## 315 0.7070953 0.3278689 0.8359375
## 317 0.7070953 0.3278689 0.8281250
## 319 0.7065510 0.3278689 0.8359375
## 321 0.7060707 0.3278689 0.8359375
## 323 0.7047259 0.3278689 0.8281250
## 325 0.7040215 0.3278689 0.8281250
## 327 0.7014921 0.3360656 0.8281250
## 329 0.7011078 0.3360656 0.8281250
## 331 0.7010758 0.3278689 0.8281250
## 333 0.7019083 0.3278689 0.8359375
## 335 0.7020044 0.3196721 0.8359375
## 337 0.7017482 0.3196721 0.8359375
## 339 0.7012359 0.3196721 0.8359375
## 341 0.7007236 0.3114754 0.8359375
## 343 0.7021644 0.3032787 0.8359375
## 345 0.7023886 0.3032787 0.8359375
## 347 0.7021965 0.3032787 0.8359375
## 349 0.7026767 0.3032787 0.8359375
## 351 0.7022605 0.3032787 0.8359375
## 353 0.7026447 0.2950820 0.8359375
## 355 0.7034452 0.3032787 0.8359375
## 357 0.7014280 0.3032787 0.8359375
## 359 0.7021644 0.2950820 0.8359375
## 361 0.7007556 0.2950820 0.8437500
## 363 0.7007236 0.2868852 0.8515625
## 365 0.7002433 0.2868852 0.8515625
## 367 0.7007556 0.2868852 0.8437500
## 369 0.7018443 0.2868852 0.8437500
## 371 0.7001473 0.2868852 0.8437500
## 373 0.6984503 0.2868852 0.8437500
## 375 0.6994749 0.2868852 0.8437500
## 377 0.6985464 0.2868852 0.8437500
## 379 0.6971055 0.2868852 0.8359375
## 381 0.6981942 0.2868852 0.8359375
## 383 0.6974257 0.2868852 0.8437500
## 385 0.6968814 0.2868852 0.8437500
## 387 0.6979700 0.2868852 0.8437500
## 389 0.6992188 0.2868852 0.8437500
## 391 0.6995069 0.2868852 0.8515625
## 393 0.6990266 0.2868852 0.8515625
## 395 0.6997631 0.2868852 0.8593750
## 397 0.7005635 0.2868852 0.8593750
## 399 0.6989946 0.2868852 0.8593750
## 401 0.6978420 0.2950820 0.8593750
## 403 0.6979700 0.2868852 0.8671875
## 405 0.6988025 0.2786885 0.8671875
## 407 0.6994109 0.2786885 0.8671875
## 409 0.7004034 0.2622951 0.8671875
## 411 0.7004995 0.2704918 0.8671875
## 413 0.6987705 0.2704918 0.8593750
## 415 0.6980341 0.2704918 0.8671875
## 417 0.6954086 0.2704918 0.8671875
## 419 0.6935515 0.2622951 0.8593750
## 421 0.6954086 0.2704918 0.8593750
## 423 0.6954726 0.2704918 0.8671875
## 425 0.6939997 0.2540984 0.8671875
## 427 0.6927510 0.2459016 0.8671875
## 429 0.6920146 0.2459016 0.8593750
## 431 0.6917905 0.2377049 0.8671875
## 433 0.6917585 0.2377049 0.8671875
## 435 0.6915343 0.2377049 0.8671875
## 437 0.6917905 0.2377049 0.8671875
## 439 0.6926550 0.2377049 0.8671875
## 441 0.6923988 0.2377049 0.8671875
## 443 0.6919826 0.2295082 0.8671875
## 445 0.6925269 0.2377049 0.8671875
## 447 0.6912782 0.2377049 0.8671875
## 449 0.6915343 0.2377049 0.8671875
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 113.
getTrainPerf(aa)
The criterion is now the AUC instead of the error probability; with this change of criterion the selected value is \(k=113\).
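As before, the selected value and the tuning curve can be inspected (a sketch):
aa$bestTune # k = 113, chosen by maximizing the hold-out AUC
plot(aa) # hold-out AUC as a function of k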