library(tidyverse)
We consider the Carseats dataset from the ISLR package.
library(ISLR)
data(Carseats)
The goal is to explain the continuous variable Sales using the other variables. A description of the variables can be obtained with:
help(Carseats)
library(rpart)
library(rpart.plot)
tree <- rpart(Sales~.,data=Carseats)
rpart.plot(tree)
We can also use visTree (from the visNetwork package) to obtain an interactive visualization of the tree:
library(visNetwork)
visTree(tree)
A Shiny application is also provided in this package:
visTreeEditor(Carseats)
printcp(tree)
Regression tree:
rpart(formula = Sales ~ ., data = Carseats)
Variables actually used in tree construction:
[1] Advertising Age CompPrice Income Population Price
[7] ShelveLoc
Root node error: 3182.3/400 = 7.9557
n= 400
CP nsplit rel error xerror xstd
1 0.250510 0 1.00000 1.00615 0.069611
2 0.105073 1 0.74949 0.75846 0.051854
3 0.051121 2 0.64442 0.66117 0.045043
4 0.045671 3 0.59330 0.64788 0.044809
5 0.033592 4 0.54763 0.60548 0.042902
6 0.024063 5 0.51403 0.59616 0.042790
7 0.023948 6 0.48997 0.61594 0.042821
8 0.022163 7 0.46602 0.61594 0.042821
9 0.016043 8 0.44386 0.58359 0.041977
10 0.014027 9 0.42782 0.58996 0.041083
11 0.013145 11 0.39976 0.58616 0.039102
12 0.012711 12 0.38662 0.58039 0.039014
13 0.012147 13 0.37391 0.58373 0.040509
14 0.011888 14 0.36176 0.57832 0.040428
15 0.010778 15 0.34987 0.56209 0.038606
16 0.010506 16 0.33909 0.55863 0.038231
17 0.010000 17 0.32859 0.55515 0.037591
This output describes the sequence of nested trees that optimizes the cost/complexity criterion.
The classical approach is to choose the tree with the smallest prediction error (xerror column). Notice that here the prediction error is still decreasing: it does not rise again past some point. The sequence of trees is therefore possibly not large enough.
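This error curve can also be inspected graphically; a quick look with plotcp:
plotcp(tree)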
We build a larger sequence of trees by adjusting the cp and minsplit parameters:
tree1 <- rpart(Sales~.,data=Carseats,cp=0.00001,minsplit=2)
printcp(tree1)
Regression tree:
rpart(formula = Sales ~ ., data = Carseats, cp = 1e-05, minsplit = 2)
Variables actually used in tree construction:
[1] Advertising Age CompPrice Education Income Population
[7] Price ShelveLoc Urban US
Root node error: 3182.3/400 = 7.9557
n= 400
CP nsplit rel error xerror xstd
1 2.5051e-01 0 1.00000000 1.00291 0.069404
2 1.0507e-01 1 0.74948961 0.75712 0.051578
3 5.1121e-02 2 0.64441706 0.69151 0.047005
4 4.5671e-02 3 0.59329646 0.68494 0.047818
5 3.3592e-02 4 0.54762521 0.63089 0.043799
6 2.4063e-02 5 0.51403284 0.59586 0.040847
7 2.3948e-02 6 0.48997005 0.60704 0.041098
8 2.2163e-02 7 0.46602225 0.60359 0.040219
9 1.6043e-02 8 0.44385897 0.59711 0.041520
10 1.4027e-02 9 0.42781645 0.59800 0.040097
11 1.3145e-02 11 0.39976237 0.60553 0.039983
12 1.2711e-02 12 0.38661699 0.60372 0.040083
13 1.2147e-02 13 0.37390609 0.59499 0.039630
14 1.1888e-02 14 0.36175900 0.59501 0.039596
15 1.0778e-02 15 0.34987122 0.59872 0.039997
16 1.0506e-02 16 0.33909277 0.59808 0.039869
17 1.0301e-02 17 0.32858663 0.61208 0.040840
18 9.8052e-03 18 0.31828518 0.61243 0.041850
19 9.5324e-03 20 0.29867475 0.61452 0.042323
20 9.3098e-03 21 0.28914234 0.61337 0.042270
21 8.6039e-03 22 0.27983257 0.62966 0.042877
22 8.5728e-03 23 0.27122871 0.63401 0.042597
23 7.7737e-03 25 0.25408305 0.62308 0.041816
24 7.4353e-03 26 0.24630936 0.61296 0.041396
25 6.2838e-03 28 0.23143882 0.61657 0.040090
26 6.1242e-03 29 0.22515504 0.60740 0.039826
27 5.6953e-03 30 0.21903085 0.60385 0.039753
28 5.5687e-03 31 0.21333555 0.60897 0.040007
29 5.4134e-03 32 0.20776686 0.60729 0.040051
30 5.1373e-03 33 0.20235343 0.60269 0.039312
31 4.9581e-03 34 0.19721608 0.59573 0.038481
32 4.8270e-03 35 0.19225798 0.58846 0.038180
33 4.5558e-03 36 0.18743102 0.58848 0.038361
34 4.5456e-03 37 0.18287525 0.60276 0.038908
35 4.3739e-03 38 0.17832965 0.59951 0.038879
36 4.3307e-03 39 0.17395578 0.61251 0.043734
37 4.2485e-03 40 0.16962503 0.61401 0.043851
38 4.0980e-03 41 0.16537650 0.60555 0.043654
39 4.0525e-03 42 0.16127847 0.60091 0.043411
40 4.0054e-03 43 0.15722596 0.60037 0.043307
41 3.6917e-03 44 0.15322052 0.60364 0.043309
42 3.6352e-03 45 0.14952883 0.60289 0.043054
43 3.5301e-03 46 0.14589367 0.60237 0.043070
44 3.5196e-03 47 0.14236356 0.59975 0.043111
45 2.8653e-03 48 0.13884396 0.59928 0.043212
46 2.8565e-03 49 0.13597868 0.60643 0.043865
47 2.8565e-03 50 0.13312217 0.60643 0.043865
48 2.7253e-03 51 0.13026571 0.60586 0.043878
49 2.6841e-03 52 0.12754044 0.60505 0.043872
50 2.6829e-03 54 0.12217220 0.60694 0.044076
51 2.6660e-03 55 0.11948928 0.60961 0.044117
52 2.4588e-03 56 0.11682326 0.60625 0.044136
53 2.3693e-03 57 0.11436443 0.60113 0.044269
54 2.3018e-03 58 0.11199508 0.59687 0.044450
55 2.2746e-03 60 0.10739152 0.60136 0.044742
56 2.2540e-03 61 0.10511688 0.60191 0.044740
57 2.1781e-03 62 0.10286290 0.60048 0.044721
58 2.1645e-03 63 0.10068483 0.60718 0.044985
59 2.0950e-03 64 0.09852033 0.60604 0.044971
60 2.0945e-03 65 0.09642538 0.60889 0.045046
61 2.0740e-03 66 0.09433084 0.61122 0.045104
62 1.8864e-03 67 0.09225680 0.61784 0.045361
63 1.8413e-03 68 0.09037038 0.62699 0.046843
64 1.7921e-03 69 0.08852905 0.62473 0.046864
65 1.7167e-03 70 0.08673697 0.62249 0.046893
66 1.6766e-03 71 0.08502031 0.62214 0.046901
67 1.6704e-03 72 0.08334367 0.62293 0.046889
68 1.6064e-03 73 0.08167332 0.62646 0.047130
69 1.6055e-03 74 0.08006697 0.62736 0.047112
70 1.5103e-03 75 0.07846149 0.62853 0.047272
71 1.4967e-03 76 0.07695120 0.63041 0.047443
72 1.4907e-03 77 0.07545453 0.62971 0.047264
73 1.4007e-03 78 0.07396387 0.62773 0.047244
74 1.4002e-03 79 0.07256317 0.62827 0.047523
75 1.3613e-03 80 0.07116301 0.63139 0.047825
76 1.3589e-03 81 0.06980172 0.63184 0.047815
77 1.3462e-03 82 0.06844282 0.63184 0.047815
78 1.3351e-03 83 0.06709659 0.63198 0.047706
79 1.3304e-03 84 0.06576144 0.63290 0.047706
80 1.3146e-03 85 0.06443102 0.63122 0.047411
81 1.2795e-03 86 0.06311644 0.62646 0.047086
82 1.2412e-03 87 0.06183696 0.62358 0.046390
83 1.2373e-03 88 0.06059575 0.62183 0.046634
84 1.2135e-03 89 0.05935843 0.62424 0.046753
85 1.2002e-03 91 0.05693148 0.62948 0.047203
86 1.1269e-03 92 0.05573126 0.62958 0.047214
87 1.0919e-03 93 0.05460435 0.63687 0.047237
88 1.0898e-03 94 0.05351243 0.63554 0.047249
89 1.0864e-03 95 0.05242260 0.63532 0.047255
90 1.0646e-03 96 0.05133621 0.63314 0.047003
91 1.0116e-03 97 0.05027156 0.63306 0.047076
92 9.5940e-04 98 0.04925996 0.63774 0.047547
93 8.9105e-04 99 0.04830056 0.63716 0.047380
94 8.8465e-04 100 0.04740951 0.63773 0.047367
95 8.7611e-04 101 0.04652486 0.64000 0.047364
96 8.5644e-04 102 0.04564875 0.64280 0.047391
97 8.4568e-04 103 0.04479231 0.64291 0.047464
98 8.3004e-04 104 0.04394663 0.64368 0.047476
99 8.0748e-04 105 0.04311659 0.64447 0.047462
100 7.9944e-04 106 0.04230912 0.64275 0.047319
101 7.5680e-04 107 0.04150968 0.64276 0.047279
102 7.4082e-04 108 0.04075288 0.63971 0.047063
103 7.4043e-04 109 0.04001206 0.63952 0.047047
104 7.3510e-04 110 0.03927163 0.63952 0.047047
105 7.0107e-04 111 0.03853653 0.64072 0.046975
106 6.9184e-04 112 0.03783546 0.63802 0.046794
107 6.7585e-04 113 0.03714362 0.63670 0.046722
108 6.7373e-04 114 0.03646776 0.63751 0.046682
109 6.7173e-04 115 0.03579403 0.63751 0.046682
110 6.6783e-04 116 0.03512230 0.63847 0.046672
111 6.6518e-04 117 0.03445448 0.63847 0.046672
112 6.6451e-04 118 0.03378929 0.63847 0.046672
113 6.0900e-04 119 0.03312478 0.63703 0.046289
114 6.0343e-04 120 0.03251578 0.63514 0.046325
115 5.9465e-04 121 0.03191235 0.63600 0.046306
116 5.8550e-04 123 0.03072304 0.63894 0.046451
117 5.8340e-04 124 0.03013754 0.63913 0.046449
118 5.6972e-04 125 0.02955414 0.63935 0.046529
119 5.6433e-04 126 0.02898442 0.63835 0.046483
120 5.6323e-04 127 0.02842009 0.63835 0.046483
121 5.4821e-04 128 0.02785686 0.63630 0.046425
122 5.4339e-04 131 0.02621222 0.63730 0.046456
123 5.1968e-04 132 0.02566882 0.63575 0.046452
124 5.0869e-04 133 0.02514915 0.63277 0.046393
125 5.0157e-04 134 0.02464045 0.63239 0.046287
126 4.7302e-04 135 0.02413889 0.63334 0.046151
127 4.6969e-04 136 0.02366587 0.63334 0.046151
128 4.6775e-04 137 0.02319618 0.63334 0.046151
129 4.6669e-04 138 0.02272842 0.63334 0.046151
130 4.5761e-04 139 0.02226174 0.63236 0.046119
131 4.5283e-04 140 0.02180413 0.63335 0.046118
132 4.5270e-04 141 0.02135130 0.63242 0.046117
133 4.5251e-04 142 0.02089861 0.63242 0.046117
134 4.4875e-04 143 0.02044610 0.63242 0.046117
135 4.4874e-04 144 0.01999735 0.63242 0.046117
136 4.4666e-04 145 0.01954861 0.63242 0.046117
137 4.3805e-04 146 0.01910194 0.63583 0.046189
138 4.2159e-04 147 0.01866389 0.63810 0.046220
139 4.1179e-04 148 0.01824230 0.63780 0.046246
140 3.8646e-04 149 0.01783051 0.63800 0.046305
141 3.6959e-04 150 0.01744404 0.64026 0.046309
142 3.3035e-04 151 0.01707446 0.63823 0.046416
143 3.0799e-04 152 0.01674411 0.64103 0.046497
144 3.0672e-04 153 0.01643612 0.64306 0.046644
145 3.0672e-04 154 0.01612940 0.64306 0.046644
146 3.0672e-04 155 0.01582268 0.64306 0.046644
147 3.0544e-04 156 0.01551596 0.64316 0.046641
148 3.0094e-04 157 0.01521052 0.64503 0.046758
149 2.9757e-04 158 0.01490958 0.64530 0.046753
150 2.8981e-04 159 0.01461201 0.64447 0.046774
151 2.8923e-04 160 0.01432220 0.64394 0.046737
152 2.8782e-04 161 0.01403296 0.64394 0.046737
153 2.8635e-04 162 0.01374515 0.64394 0.046737
154 2.8189e-04 163 0.01345879 0.64504 0.046716
155 2.8173e-04 164 0.01317690 0.64475 0.046716
156 2.6988e-04 165 0.01289517 0.64630 0.046869
157 2.6283e-04 166 0.01262530 0.64570 0.046860
158 2.5737e-04 167 0.01236246 0.64494 0.046893
159 2.5139e-04 168 0.01210509 0.64677 0.047107
160 2.5003e-04 169 0.01185370 0.64483 0.046869
161 2.3771e-04 170 0.01160367 0.64490 0.046866
162 2.3512e-04 171 0.01136596 0.64563 0.046998
163 2.2600e-04 172 0.01113084 0.64464 0.046995
164 2.1796e-04 173 0.01090483 0.64412 0.046982
165 2.1590e-04 174 0.01068688 0.64412 0.046982
166 2.1121e-04 175 0.01047098 0.64429 0.046978
167 2.0973e-04 176 0.01025977 0.64373 0.046951
168 2.0949e-04 178 0.00984031 0.64373 0.046951
169 2.0779e-04 179 0.00963081 0.64373 0.046951
170 2.0120e-04 180 0.00942302 0.64310 0.046941
171 2.0025e-04 181 0.00922182 0.64310 0.046941
172 1.9247e-04 182 0.00902157 0.64342 0.046939
173 1.8668e-04 183 0.00882910 0.64392 0.046939
174 1.7976e-04 184 0.00864242 0.64347 0.046990
175 1.6630e-04 185 0.00846266 0.64246 0.046972
176 1.6596e-04 186 0.00829637 0.64109 0.046997
177 1.6594e-04 187 0.00813041 0.64109 0.046997
178 1.6347e-04 188 0.00796447 0.64109 0.046997
179 1.6290e-04 189 0.00780100 0.64347 0.047046
180 1.5712e-04 190 0.00763810 0.64150 0.046944
181 1.5619e-04 191 0.00748098 0.64141 0.046947
182 1.5210e-04 192 0.00732479 0.64048 0.046946
183 1.4745e-04 193 0.00717270 0.63952 0.047016
184 1.4354e-04 194 0.00702525 0.63910 0.047021
185 1.3883e-04 195 0.00688171 0.63904 0.047020
186 1.3883e-04 196 0.00674288 0.63896 0.046978
187 1.3613e-04 197 0.00660405 0.63896 0.046978
188 1.3589e-04 198 0.00646792 0.63887 0.046981
189 1.3299e-04 199 0.00633203 0.63932 0.046971
190 1.3241e-04 200 0.00619904 0.63843 0.046968
191 1.3011e-04 201 0.00606664 0.63843 0.046968
192 1.2674e-04 202 0.00593652 0.63843 0.046968
193 1.2674e-04 203 0.00580978 0.63822 0.046973
194 1.2167e-04 204 0.00568304 0.63933 0.046967
195 1.2167e-04 205 0.00556136 0.63971 0.047080
196 1.2105e-04 206 0.00543969 0.63971 0.047080
197 1.1352e-04 207 0.00531864 0.63941 0.047083
198 1.0898e-04 208 0.00520512 0.63913 0.047090
199 1.0860e-04 209 0.00509614 0.63957 0.047081
200 1.0592e-04 210 0.00498754 0.63957 0.047081
[ reached getOption("max.print") -- omitted 92 rows ]
plotcp(tree1)
We choose the tree with the smallest prediction error:
cp_opt <- tree1$cptable %>% as.data.frame() %>% dplyr::filter(xerror==min(xerror)) %>% dplyr::slice(1) %>% dplyr::pull(CP)
opt.tree <- prune(tree1,cp=cp_opt) ## prune the large tree tree1, not the initial tree
rpart.plot(opt.tree)
id.new <- sample(nrow(Carseats),10)
new.x <- Carseats %>% slice(id.new) %>% select(-Sales)
Compute the Sales values predicted by the fitted tree for these new observations.
predict(opt.tree,newdata=new.x)
1 2 3 4 5 6 7 8
5.786500 5.786500 4.986765 6.230000 6.626512 3.767200 9.828889 5.385833
9 10
4.986765 10.730000
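As a quick check, these predictions can be compared with the observed Sales of the same sampled rows (obs and prev are names introduced here for illustration):
obs <- Carseats %>% slice(id.new) %>% pull(Sales)
prev <- predict(opt.tree,newdata=new.x)
sqrt(mean((obs-prev)^2)) ## RMSE on these 10 observations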
We consider the spam dataset from the kernlab package.
library(kernlab)
data(spam)
set.seed(1234)
spam <- spam[sample(nrow(spam)),]
The goal is to explain the binary variable type using the other variables.
library(randomForest)
rf1 <- randomForest(type~.,data=spam)
plot(rf1)
This plot displays the classification error as well as the false positive and false negative rates, all computed Out Of Bag, as a function of the number of trees in the forest. It can be used to check whether the algorithm has “converged”: if not, a forest with more trees should be grown.
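The curves displayed by plot(rf1) are stored in the err.rate component of the fitted object; looking at the last rows is a simple way to check that the OOB error has stabilized:
tail(rf1$err.rate) ## columns: OOB error, then the per-class errors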
rf2 <- randomForest(type~.,data=spam,mtry=1)
rf1
Call:
randomForest(formula = type ~ ., data = spam)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 7
OOB estimate of error rate: 4.56%
Confusion matrix:
nonspam spam class.error
nonspam 2711 77 0.02761836
spam 133 1680 0.07335907
rf2
Call:
randomForest(formula = type ~ ., data = spam, mtry = 1)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 1
OOB estimate of error rate: 7.89%
Confusion matrix:
nonspam spam class.error
nonspam 2729 59 0.02116212
spam 304 1509 0.16767788
The forest rf1 performs better in terms of OOB classification error.
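As a sanity check, the OOB error rate is just the proportion of off-diagonal entries in the confusion matrix: \((77+133)/4601\approx 4.56\%\) for rf1 and \((59+304)/4601\approx 7.89\%\) for rf2.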
library(caret)
grille.mtry <- data.frame(mtry=seq(1,30,by=5))
ctrl <- trainControl(method="oob")
library(doParallel) ## for parallel computation
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
set.seed(12345)
sel.mtry <- train(type~.,data=spam,method="rf",trControl=ctrl,tuneGrid=grille.mtry)
stopCluster(cl)
We select the value of mtry chosen by the procedure:
sel.mtry$bestTune
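The OOB accuracy obtained for each candidate value of mtry is stored in the results component of the train object:
sel.mtry$results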
rf3 <- randomForest(type~.,data=spam,mtry=unlist(sel.mtry$bestTune),importance=TRUE)
Imp <- importance(rf3,type=1) %>% as.data.frame() %>% mutate(variable=names(spam)[-58]) %>% arrange(desc(MeanDecreaseAccuracy))
head(Imp)
ggplot(Imp) + aes(x=reorder(variable,MeanDecreaseAccuracy),y=MeanDecreaseAccuracy)+geom_bar(stat="identity")+coord_flip()+xlab("")+theme_classic()
library(ranger)
system.time(rf4 <- ranger(type~.,data=spam))
user system elapsed
3.825 0.025 0.588
system.time(rf5 <- randomForest(type~.,data=spam))
user system elapsed
8.522 0.078 8.604
Computation is faster with ranger. This package provides an efficient implementation of random forests for high-dimensional data; more information can be found here.
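Note that ranger is multithreaded by default, which is why the elapsed time is much smaller than the user time above; the number of threads can be set with the num.threads argument (the object name rf4.1 below is just illustrative):
rf4.1 <- ranger(type~.,data=spam,num.threads=1) ## single-threaded fit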
We still consider the spam dataset from the kernlab package.
library(gbm)
model_ada1 <- gbm(type~.,data=spam,distribution="adaboost",interaction.depth=2,shrinkage=0.05,n.trees=500)
For adaboost, gbm requires the qualitative response variable to be coded 0/1, so the call above does not work as is. We recode type accordingly:
spam1 <- spam
spam1$type <- as.numeric(spam1$type)-1
set.seed(1234)
model_ada1 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,shrinkage=0.05,n.trees=500)
The gbm algorithm performs a gradient descent that minimizes the loss function \[\frac{1}{n}\sum_{i=1}^n \ell(y_i,g(x_i)).\] For adaboost, the exponential loss is used: \(\ell(y,g(x))=\exp(-yg(x))\).
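A small numerical illustration of this loss, with \(y\) coded \(-1/1\) as in the formula (exp_loss is a helper name introduced here):
exp_loss <- function(y,gx) exp(-y*gx)
exp_loss(1,2)  ## well-classified observation: small loss, exp(-2)
exp_loss(-1,2) ## misclassified observation: large loss, exp(2)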
We display a summary of the model:
summary(model_ada1)
This provides an indicator measuring the importance of each variable in the construction of the method.
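summary() also returns these relative influences as a data frame (one row per variable), which can be reused for custom plots:
imp_ada <- summary(model_ada1,plotit=FALSE)
head(imp_ada)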
model_ada2 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,bag.fraction=1,cv.folds = 5,n.trees=500)
gbm.perf(model_ada2)
[1] 233
model_ada3 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,bag.fraction=1,cv.folds = 5,n.trees=500,shrinkage=0.05)
gbm.perf(model_ada3)
[1] 370
model_ada4 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,bag.fraction=1,cv.folds = 5,n.trees=500,shrinkage=0.5)
gbm.perf(model_ada4)
[1] 36
The optimal number of iterations increases as shrinkage decreases. This makes sense, since this parameter controls the step size of the gradient descent: the larger it is, the faster the criterion is minimized and the fewer iterations are performed. It should nevertheless not be taken too small, so that the estimator remains stable. Here, 0.05 seems to be a good value.
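To visualize this trade-off, the CV error curves of the three models (stored in the cv.error component of each gbm object) can be compared; a quick sketch:
data.frame(iter=1:500,
           shrink0.1=model_ada2$cv.error,
           shrink0.05=model_ada3$cv.error,
           shrink0.5=model_ada4$cv.error) %>%
  gather(key="shrinkage",value="cv_error",-iter) %>%
  ggplot()+aes(x=iter,y=cv_error,color=shrinkage)+geom_line()+theme_classic()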
Split the spam dataset into a training sample of size 3000 and a test sample containing the remaining observations. Using the training sample only, we will build a classification rule and a score with each of the following methods: a classification tree, the lasso, SVMs (linear and radial kernels), boosting (adaboost and logistic losses) and a random forest.
We split the data:
library(kernlab)
data(spam)
set.seed(123)
ind.app <- sample(nrow(spam),3000)
dapp <- spam %>% slice(ind.app)
dtest <- spam %>% slice(-ind.app)
library(rpart)
library(rpart.plot)
arbre <- rpart(type~.,data=dapp,cp=0.00001,minsplit=3)
plotcp(arbre)
cp_opt <- arbre$cptable[which.min(arbre$cptable[,"xerror"]),"CP"]
arbre_sel <- prune(arbre,cp=cp_opt)
rpart.plot(arbre_sel)
score <- data.frame(arbre=predict(arbre_sel,newdata=dtest,type="prob")[,2])
library(glmnet)
dapp1 <- model.matrix(type~.,data=dapp)[,-1]
Yapp1 <- as.factor(as.numeric(dapp$type)-1)
lasso.cv <- cv.glmnet(dapp1,Yapp1,alpha=1,family="binomial")
plot(lasso.cv)
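Note that predict applied to a cv.glmnet object uses the lambda.1se penalty by default; the two selected values can be displayed with:
lasso.cv$lambda.min
lasso.cv$lambda.1se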
dtest1 <- model.matrix(type~.,data=dtest)[,-1]
Ytest1 <- as.factor(as.numeric(dtest$type)-1)
score.lasso <- predict(lasso.cv,newx=dtest1,type="response") %>% unlist() %>% as.numeric()
score <- score %>% mutate(lasso=score.lasso)
C <- c(0.001,0.01,1,10,100,1000)
C <- c(1,10) ## reduced grid, e.g. to keep the computation time reasonable
gr <- expand.grid(C=C)
ctrl <- trainControl(method="cv")
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
set.seed(12345)
svm.lin <- train(type~.,data=dapp,method="svmLinear",trControl=ctrl,tuneGrid=gr,prob.model=TRUE)
## the cluster cl is kept alive: it is reused by the next train() call
C <- c(0.001,0.01,1,100,1000)
sigma <- c(0.05,0.1,0.5,1,5)
gr <- expand.grid(C=C,sigma=sigma)
ctrl <- trainControl(method="cv")
registerDoParallel(cl)
set.seed(12345)
svm.rad <- train(type~.,data=dapp,method="svmRadial",trControl=ctrl,tuneGrid=gr,prob.model=TRUE)
stopCluster(cl)
score <- score %>% mutate(svm.lin=predict(svm.lin,newdata=dtest,type="prob")[,2],
svm.rad=predict(svm.rad,newdata=dtest,type="prob")[,2])
library(gbm)
dapp2 <- dapp
dtest2 <- dtest
dapp2$type <- as.numeric(dapp2$type)-1
dtest2$type <- as.numeric(dtest2$type)-1
ada <- gbm(type~.,data=dapp2,distribution="adaboost",interaction.depth=2,shrinkage=0.05,cv.folds=5,bag.fraction=1,n.trees=500)
Mopt.ada <- gbm.perf(ada,method="cv")
logit <- gbm(type~.,data=dapp2,distribution="bernoulli",interaction.depth=2,shrinkage=0.1,cv.folds=5,bag.fraction=1,n.trees=1000)
Mopt.logit <- gbm.perf(logit,method="cv")
score <- score %>% mutate(ada=predict(ada,newdata=dtest,n.trees=Mopt.ada,type="response"),
logit=predict(logit,newdata=dtest,n.trees=Mopt.logit,type="response"))
library(randomForest)
foret <- randomForest(type~.,data=dapp,xtest=dtest[,-ncol(dtest)],ytest=dtest[,ncol(dtest)],keep.forest=TRUE)
score <- score %>% mutate(foret=foret$test$votes[,2]) ## proportion of trees voting "spam"
Comparison of the methods
We build a table containing all the information needed to compute the criteria:
score1 <- score %>% mutate(obs=dtest$type) %>% gather(key="Method",value="Score",-obs) %>%
mutate(Prev=recode(as.character(Score>0.5),"TRUE"="spam","FALSE"="nonspam"))
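A quick look at the long-format table:
head(score1)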
From this table we deduce the misclassification error and the AUC of each method:
score1 %>% group_by(Method) %>% summarise(Err=mean(obs!=Prev)) %>% arrange(Err)
score1 %>% group_by(Method) %>% summarize(AUC=pROC::auc(obs,Score)) %>% arrange(desc(AUC))
library(plotROC)
ggplot(score1)+aes(d=obs,m=Score,color=Method)+geom_roc()+theme_classic()