library(tidyverse)
We consider the Carseats dataset from the ISLR package.
library(ISLR)
data(Carseats)
The problem is to explain the continuous variable Sales using the other variables. A description of the variables can be obtained with:
help(Carseats)
library(rpart)
library(rpart.plot)
tree <- rpart(Sales~.,data=Carseats)
rpart.plot(tree)
We can also use visTree (from the visNetwork package) to obtain an interactive visualization of the tree:
library(visNetwork)
visTree(tree)
A Shiny application is also provided by this package:
visTreeEditor(Carseats)
printcp(tree)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = Carseats)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Income Population Price
## [7] ShelveLoc
##
## Root node error: 3182.3/400 = 7.9557
##
## n= 400
##
## CP nsplit rel error xerror xstd
## 1 0.250510 0 1.00000 1.00505 0.069326
## 2 0.105073 1 0.74949 0.75773 0.051374
## 3 0.051121 2 0.64442 0.69016 0.046600
## 4 0.045671 3 0.59330 0.67712 0.045226
## 5 0.033592 4 0.54763 0.61476 0.041991
## 6 0.024063 5 0.51403 0.62552 0.043198
## 7 0.023948 6 0.48997 0.63188 0.043239
## 8 0.022163 7 0.46602 0.63054 0.043256
## 9 0.016043 8 0.44386 0.59951 0.041214
## 10 0.014027 9 0.42782 0.62885 0.043420
## 11 0.013145 11 0.39976 0.62564 0.042471
## 12 0.012711 12 0.38662 0.61644 0.041528
## 13 0.012147 13 0.37391 0.61544 0.041268
## 14 0.011888 14 0.36176 0.61465 0.041388
## 15 0.010778 15 0.34987 0.60276 0.040118
## 16 0.010506 16 0.33909 0.60728 0.040578
## 17 0.010000 17 0.32859 0.61810 0.040885
We obtain information about the nested sequence of trees that optimizes the cost/complexity criterion:
The classical approach is to choose the tree with the smallest prediction error (xerror column). Here we notice that the prediction error keeps decreasing: it never starts rising again. The sequence of trees may therefore not be large enough.
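An alternative worth knowing is the 1-SE rule: instead of the minimum of xerror, keep the smallest tree whose xerror lies within one standard error of that minimum. A minimal sketch on the cptable above (cpt, thresh and cp_1se are names introduced here):
cpt <- as.data.frame(tree$cptable)
# threshold = min(xerror) + the standard error at the minimum
thresh <- min(cpt$xerror) + cpt$xstd[which.min(cpt$xerror)]
# first (hence smallest) tree whose xerror falls below the threshold
cp_1se <- cpt$CP[which(cpt$xerror <= thresh)[1]]
tree_1se <- prune(tree, cp = cp_1se)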
We build a larger subsequence by changing the cp and minsplit parameters:
tree1 <- rpart(Sales~.,data=Carseats,cp=0.00001,minsplit=2)
printcp(tree1)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = Carseats, cp = 1e-05, minsplit = 2)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Education Income
## [6] Population Price ShelveLoc Urban US
##
## Root node error: 3182.3/400 = 7.9557
##
## n= 400
##
## CP nsplit rel error xerror xstd
## 1 2.5051e-01 0 1.00000000 1.00401 0.069191
## 2 1.0507e-01 1 0.74948961 0.75812 0.051300
## 3 5.1121e-02 2 0.64441706 0.69927 0.047503
## 4 4.5671e-02 3 0.59329646 0.69177 0.046061
## 5 3.3592e-02 4 0.54762521 0.63300 0.045341
## 6 2.4063e-02 5 0.51403284 0.63469 0.044226
## 7 2.3948e-02 6 0.48997005 0.65467 0.044366
## 8 2.2163e-02 7 0.46602225 0.65706 0.044439
## 9 1.6043e-02 8 0.44385897 0.61051 0.041345
## 10 1.4027e-02 9 0.42781645 0.62344 0.041749
## 11 1.3145e-02 11 0.39976237 0.63295 0.041738
## 12 1.2711e-02 12 0.38661699 0.63880 0.042122
## 13 1.2147e-02 13 0.37390609 0.63696 0.042022
## 14 1.1888e-02 14 0.36175900 0.63276 0.042037
## 15 1.0778e-02 15 0.34987122 0.62136 0.041480
## 16 1.0506e-02 16 0.33909277 0.60796 0.039893
## 17 1.0301e-02 17 0.32858663 0.61017 0.039952
## 18 9.8052e-03 18 0.31828518 0.60766 0.040052
## 19 9.5324e-03 20 0.29867475 0.60891 0.039849
## 20 9.3098e-03 21 0.28914234 0.61073 0.039705
## 21 8.6039e-03 22 0.27983257 0.61577 0.039964
## 22 8.5728e-03 23 0.27122871 0.62103 0.040722
## 23 7.7737e-03 25 0.25408305 0.62779 0.041268
## 24 7.4353e-03 26 0.24630936 0.62060 0.040763
## 25 6.2838e-03 28 0.23143882 0.59467 0.039994
## 26 6.1242e-03 29 0.22515504 0.57385 0.038433
## 27 5.6953e-03 30 0.21903085 0.56284 0.038214
## 28 5.5687e-03 31 0.21333555 0.56167 0.038061
## 29 5.4134e-03 32 0.20776686 0.55829 0.037690
## 30 5.1373e-03 33 0.20235343 0.55411 0.037761
## 31 4.9581e-03 34 0.19721608 0.54784 0.037146
## 32 4.8270e-03 35 0.19225798 0.55893 0.037567
## 33 4.5558e-03 36 0.18743102 0.55974 0.037917
## 34 4.5456e-03 37 0.18287525 0.56128 0.037941
## 35 4.3739e-03 38 0.17832965 0.55791 0.037944
## 36 4.3307e-03 39 0.17395578 0.55303 0.037415
## 37 4.2485e-03 40 0.16962503 0.55255 0.037373
## 38 4.0980e-03 41 0.16537650 0.55875 0.037655
## 39 4.0525e-03 42 0.16127847 0.55945 0.037643
## 40 4.0054e-03 43 0.15722596 0.55945 0.037643
## 41 3.6917e-03 44 0.15322052 0.55938 0.037240
## 42 3.6352e-03 45 0.14952883 0.55169 0.037187
## 43 3.5301e-03 46 0.14589367 0.54929 0.037165
## 44 3.5196e-03 47 0.14236356 0.55488 0.037809
## 45 2.8653e-03 48 0.13884396 0.55677 0.037639
## 46 2.8565e-03 49 0.13597868 0.58098 0.042587
## 47 2.8565e-03 50 0.13312217 0.58098 0.042587
## 48 2.7253e-03 51 0.13026571 0.59207 0.043434
## 49 2.6841e-03 52 0.12754044 0.59403 0.043782
## 50 2.6829e-03 54 0.12217220 0.59723 0.043956
## 51 2.6660e-03 55 0.11948928 0.59729 0.043954
## 52 2.4588e-03 56 0.11682326 0.60459 0.044466
## 53 2.3693e-03 57 0.11436443 0.60192 0.044332
## 54 2.3018e-03 58 0.11199508 0.60253 0.044858
## 55 2.2746e-03 60 0.10739152 0.60138 0.044871
## 56 2.2540e-03 61 0.10511688 0.60133 0.044872
## 57 2.1781e-03 62 0.10286290 0.60121 0.044886
## 58 2.1645e-03 63 0.10068483 0.60106 0.044931
## 59 2.0950e-03 64 0.09852033 0.59915 0.044843
## 60 2.0945e-03 65 0.09642538 0.59665 0.044816
## 61 2.0740e-03 66 0.09433084 0.59540 0.044835
## 62 1.8864e-03 67 0.09225680 0.59673 0.045072
## 63 1.8413e-03 68 0.09037038 0.60079 0.045077
## 64 1.7921e-03 69 0.08852905 0.60279 0.044732
## 65 1.7167e-03 70 0.08673697 0.60879 0.045028
## 66 1.6766e-03 71 0.08502031 0.60826 0.044886
## 67 1.6704e-03 72 0.08334367 0.60681 0.044842
## 68 1.6064e-03 73 0.08167332 0.60678 0.044984
## 69 1.6055e-03 74 0.08006697 0.60358 0.044854
## 70 1.5103e-03 75 0.07846149 0.59867 0.044409
## 71 1.4967e-03 76 0.07695120 0.60031 0.044289
## 72 1.4907e-03 77 0.07545453 0.60031 0.044289
## 73 1.4007e-03 78 0.07396387 0.60031 0.044469
## 74 1.4002e-03 79 0.07256317 0.60124 0.044630
## 75 1.3613e-03 80 0.07116301 0.60250 0.044667
## 76 1.3589e-03 81 0.06980172 0.59915 0.044687
## 77 1.3462e-03 82 0.06844282 0.59915 0.044687
## 78 1.3351e-03 83 0.06709659 0.59923 0.044639
## 79 1.3304e-03 84 0.06576144 0.59923 0.044639
## 80 1.3146e-03 85 0.06443102 0.59952 0.044649
## 81 1.2795e-03 86 0.06311644 0.60230 0.044681
## 82 1.2412e-03 87 0.06183696 0.60035 0.044255
## 83 1.2373e-03 88 0.06059575 0.60007 0.044199
## 84 1.2135e-03 89 0.05935843 0.60276 0.044350
## 85 1.2002e-03 91 0.05693148 0.60441 0.044365
## 86 1.1269e-03 92 0.05573126 0.60120 0.044175
## 87 1.0919e-03 93 0.05460435 0.60238 0.044083
## 88 1.0898e-03 94 0.05351243 0.60485 0.044255
## 89 1.0864e-03 95 0.05242260 0.60527 0.044246
## 90 1.0646e-03 96 0.05133621 0.60588 0.044232
## 91 1.0116e-03 97 0.05027156 0.60735 0.044310
## 92 9.5940e-04 98 0.04925996 0.60656 0.044315
## 93 8.9105e-04 99 0.04830056 0.60979 0.044430
## 94 8.8465e-04 100 0.04740951 0.61063 0.044466
## 95 8.7611e-04 101 0.04652486 0.60658 0.044383
## 96 8.5644e-04 102 0.04564875 0.60702 0.044385
## 97 8.4568e-04 103 0.04479231 0.60710 0.044383
## 98 8.3004e-04 104 0.04394663 0.60710 0.044383
## 99 8.0748e-04 105 0.04311659 0.60976 0.044501
## 100 7.9944e-04 106 0.04230912 0.60933 0.044472
## 101 7.5680e-04 107 0.04150968 0.60872 0.044427
## 102 7.4082e-04 108 0.04075288 0.60319 0.044363
## 103 7.4043e-04 109 0.04001206 0.60079 0.044226
## 104 7.3510e-04 110 0.03927163 0.60079 0.044226
## 105 7.0107e-04 111 0.03853653 0.59869 0.044134
## 106 6.9184e-04 112 0.03783546 0.60257 0.044019
## 107 6.7585e-04 113 0.03714362 0.60156 0.044130
## 108 6.7373e-04 114 0.03646776 0.60219 0.044139
## 109 6.7173e-04 115 0.03579403 0.60242 0.044132
## 110 6.6783e-04 116 0.03512230 0.60213 0.044140
## 111 6.6518e-04 117 0.03445448 0.60213 0.044140
## 112 6.6451e-04 118 0.03378929 0.60213 0.044140
## 113 6.0900e-04 119 0.03312478 0.60268 0.043905
## 114 6.0343e-04 120 0.03251578 0.60338 0.043983
## 115 5.9465e-04 121 0.03191235 0.60338 0.043983
## 116 5.8550e-04 123 0.03072304 0.60429 0.043932
## 117 5.8340e-04 124 0.03013754 0.60470 0.043933
## 118 5.6972e-04 125 0.02955414 0.60584 0.043993
## 119 5.6433e-04 126 0.02898442 0.61046 0.044229
## 120 5.6323e-04 127 0.02842009 0.61028 0.044224
## 121 5.4821e-04 128 0.02785686 0.60879 0.044226
## 122 5.4339e-04 131 0.02621222 0.60885 0.044225
## 123 5.1968e-04 132 0.02566882 0.60977 0.044244
## 124 5.0869e-04 133 0.02514915 0.61237 0.044469
## 125 5.0157e-04 134 0.02464045 0.61236 0.044475
## 126 4.7302e-04 135 0.02413889 0.60868 0.044486
## 127 4.6969e-04 136 0.02366587 0.60671 0.044323
## 128 4.6775e-04 137 0.02319618 0.60671 0.044323
## 129 4.6669e-04 138 0.02272842 0.60671 0.044323
## 130 4.5761e-04 139 0.02226174 0.60630 0.044333
## 131 4.5283e-04 140 0.02180413 0.60525 0.044304
## 132 4.5270e-04 141 0.02135130 0.60525 0.044304
## 133 4.5251e-04 142 0.02089861 0.60525 0.044304
## 134 4.4875e-04 143 0.02044610 0.60525 0.044304
## 135 4.4874e-04 144 0.01999735 0.60527 0.044304
## 136 4.4666e-04 145 0.01954861 0.60527 0.044304
## 137 4.3805e-04 146 0.01910194 0.60521 0.044306
## 138 4.2159e-04 147 0.01866389 0.60504 0.044317
## 139 4.1179e-04 148 0.01824230 0.60477 0.044327
## 140 3.8646e-04 149 0.01783051 0.61066 0.044547
## 141 3.6959e-04 150 0.01744404 0.61374 0.044701
## 142 3.3035e-04 151 0.01707446 0.61269 0.044643
## 143 3.0799e-04 152 0.01674411 0.61248 0.044384
## 144 3.0672e-04 153 0.01643612 0.61224 0.044353
## 145 3.0672e-04 154 0.01612940 0.61204 0.044358
## 146 3.0672e-04 155 0.01582268 0.61204 0.044358
## 147 3.0544e-04 156 0.01551596 0.61204 0.044358
## 148 3.0094e-04 157 0.01521052 0.61078 0.044326
## 149 2.9757e-04 158 0.01490958 0.61239 0.044386
## 150 2.8981e-04 159 0.01461201 0.61199 0.044376
## 151 2.8923e-04 160 0.01432220 0.61353 0.044507
## 152 2.8782e-04 161 0.01403296 0.61388 0.044529
## 153 2.8635e-04 162 0.01374515 0.61388 0.044529
## 154 2.8189e-04 163 0.01345879 0.61459 0.044523
## 155 2.8173e-04 164 0.01317690 0.61429 0.044531
## 156 2.6988e-04 165 0.01289517 0.61488 0.044514
## 157 2.6283e-04 166 0.01262530 0.61610 0.044618
## 158 2.5737e-04 167 0.01236246 0.61726 0.044720
## 159 2.5139e-04 168 0.01210509 0.61964 0.044844
## 160 2.5003e-04 169 0.01185370 0.61964 0.044844
## 161 2.3771e-04 170 0.01160367 0.62041 0.044760
## 162 2.3512e-04 171 0.01136596 0.62269 0.044849
## 163 2.2600e-04 172 0.01113084 0.62114 0.044865
## 164 2.1796e-04 173 0.01090483 0.62184 0.044850
## 165 2.1590e-04 174 0.01068688 0.62270 0.044858
## 166 2.1121e-04 175 0.01047098 0.62455 0.044862
## 167 2.0973e-04 176 0.01025977 0.62455 0.044862
## 168 2.0949e-04 178 0.00984031 0.62455 0.044862
## 169 2.0779e-04 179 0.00963081 0.62455 0.044862
## 170 2.0120e-04 180 0.00942302 0.62402 0.044853
## 171 2.0025e-04 181 0.00922182 0.62406 0.044852
## 172 1.9247e-04 182 0.00902157 0.62428 0.044869
## 173 1.8668e-04 183 0.00882910 0.62488 0.044896
## 174 1.7976e-04 184 0.00864242 0.62547 0.044896
## 175 1.6630e-04 185 0.00846266 0.62719 0.044993
## 176 1.6596e-04 186 0.00829637 0.62596 0.045001
## 177 1.6594e-04 187 0.00813041 0.62596 0.045001
## 178 1.6347e-04 188 0.00796447 0.62642 0.045011
## 179 1.6290e-04 189 0.00780100 0.62558 0.044983
## 180 1.5712e-04 190 0.00763810 0.62583 0.044988
## 181 1.5619e-04 191 0.00748098 0.62529 0.044948
## 182 1.5210e-04 192 0.00732479 0.62450 0.044936
## 183 1.4745e-04 193 0.00717270 0.62443 0.044942
## 184 1.4354e-04 194 0.00702525 0.62403 0.044911
## 185 1.3883e-04 195 0.00688171 0.62436 0.045000
## 186 1.3883e-04 196 0.00674288 0.62436 0.045000
## 187 1.3613e-04 197 0.00660405 0.62533 0.045001
## 188 1.3589e-04 198 0.00646792 0.62590 0.045124
## 189 1.3299e-04 199 0.00633203 0.62617 0.045126
## 190 1.3241e-04 200 0.00619904 0.62617 0.045126
## 191 1.3011e-04 201 0.00606664 0.62606 0.045129
## 192 1.2674e-04 202 0.00593652 0.62611 0.045112
## 193 1.2674e-04 203 0.00580978 0.62603 0.045109
## 194 1.2167e-04 204 0.00568304 0.62603 0.045109
## 195 1.2167e-04 205 0.00556136 0.62737 0.045246
## 196 1.2105e-04 206 0.00543969 0.62737 0.045246
## 197 1.1352e-04 207 0.00531864 0.62878 0.045231
## 198 1.0898e-04 208 0.00520512 0.62870 0.045255
## 199 1.0860e-04 209 0.00509614 0.62870 0.045255
## 200 1.0592e-04 210 0.00498754 0.62909 0.045247
## 201 1.0265e-04 211 0.00488162 0.62822 0.045218
## 202 9.6794e-05 212 0.00477896 0.62875 0.045293
## 203 9.5532e-05 213 0.00468217 0.62846 0.045293
## 204 9.4042e-05 214 0.00458664 0.62791 0.045302
## 205 9.1257e-05 215 0.00449260 0.62767 0.045293
## 206 9.0753e-05 216 0.00440134 0.62762 0.045294
## 207 8.9624e-05 217 0.00431059 0.62702 0.045292
## 208 8.8270e-05 218 0.00422096 0.62687 0.045295
## 209 8.7486e-05 219 0.00413269 0.62687 0.045295
## 210 8.3729e-05 220 0.00404521 0.62652 0.045288
## 211 8.1451e-05 221 0.00396148 0.62784 0.045361
## 212 7.9204e-05 222 0.00388003 0.62607 0.045283
## 213 7.7471e-05 224 0.00372162 0.62552 0.045275
## 214 7.6989e-05 225 0.00364415 0.62528 0.045272
## 215 7.4805e-05 227 0.00349017 0.62490 0.045274
## 216 7.2925e-05 228 0.00341536 0.62518 0.045290
## 217 7.2160e-05 229 0.00334244 0.62483 0.045299
## 218 7.1694e-05 230 0.00327028 0.62483 0.045299
## 219 6.9264e-05 231 0.00319859 0.62545 0.045484
## 220 6.8065e-05 232 0.00312932 0.62571 0.045557
## 221 6.8065e-05 233 0.00306126 0.62571 0.045557
## 222 6.7977e-05 234 0.00299319 0.62571 0.045557
## 223 6.6383e-05 235 0.00292522 0.62561 0.045559
## 224 6.6383e-05 236 0.00285883 0.62601 0.045557
## 225 6.6383e-05 237 0.00279245 0.62601 0.045557
## 226 6.6203e-05 238 0.00272607 0.62601 0.045557
## 227 6.5697e-05 239 0.00265986 0.62601 0.045557
## 228 6.5373e-05 240 0.00259417 0.62601 0.045557
## 229 6.4356e-05 241 0.00252879 0.62580 0.045558
## 230 6.3372e-05 242 0.00246444 0.62671 0.045648
## 231 6.2228e-05 243 0.00240107 0.62659 0.045638
## 232 6.2225e-05 244 0.00233884 0.62659 0.045638
## 233 6.0397e-05 245 0.00227661 0.62659 0.045638
## 234 5.8464e-05 246 0.00221622 0.62716 0.045637
## 235 5.8137e-05 248 0.00209929 0.62716 0.045637
## 236 5.4694e-05 249 0.00204115 0.62821 0.045680
## 237 5.2855e-05 251 0.00193176 0.62827 0.045701
## 238 5.1331e-05 252 0.00187891 0.62852 0.045696
## 239 5.1048e-05 253 0.00182758 0.62751 0.045682
## 240 4.9324e-05 255 0.00172548 0.62751 0.045682
## 241 4.9278e-05 256 0.00167616 0.62751 0.045682
## 242 4.9278e-05 257 0.00162688 0.62751 0.045682
## 243 4.9273e-05 258 0.00157760 0.62751 0.045682
## 244 4.5298e-05 259 0.00152833 0.62739 0.045681
## 245 4.3577e-05 260 0.00148303 0.62728 0.045687
## 246 4.3370e-05 261 0.00143945 0.62649 0.045699
## 247 4.2422e-05 262 0.00139608 0.62623 0.045693
## 248 4.0867e-05 263 0.00135366 0.62689 0.045733
## 249 3.9280e-05 264 0.00131279 0.62694 0.045732
## 250 3.7840e-05 265 0.00127351 0.62699 0.045728
## 251 3.7840e-05 266 0.00123567 0.62671 0.045734
## 252 3.7840e-05 267 0.00119783 0.62671 0.045734
## 253 3.6955e-05 268 0.00115999 0.62698 0.045729
## 254 3.5847e-05 269 0.00112304 0.62712 0.045728
## 255 3.5216e-05 270 0.00108719 0.62777 0.045734
## 256 3.4708e-05 271 0.00105197 0.62794 0.045733
## 257 3.4032e-05 272 0.00101727 0.62776 0.045727
## 258 3.3519e-05 273 0.00098323 0.62776 0.045727
## 259 3.3247e-05 274 0.00094971 0.62759 0.045732
## 260 2.9981e-05 275 0.00091647 0.62759 0.045732
## 261 2.9052e-05 276 0.00088649 0.62750 0.045734
## 262 2.7245e-05 277 0.00085744 0.62795 0.045733
## 263 2.5663e-05 278 0.00083019 0.62770 0.045743
## 264 2.5663e-05 279 0.00080453 0.62780 0.045744
## 265 2.2814e-05 280 0.00077886 0.62799 0.045755
## 266 2.2688e-05 281 0.00075605 0.62973 0.045861
## 267 2.2128e-05 282 0.00073336 0.63014 0.045858
## 268 2.1877e-05 283 0.00071123 0.63014 0.045858
## 269 2.1510e-05 284 0.00068936 0.63014 0.045858
## 270 2.0132e-05 285 0.00066785 0.62929 0.045800
## 271 2.0132e-05 286 0.00064772 0.62913 0.045800
## 272 1.8231e-05 287 0.00062758 0.62979 0.045905
## 273 1.8163e-05 288 0.00060935 0.62979 0.045905
## 274 1.7618e-05 289 0.00059119 0.62979 0.045905
## 275 1.7618e-05 290 0.00057357 0.62990 0.045898
## 276 1.7608e-05 291 0.00055595 0.62990 0.045898
## 277 1.7110e-05 292 0.00053834 0.63000 0.045897
## 278 1.5272e-05 293 0.00052123 0.63000 0.045897
## 279 1.5099e-05 294 0.00050596 0.63067 0.045888
## 280 1.4162e-05 296 0.00047576 0.63043 0.045884
## 281 1.4162e-05 297 0.00046160 0.63055 0.045882
## 282 1.4141e-05 298 0.00044744 0.63055 0.045882
## 283 1.4141e-05 300 0.00041916 0.63055 0.045882
## 284 1.3214e-05 301 0.00040502 0.62992 0.045881
## 285 1.3214e-05 302 0.00039180 0.62977 0.045870
## 286 1.3093e-05 303 0.00037859 0.62977 0.045870
## 287 1.2318e-05 304 0.00036550 0.62974 0.045869
## 288 1.2318e-05 305 0.00035318 0.62997 0.045873
## 289 1.1454e-05 306 0.00034086 0.62997 0.045873
## 290 1.1082e-05 307 0.00032941 0.63011 0.045884
## 291 1.0621e-05 308 0.00031832 0.62977 0.045928
## 292 1.0000e-05 312 0.00027584 0.62977 0.045928
plotcp(tree1)
We choose the tree with the smallest prediction error.
cp_opt <- tree1$cptable %>% as.data.frame() %>%
  dplyr::filter(xerror==min(xerror)) %>%
  dplyr::select(CP) %>% as.numeric()
opt.tree <- prune(tree1,cp=cp_opt) # prune the large tree tree1, from which cp_opt was obtained
rpart.plot(opt.tree)
id.new <- sample(nrow(Carseats),10)
new.x <- Carseats %>% slice(id.new) %>% select(-Sales)
Compute the values of Sales predicted by the tree we built.
predict(opt.tree,newdata=new.x)
## 1 2 3 4 5 6 7
## 5.786500 6.626512 6.230000 8.396667 5.786500 10.730000 3.767200
## 8 9 10
## 12.187857 7.603333 9.828889
We consider the spam dataset from the kernlab package.
library(kernlab)
data(spam)
set.seed(1234)
spam <- spam[sample(nrow(spam)),]
The problem is to explain the binary variable type using the other variables.
library(randomForest)
rf1 <- randomForest(type~.,data=spam)
plot(rf1)
This plot shows the classification error, together with the false positive and false negative rates, computed Out Of Bag as a function of the number of trees in the forest. It can be used to check whether the algorithm has “converged”. If it has not, a forest with more trees should be grown.
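If convergence looks doubtful, growing a larger forest only requires increasing ntree (1000 below is an arbitrary choice):
rf1b <- randomForest(type~.,data=spam,ntree=1000)
plot(rf1b)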
rf2 <- randomForest(type~.,data=spam,mtry=1)
rf1
##
## Call:
## randomForest(formula = type ~ ., data = spam)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 4.67%
## Confusion matrix:
## nonspam spam class.error
## nonspam 2710 78 0.02797704
## spam 137 1676 0.07556536
rf2
##
## Call:
## randomForest(formula = type ~ ., data = spam, mtry = 1)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 7.56%
## Confusion matrix:
## nonspam spam class.error
## nonspam 2725 63 0.02259684
## spam 285 1528 0.15719801
The forest rf1 performs better in terms of OOB classification error.
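The OOB errors reported above can also be extracted programmatically from the err.rate matrix of each forest (its last row corresponds to the full forest):
tail(rf1$err.rate[,"OOB"],1)
tail(rf2$err.rate[,"OOB"],1)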
library(caret)
grille.mtry <- data.frame(mtry=seq(1,30,by=5))
ctrl <- trainControl(method="oob")
library(doParallel) ## for parallel computation
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
set.seed(12345)
sel.mtry <- train(type~.,data=spam,method="rf",trControl=ctrl,tuneGrid=grille.mtry)
stopCluster(cl)
We choose:
sel.mtry$bestTune
rf3 <- randomForest(type~.,data=spam,mtry=unlist(sel.mtry$bestTune),importance=TRUE)
Imp <- importance(rf3,type=1) %>% as.data.frame() %>% mutate(variable=names(spam)[-58]) %>% arrange(desc(MeanDecreaseAccuracy))
head(Imp)
ggplot(Imp) + aes(x=reorder(variable,MeanDecreaseAccuracy),y=MeanDecreaseAccuracy)+geom_bar(stat="identity")+coord_flip()+xlab("")+theme_classic()
library(ranger)
system.time(rf4 <- ranger(type~.,data=spam))
## user system elapsed
## 2.554 0.028 0.767
system.time(rf5 <- randomForest(type~.,data=spam))
## user system elapsed
## 8.620 0.036 8.771
Computation is faster with ranger. This package provides an efficient implementation of random forests for high-dimensional data; more information can be found here.
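Note that the ranger interface differs slightly from randomForest. A minimal sketch of variable importance and prediction with ranger (importance="impurity" requests impurity-based importance):
rf4 <- ranger(type~.,data=spam,importance="impurity")
head(sort(rf4$variable.importance,decreasing=TRUE))
pred <- predict(rf4,data=spam)$predictions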
We keep working with the spam dataset from the kernlab package.
library(gbm)
model_ada1 <- gbm(type~.,data=spam,distribution="adaboost",interaction.depth=2,shrinkage=0.05,n.trees=500) # this first call fails: type is a factor
For adaboost, the categorical response variable must be coded 0-1:
spam1 <- spam
spam1$type <- as.numeric(spam1$type)-1
set.seed(1234)
model_ada1 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,shrinkage=0.05,n.trees=500)
The gbm algorithm is a gradient descent that minimizes the loss function \[\frac{1}{n}\sum_{i=1}^n \ell(y_i,g(x_i)).\] For adaboost the exponential loss is used: \(\ell(y,g(x))=\exp(-yg(x))\).
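To make the loss concrete, a toy computation of the empirical exponential loss, assuming labels coded in \(\{-1,1\}\) as in the adaboost formulation:
y <- c(-1,1,1,-1)          # labels in {-1,1}
g <- c(-2,0.5,-0.3,-1.2)   # real-valued scores g(x_i)
mean(exp(-y*g))            # empirical exponential loss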
We print a summary of the model:
summary(model_ada1)
We obtain an indicator measuring the importance of each variable in the fitted model.
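The same indicator can be retrieved as a data frame, without the plot, via the plotit argument of summary.gbm:
head(summary(model_ada1,plotit=FALSE))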
model_ada2 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,bag.fraction=1,cv.folds = 5,n.trees=500)
gbm.perf(model_ada2)
## [1] 173
model_ada3 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,bag.fraction=1,cv.folds = 5,n.trees=500,shrinkage=0.05)
gbm.perf(model_ada3)
## [1] 440
model_ada4 <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,bag.fraction=1,cv.folds = 5,n.trees=500,shrinkage=0.5)
gbm.perf(model_ada4)
## [1] 31
The optimal number of iterations increases as shrinkage decreases. This makes sense, since this parameter controls the speed of the gradient descent: the larger it is, the faster the loss is minimized and the fewer iterations are needed. Care must nevertheless be taken not to choose it too small, so as to keep a stable estimator. Here, 0.05 seems to be a good value.
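A quick way to check this trade-off is to sweep a few shrinkage values and record the optimal number of iterations each time (a sketch; the fits can take a while):
for (nu in c(0.05,0.1,0.5)){
  m <- gbm(type~.,data=spam1,distribution="adaboost",interaction.depth=2,
           bag.fraction=1,cv.folds=5,n.trees=500,shrinkage=nu)
  cat("shrinkage =",nu,"-> optimal n.trees =",gbm.perf(m,plot.it=FALSE),"\n")
}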
Split the spam dataset into a training sample of size 3000 and a test sample containing the remaining observations. Using the training sample only, we will build a classification rule and a score with:
We split the data:
library(kernlab)
data(spam)
set.seed(123)
ind.app <- sample(nrow(spam),3000)
dapp <- spam %>% slice(ind.app)
dtest <- spam %>% slice(-ind.app)
library(rpart)
library(rpart.plot)
arbre <- rpart(type~.,data=dapp,cp=0.00001,minsplit=3)
plotcp(arbre)
cp_opt <- arbre$cptable[which.min(arbre$cptable[,"xerror"]),"CP"]
arbre_sel <- prune(arbre,cp=cp_opt)
rpart.plot(arbre_sel)
score <- data.frame(arbre=predict(arbre_sel,newdata=dtest,type="prob")[,2])
library(glmnet)
dapp1 <- model.matrix(type~.,data=dapp)[,-1]
Yapp1 <- as.factor(as.numeric(dapp$type)-1)
lasso.cv <- cv.glmnet(dapp1,Yapp1,alpha=1,family="binomial")
plot(lasso.cv)
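Note that predict() on a cv.glmnet object uses s="lambda.1se" by default; the value of lambda minimizing the CV error, and the corresponding coefficients, can be requested explicitly:
lasso.cv$lambda.min
coef(lasso.cv,s="lambda.min")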
dtest1 <- model.matrix(type~.,data=dtest)[,-1]
Ytest1 <- as.factor(as.numeric(dtest$type)-1)
score.lasso <- predict(lasso.cv,newx=dtest1,type="response") %>% unlist() %>% as.numeric()
score <- score %>% mutate(lasso=score.lasso)
C <- c(0.001,0.01,1,10,100,1000)
C <- c(1,10) ## reduced grid to keep the computation time reasonable
gr <- expand.grid(C=C)
ctrl <- trainControl(method="cv")
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
set.seed(12345)
svm.lin <- train(type~.,data=dapp,method="svmLinear",trControl=ctrl,tuneGrid=gr,prob.model=TRUE)
stopCluster(cl)
C <- c(0.001,0.01,1,100,1000)
sigma <- c(0.05,0.1,0.5,1,5)
gr <- expand.grid(C=C,sigma=sigma)
ctrl <- trainControl(method="cv")
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
set.seed(12345)
svm.rad <- train(type~.,data=dapp,method="svmRadial",trControl=ctrl,tuneGrid=gr,prob.model=TRUE)
stopCluster(cl)
score <- score %>% mutate(svm.lin=predict(svm.lin,newdata=dtest,type="prob")[,2],
svm.rad=predict(svm.rad,newdata=dtest,type="prob")[,2])
library(gbm)
dapp2 <- dapp
dtest2 <- dtest
dapp2$type <- as.numeric(dapp2$type)-1
dtest2$type <- as.numeric(dtest2$type)-1
ada <- gbm(type~.,data=dapp2,distribution="adaboost",interaction.depth=2,shrinkage=0.05,cv.folds=5,bag.fraction=1,n.trees=500)
Mopt.ada <- gbm.perf(ada,method="cv")
logit <- gbm(type~.,data=dapp2,distribution="bernoulli",interaction.depth=2,shrinkage=0.1,cv.folds=5,bag.fraction=1,n.trees=1000)
Mopt.logit <- gbm.perf(logit,method="cv")
score <- score %>% mutate(ada=predict(ada,newdata=dtest,n.trees=Mopt.ada,type="response"),
logit=predict(logit,newdata=dtest,n.trees=Mopt.logit,type="response"))
library(randomForest)
foret <- randomForest(type~.,data=dapp,xtest=dtest[,-ncol(dtest)],ytest=dtest[,ncol(dtest)],keep.forest=TRUE)
score <- score %>% mutate(foret=foret$test$vote[,2])
Comparison of the methods
We create a table containing all the information needed to compute the criteria.
score1 <- score %>% mutate(obs=dtest$type) %>% gather(key="Method",value="Score",-obs) %>%
mutate(Prev=recode(as.character(Score>0.5),"TRUE"="spam","FALSE"="nonspam"))
From this we obtain:
score1 %>% group_by(Method) %>% summarise(Err=mean(obs!=Prev)) %>% arrange(Err)
score1 %>% group_by(Method) %>% summarize(AUC=pROC::auc(obs,Score)) %>% arrange(desc(AUC))
library(plotROC)
ggplot(score1)+aes(d=obs,m=Score,color=Method)+geom_roc()+theme_classic()
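plotROC can also compute the AUC of each curve drawn above, provided the ggplot object is stored (a sketch using plotROC::calc_auc):
p <- ggplot(score1)+aes(d=obs,m=Score,color=Method)+geom_roc()
calc_auc(p)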