Machine Learning - caret R package

2018/05/28

Machine learning keeps coming up everywhere, but even as an R user I still don't have a good grasp of the concepts, so I'm summarizing them here.

Gitbook

Gitbook

I referred to the two links above.

library(tidyverse) # for tidy tools (the pipe, tibble, etc.)
## ── Attaching packages ──────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.5     
## ✔ tidyr   0.8.1          ✔ stringr 1.3.1     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::vars()   masks ggplot2::vars()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(1234) # for reproducibility

data(Sonar, package = "mlbench") # 208 sonar returns: 60 numeric predictors, Class = M (mine) or R (rock)
Sonar <- Sonar %>% as_tibble()   # tbl_df() is superseded; as_tibble() is the current equivalent
Sonar
## # A tibble: 208 x 61
##        V1     V2     V3     V4     V5     V6     V7     V8     V9    V10
##  *  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 0.02   0.0371 0.0428 0.0207 0.0954 0.0986 0.154  0.160  0.311  0.211 
##  2 0.0453 0.0523 0.0843 0.0689 0.118  0.258  0.216  0.348  0.334  0.287 
##  3 0.0262 0.0582 0.110  0.108  0.0974 0.228  0.243  0.377  0.560  0.619 
##  4 0.01   0.0171 0.0623 0.0205 0.0205 0.0368 0.110  0.128  0.0598 0.126 
##  5 0.0762 0.0666 0.0481 0.0394 0.059  0.0649 0.121  0.247  0.356  0.446 
##  6 0.0286 0.0453 0.0277 0.0174 0.0384 0.099  0.120  0.183  0.210  0.304 
##  7 0.0317 0.0956 0.132  0.141  0.167  0.171  0.0731 0.140  0.208  0.351 
##  8 0.0519 0.0548 0.0842 0.0319 0.116  0.0922 0.103  0.0613 0.146  0.284 
##  9 0.0223 0.0375 0.0484 0.0475 0.0647 0.0591 0.0753 0.0098 0.0684 0.149 
## 10 0.0164 0.0173 0.0347 0.007  0.0187 0.0671 0.106  0.0697 0.0962 0.0251
## # ... with 198 more rows, and 51 more variables: V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <dbl>, V57 <dbl>, V58 <dbl>, V59 <dbl>, V60 <dbl>,
## #   Class <fct>
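Sonar is a two-class problem (M = mine, R = rock). Before splitting, it can help to glance at the class balance; a quick check that was not part of the original post:

Sonar %>% count(Class) # number of rows per class

Now split the data 70/30 into training and test sets.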
indexTrain <- sample(1:nrow(Sonar), round(nrow(Sonar) * .7)) # random 70% of the row indices
training <- Sonar[ indexTrain, ]  # 70% for training (146 rows)
testing  <- Sonar[-indexTrain, ]  # remaining 30% for testing (62 rows)
training
## # A tibble: 146 x 61
##        V1     V2     V3     V4     V5     V6     V7     V8     V9    V10
##     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 0.0115 0.015  0.0136 0.0076 0.0211 0.106  0.102  0.044  0.0931 0.0734
##  2 0.0374 0.0586 0.0628 0.0534 0.0255 0.142  0.207  0.273  0.307  0.260 
##  3 0.0228 0.0853 0.1    0.0428 0.112  0.165  0.160  0.212  0.330  0.352 
##  4 0.0209 0.0261 0.012  0.0768 0.106  0.168  0.302  0.346  0.331  0.412 
##  5 0.0294 0.0123 0.0117 0.0113 0.0497 0.0998 0.133  0.112  0.298  0.347 
##  6 0.137  0.123  0.138  0.148  0.178  0.143  0.177  0.216  0.163  0.207 
##  7 0.0453 0.0523 0.0843 0.0689 0.118  0.258  0.216  0.348  0.334  0.287 
##  8 0.0308 0.0339 0.0202 0.0889 0.157  0.175  0.092  0.135  0.159  0.280 
##  9 0.079  0.0707 0.0352 0.166  0.133  0.0226 0.0771 0.268  0.566  0.661 
## 10 0.0587 0.121  0.127  0.150  0.144  0.0561 0.0832 0.0672 0.137  0.235 
## # ... with 136 more rows, and 51 more variables: V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <dbl>, V57 <dbl>, V58 <dbl>, V59 <dbl>, V60 <dbl>,
## #   Class <fct>
testing
## # A tibble: 62 x 61
##        V1     V2      V3     V4     V5     V6     V7     V8     V9    V10
##     <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 0.02   0.0371 0.0428  0.0207 0.0954 0.0986 0.154  0.160  0.311  0.211 
##  2 0.0762 0.0666 0.0481  0.0394 0.059  0.0649 0.121  0.247  0.356  0.446 
##  3 0.0286 0.0453 0.0277  0.0174 0.0384 0.099  0.120  0.183  0.210  0.304 
##  4 0.0039 0.0063 0.0152  0.0336 0.031  0.0284 0.0396 0.0272 0.0323 0.0452
##  5 0.0124 0.0433 0.0604  0.0449 0.0597 0.0355 0.0531 0.0343 0.105  0.212 
##  6 0.027  0.0092 0.0145  0.0278 0.0412 0.0757 0.103  0.114  0.0794 0.152 
##  7 0.0177 0.03   0.0288  0.0394 0.063  0.0526 0.0688 0.0633 0.0624 0.0613
##  8 0.01   0.0275 0.019   0.0371 0.0416 0.0201 0.0314 0.0651 0.190  0.267 
##  9 0.024  0.0218 0.0324  0.0569 0.033  0.0513 0.0897 0.0713 0.0569 0.0389
## 10 0.0195 0.0213 0.00580 0.019  0.0319 0.0571 0.100  0.0668 0.0691 0.0242
## # ... with 52 more rows, and 51 more variables: V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <dbl>, V57 <dbl>, V58 <dbl>, V59 <dbl>, V60 <dbl>,
## #   Class <fct>
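One caveat with a plain sample() split: it does not guarantee that the M/R proportions are preserved in both sets. caret's createDataPartition() samples within each class; a minimal sketch of the same 70/30 split, where inTrain, training2, and testing2 are hypothetical names:

inTrain   <- createDataPartition(Sonar$Class, p = .7, list = FALSE) # stratified on Class
training2 <- Sonar[ inTrain, ]
testing2  <- Sonar[-inTrain, ]
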
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5) # 10-fold CV, repeated 5 times
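
With this control object each candidate model is fit and scored 50 times (10 folds × 5 repeats). If a ROC-based selection metric is preferred over plain accuracy, the control object could be extended; a sketch, with fitControlROC as a hypothetical name:

fitControlROC <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
                              classProbs = TRUE,                 # keep class probabilities
                              summaryFunction = twoClassSummary) # report ROC/Sens/Spec
# it would then be passed as train(..., trControl = fitControlROC, metric = "ROC")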

# Try training with random forest
rf_fit <- train(Class ~ ., data = training, method = "rf", trControl = fitControl, verbose = FALSE)
rf_fit
## Random Forest 
## 
## 146 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 132, 132, 131, 131, 130, 132, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8197381  0.6378550
##   31    0.7972381  0.5938628
##   60    0.7805714  0.5598370
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
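
By default train() tried three mtry values (2, 31, 60 here; a custom grid could be supplied via tuneGrid = expand.grid(mtry = ...)) and kept mtry = 2. The natural next step, not shown in the original output, is to evaluate the final model on the held-out test set; a minimal sketch:

rf_pred <- predict(rf_fit, newdata = testing) # predicted classes for the 62 test rows
confusionMatrix(rf_pred, testing$Class)       # accuracy, kappa, sensitivity, specificity, ...

confusionMatrix() lets the cross-validated accuracy above be checked against data the model never saw during training or tuning.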