Dataset

Requirements

Feature selection

1. filter method

  • 각각의 변수들에 대해 통계적인 점수 부여
  • 부여된 점수를 바탕으로 변수의 순위를 매기고 변수 선택

2. wrapper method

  • 변수의 일부만을 모델링에 사용한 후 그 결과를 평가하는 작업을 반복하면서 변수 선택
  • 가장 좋은 성능을 보이는 subset을 뽑아내는 방법

forward selection

  • 변수가 없는 상태로 시작
  • 반복할 때마다 가장 중요한 변수 추가

backward elimination

  • 모든 변수를 가지고 시작
  • 가장 덜 중요한 변수를 하나씩 제거

stepwise selection

  • forward selection + backward elimination

3. embedded method

  • filter method와 wrapper method의 장점을 결합한 방법
  • 모델 학습(training) 과정 자체에 변수 선택이 포함되어, 어떤 변수가 가장 크게 기여하는지 찾아내는 방법
  • 과적합(overfitting)을 줄이기 위해 내부적으로 규제를 가하는 방식 사용
# Packages: klaR (naive Bayes backend for caret's "nb"), MASS (stepwise
# selection support), caret (train / partition / confusionMatrix),
# randomForest (backend for caret's "rf")
library(klaR)
library(MASS)
library(caret)
library(randomForest)

# Load the radiomics dataset and drop its second column.
# NOTE(review): data[-2] uses list indexing on a data.frame, i.e. it
# removes column 2 — presumably an ID/unused column; confirm against
# the CSV header.
data <- read.csv("radiomics_fundus_ds.csv")
data <- data[-2]

# Upper scope for stepwise search: model with every candidate feature
full <- lm(cl ~ ., data = data)
# Lower scope for stepwise search: intercept-only model (no features)
null <- lm(cl ~ 1, data = data)

# Forward selection: start from the intercept-only model and, at each
# step, add the variable that most improves the AIC, up to the full model
forward <- step(null, direction="forward",
                scope=list(lower=null, upper=full))
# Backward elimination: start from the full model and drop the least
# useful variable at each step
backward <- step(full, direction="backward")
# Stepwise selection: both forward and backward moves are allowed at
# each step (lower bound defaults to the intercept-only model)
stepwise <- step(null, direction="both",
                 scope=list(upper=full))
  • Check the selected features
# Selected feature names = coefficient names minus "(Intercept)".
# names(coef(m))[-1] is the idiomatic replacement for the original
# rownames(data.frame(m[["coefficients"]][-1])) round-trip, and yields
# exactly the same character vector.
forward_col <- names(coef(forward))[-1]
backward_col <- names(coef(backward))[-1]
stepwise_col <- names(coef(stepwise))[-1]

forward_col
##  [1] "glcm_inverseVariance"   "glcm_mean"              "calc_meanDeviation"    
##  [4] "glrlm_SRHGLE"           "glcm_cShade"            "glcm_cProminence"      
##  [7] "glszm_IV"               "glrlm_SRE"              "glszm_HISAE"           
## [10] "glszm_HIE"              "glcm_differenceEntropy" "glszm_ZP"              
## [13] "glrlm_LGLRE"
backward_col
##  [1] "calc_energy"          "calc_entropy"         "calc_skewness"       
##  [4] "calc_min"             "calc_variance"        "calc_RMS"            
##  [7] "calc_sd"              "glcm_variance"        "glcm_cShade"         
## [10] "glcm_cTendency"       "glcm_correlation"     "glcm_dissimilarity"  
## [13] "glcm_homogeneity1"    "glcm_homogeneity2"    "glcm_inverseVariance"
## [16] "glrlm_GLN"            "glrlm_LRE"            "glrlm_LRLGLE"        
## [19] "glrlm_LGLRE"          "glrlm_SRE"            "glszm_SAE"           
## [22] "glszm_SZV"            "glszm_ZP"             "glszm_HIE"           
## [25] "glszm_LISAE"          "glszm_HISAE"          "glszm_LILAE"         
## [28] "glszm_HILAE"
stepwise_col
##  [1] "glcm_inverseVariance" "calc_meanDeviation"   "glcm_cShade"         
##  [4] "glcm_cProminence"     "glszm_IV"             "glrlm_SRE"           
##  [7] "glszm_HISAE"          "glszm_HIE"            "glcm_IDN"            
## [10] "glszm_ZP"             "glrlm_LGLRE"          "glszm_LAE"
# Convert the response to a factor so caret treats this as classification.
# NOTE(review): assumes the class label `cl` is column 1 of the frame
# (after dropping column 2 above) — confirm against the CSV layout.
data[, 1] <- as.factor(data[, 1])

# Fix the RNG so the 70/30 split below is reproducible
set.seed(100)
# Stratified split on the class label: 70% train, 30% test
indexTrain <- createDataPartition(data$cl, p=.7, list=FALSE)
train_dt <- data[indexTrain, ]
test_dt <- data[-indexTrain, ]

# Resampling scheme for caret::train: 5-fold CV repeated 5 times
fitControl <- trainControl(method="repeatedcv", number=5, repeats=5)

# Train a caret model on the forward-selected feature subset and score
# it on the held-out rows.
#
# NOTE(review): this definition overwrites the lm object `forward`
# produced by step() earlier; the selected names were already saved in
# `forward_col`, so nothing is lost, but the name reuse is confusing.
#
# x    : caret method string, e.g. "nb", "rf", "svmLinear"
# cols : feature columns to model on (default: the forward-selection
#        result). Added as a backward-compatible parameter so other
#        subsets can be evaluated without copying this function.
# Returns: caret::confusionMatrix of test-set predictions vs. labels.
forward <- function(x, cols = forward_col) {
    # Keep only the response plus the chosen feature columns
    train_dt <- data[indexTrain, c("cl", cols)]
    test_dt <- data[-indexTrain, c("cl", cols)]
    
    model <- train(cl~., data=train_dt, method=x, trControl=fitControl, verbose=FALSE)
    pred <- predict(model, test_dt)
    # `cm` avoids shadowing base::matrix as the original local did
    cm <- confusionMatrix(pred, test_dt$cl)
    
    return(cm)
}

# Train a caret model on the backward-elimination feature subset and
# score it on the held-out rows.
#
# NOTE(review): this definition overwrites the lm object `backward`
# produced by step() earlier; the selected names were already saved in
# `backward_col`, so nothing is lost, but the name reuse is confusing.
#
# x    : caret method string, e.g. "nb", "rf", "svmLinear"
# cols : feature columns to model on (default: the backward-elimination
#        result). Added as a backward-compatible parameter so other
#        subsets can be evaluated without copying this function.
# Returns: caret::confusionMatrix of test-set predictions vs. labels.
backward <- function(x, cols = backward_col) {
    # Keep only the response plus the chosen feature columns
    train_dt <- data[indexTrain, c("cl", cols)]
    test_dt <- data[-indexTrain, c("cl", cols)]
    
    model <- train(cl~., data=train_dt, method=x, trControl=fitControl, verbose=FALSE)
    pred <- predict(model, test_dt)
    # `cm` avoids shadowing base::matrix as the original local did
    cm <- confusionMatrix(pred, test_dt$cl)
    
    return(cm)
}

# Train a caret model on the stepwise-selected feature subset and score
# it on the held-out rows.
#
# NOTE(review): this definition overwrites the lm object `stepwise`
# produced by step() earlier; the selected names were already saved in
# `stepwise_col`, so nothing is lost, but the name reuse is confusing.
#
# x    : caret method string, e.g. "nb", "rf", "svmLinear"
# cols : feature columns to model on (default: the stepwise-selection
#        result). Added as a backward-compatible parameter so other
#        subsets can be evaluated without copying this function.
# Returns: caret::confusionMatrix of test-set predictions vs. labels.
stepwise <- function(x, cols = stepwise_col) {
    # Keep only the response plus the chosen feature columns
    train_dt <- data[indexTrain, c("cl", cols)]
    test_dt <- data[-indexTrain, c("cl", cols)]
    
    model <- train(cl~., data=train_dt, method=x, trControl=fitControl, verbose=FALSE)
    pred <- predict(model, test_dt)
    # `cm` avoids shadowing base::matrix as the original local did
    cm <- confusionMatrix(pred, test_dt$cl)
    
    return(cm)
}
  • Naive Bayes
# Evaluate each feature subset with naive Bayes (caret method "nb");
# the ## lines below are the recorded console output for these calls
forward_mat <- forward("nb")
forward_mat$overall["Accuracy"]
##  Accuracy 
## 0.6190476
backward_mat <- backward("nb")
backward_mat$overall["Accuracy"]
##  Accuracy 
## 0.6369048
stepwise_mat <- stepwise("nb")
stepwise_mat$overall["Accuracy"]
##  Accuracy 
## 0.6309524
# Collect the three accuracies into a 1x3 comparison row
result_nb <- cbind(forward_mat$overall["Accuracy"], 
                   backward_mat$overall["Accuracy"], 
                   stepwise_mat$overall["Accuracy"])
colnames(result_nb) <- c("forward", "backward", "stepwise")
result_nb
##            forward  backward  stepwise
## Accuracy 0.6190476 0.6369048 0.6309524
  • Random Forest
# Evaluate each feature subset with random forest (caret method "rf");
# the ## lines below are the recorded console output for these calls
forward_mat <- forward("rf")
forward_mat$overall["Accuracy"]
##  Accuracy 
## 0.6488095
backward_mat <- backward("rf")
backward_mat$overall["Accuracy"]
##  Accuracy 
## 0.6428571
stepwise_mat <- stepwise("rf")
stepwise_mat$overall["Accuracy"]
##  Accuracy 
## 0.6428571
# Collect the three accuracies into a 1x3 comparison row
result_rf <- cbind(forward_mat$overall["Accuracy"], 
                   backward_mat$overall["Accuracy"], 
                   stepwise_mat$overall["Accuracy"])
colnames(result_rf) <- c("forward", "backward", "stepwise")
result_rf
##            forward  backward  stepwise
## Accuracy 0.6488095 0.6428571 0.6428571
  • Support Vector Machine
# Evaluate each feature subset with a linear SVM (caret "svmLinear");
# the ## lines below are the recorded console output for these calls
forward_mat <- forward("svmLinear")
forward_mat$overall["Accuracy"]
##  Accuracy 
## 0.7142857
backward_mat <- backward("svmLinear")
backward_mat$overall["Accuracy"]
##  Accuracy 
## 0.7083333
stepwise_mat <- stepwise("svmLinear")
stepwise_mat$overall["Accuracy"]
##  Accuracy 
## 0.6964286
# Collect the three accuracies into a 1x3 comparison row
result_svm <- cbind(forward_mat$overall["Accuracy"], 
                    backward_mat$overall["Accuracy"], 
                    stepwise_mat$overall["Accuracy"])
colnames(result_svm) <- c("forward", "backward", "stepwise")
result_svm
##            forward  backward  stepwise
## Accuracy 0.7142857 0.7083333 0.6964286

Result

# Label each 1x3 accuracy row by classifier, then stack into the final
# comparison table (rows: classifiers; columns: selection strategies)
rownames(result_nb) <- "Naive Bayes"
rownames(result_rf) <- "Random Forest"
rownames(result_svm) <- "SVM"

rbind(result_nb, result_rf, result_svm)
##                 forward  backward  stepwise
## Naive Bayes   0.6190476 0.6369048 0.6309524
## Random Forest 0.6488095 0.6428571 0.6428571
## SVM           0.7142857 0.7083333 0.6964286