3. embedded method
- filter method + wrapper method
- 두 방법을 결합하여 어떤 변수가 가장 크게 기여하는지 찾아내는 방법
- 과적합(overfitting)을 줄이기 위해 내부적으로 규제를 가하는 방식 사용
library(klaR)
library(MASS)
library(caret)
library(randomForest)
data <- read.csv("radiomics_fundus_ds.csv")
data <- data[-2]
# 모든 feature를 갖는 변수
full <- lm(cl~., data)
# feature가 없는 변수
null <- lm(cl~1, data)
# Forward selection
forward <- step(null, direction="forward",
scope=list(lower=null, upper=full))
# Backward elimination
backward <- step(full, direction="backward")
# Stepwise selection
stepwise <- step(null, direction="both",
scope=list(upper=full))
- check the selected features
forward_col <- rownames(data.frame(forward[["coefficients"]][-1]))
backward_col <- rownames(data.frame(backward[["coefficients"]][-1]))
stepwise_col <- rownames(data.frame(stepwise[["coefficients"]][-1]))
forward_col
## [1] "glcm_inverseVariance" "glcm_mean" "calc_meanDeviation"
## [4] "glrlm_SRHGLE" "glcm_cShade" "glcm_cProminence"
## [7] "glszm_IV" "glrlm_SRE" "glszm_HISAE"
## [10] "glszm_HIE" "glcm_differenceEntropy" "glszm_ZP"
## [13] "glrlm_LGLRE"
backward_col
## [1] "calc_energy" "calc_entropy" "calc_skewness"
## [4] "calc_min" "calc_variance" "calc_RMS"
## [7] "calc_sd" "glcm_variance" "glcm_cShade"
## [10] "glcm_cTendency" "glcm_correlation" "glcm_dissimilarity"
## [13] "glcm_homogeneity1" "glcm_homogeneity2" "glcm_inverseVariance"
## [16] "glrlm_GLN" "glrlm_LRE" "glrlm_LRLGLE"
## [19] "glrlm_LGLRE" "glrlm_SRE" "glszm_SAE"
## [22] "glszm_SZV" "glszm_ZP" "glszm_HIE"
## [25] "glszm_LISAE" "glszm_HISAE" "glszm_LILAE"
## [28] "glszm_HILAE"
stepwise_col
## [1] "glcm_inverseVariance" "calc_meanDeviation" "glcm_cShade"
## [4] "glcm_cProminence" "glszm_IV" "glrlm_SRE"
## [7] "glszm_HISAE" "glszm_HIE" "glcm_IDN"
## [10] "glszm_ZP" "glrlm_LGLRE" "glszm_LAE"
data[, 1] <- as.factor(data[, 1])
set.seed(100)
indexTrain <- createDataPartition(data$cl, p=.7, list=FALSE)
train_dt <- data[indexTrain, ]
test_dt <- data[-indexTrain, ]
fitControl <- trainControl(method="repeatedcv", number=5, repeats=5)
forward <- function(x) {
train_dt <- data[indexTrain, c("cl", forward_col)]
test_dt <- data[-indexTrain, c("cl", forward_col)]
model <- train(cl~., data=train_dt, method=x, trControl=fitControl, verbose=FALSE)
pred <- predict(model, test_dt)
matrix <- confusionMatrix(pred, test_dt$cl)
return(matrix)
}
backward <- function(x) {
train_dt <- data[indexTrain, c("cl", backward_col)]
test_dt <- data[-indexTrain, c("cl", backward_col)]
model <- train(cl~., data=train_dt, method=x, trControl=fitControl, verbose=FALSE)
pred <- predict(model, test_dt)
matrix <- confusionMatrix(pred, test_dt$cl)
return(matrix)
}
stepwise <- function(x) {
train_dt <- data[indexTrain, c("cl", stepwise_col)]
test_dt <- data[-indexTrain, c("cl", stepwise_col)]
model <- train(cl~., data=train_dt, method=x, trControl=fitControl, verbose=FALSE)
pred <- predict(model, test_dt)
matrix <- confusionMatrix(pred, test_dt$cl)
return(matrix)
}
forward_mat <- forward("nb")
forward_mat$overall["Accuracy"]
## Accuracy
## 0.6190476
backward_mat <- backward("nb")
backward_mat$overall["Accuracy"]
## Accuracy
## 0.6369048
stepwise_mat <- stepwise("nb")
stepwise_mat$overall["Accuracy"]
## Accuracy
## 0.6309524
result_nb <- cbind(forward_mat$overall["Accuracy"],
backward_mat$overall["Accuracy"],
stepwise_mat$overall["Accuracy"])
colnames(result_nb) <- c("forward", "backward", "stepwise")
result_nb
## forward backward stepwise
## Accuracy 0.6190476 0.6369048 0.6309524
forward_mat <- forward("rf")
forward_mat$overall["Accuracy"]
## Accuracy
## 0.6488095
backward_mat <- backward("rf")
backward_mat$overall["Accuracy"]
## Accuracy
## 0.6428571
stepwise_mat <- stepwise("rf")
stepwise_mat$overall["Accuracy"]
## Accuracy
## 0.6428571
result_rf <- cbind(forward_mat$overall["Accuracy"],
backward_mat$overall["Accuracy"],
stepwise_mat$overall["Accuracy"])
colnames(result_rf) <- c("forward", "backward", "stepwise")
result_rf
## forward backward stepwise
## Accuracy 0.6488095 0.6428571 0.6428571
forward_mat <- forward("svmLinear")
forward_mat$overall["Accuracy"]
## Accuracy
## 0.7142857
backward_mat <- backward("svmLinear")
backward_mat$overall["Accuracy"]
## Accuracy
## 0.7083333
stepwise_mat <- stepwise("svmLinear")
stepwise_mat$overall["Accuracy"]
## Accuracy
## 0.6964286
result_svm <- cbind(forward_mat$overall["Accuracy"],
backward_mat$overall["Accuracy"],
stepwise_mat$overall["Accuracy"])
colnames(result_svm) <- c("forward", "backward", "stepwise")
result_svm
## forward backward stepwise
## Accuracy 0.7142857 0.7083333 0.6964286