notebooks/ccn2019-correct.R at 8101323353b52ed1e9221d5b27b06303bf97fb05

Fork: 0
morteza / notebooks
Find file
Newer
Older
notebooks / ccn2019-correct.R
Morteza Ansarinia on 28 May 2019 3 KB init a new dummy var analysis for ccn2019
Raw Blame History
#==================================================#
# model the "correct" column

library(here)
library(tidyverse)
library(caret)
library(inspectdf)
library(pls)

#devtools::install_github("sachsmc/plotROC")
library(plotROC)


# Load the saved n-back data; the code below presumably relies on this file
# providing an object named `seqs` (it is not created anywhere else here).
load(here("notebooks/data/nback_seqs.Rd"))

# Fix the RNG state before the partitioning and resampling steps that follow.
set.seed(42)

# Drop every row where `correct` or `rt` is missing, then recode the outcome
# as a factor whose levels are labelled "INCORRECT" and "CORRECT" (in the
# sorted order of the original values).
seqs.imputed <- mutate(
  filter(seqs, !(is.na(correct) | is.na(rt))),
  correct = factor(correct, labels = c("INCORRECT", "CORRECT"))
)

# Overview of the categorical and numeric columns of the cleaned data.
inspect_cat(seqs.imputed)
inspect_num(seqs.imputed)

# Dummy-variable expansion of all columns of the cleaned data.
# NOTE(review): `seqs.dummy` is not referenced again in this script --
# confirm whether it is still needed.
seqs.dummy <- predict(dummyVars(~ ., data = seqs.imputed), seqs.imputed)


# Split the cleaned data 70/30 into training and test sets, partitioning on
# the outcome column `correct`.
train_indexes <- createDataPartition(seqs.imputed$correct,
                                     times = 1,
                                     p = 0.7,
                                     list = FALSE)

# With `list = FALSE`, createDataPartition() returns a one-column matrix of
# row numbers. Take that column as a plain integer vector before subsetting:
# base data frames accept either form, but tibbles (recent versions) reject
# matrix row subscripts, and `seqs.imputed` may be a tibble after the dplyr
# pipeline.
train_data <- seqs.imputed[train_indexes[, 1], ]
test_data <- seqs.imputed[-train_indexes[, 1], ]

# Shared resampling configuration for both PLS models below.
# `TRUE` is spelled out (instead of `T`) because `T` is an ordinary variable
# that can be reassigned, whereas `TRUE` is a reserved word.
control <- trainControl(
  method = "repeatedcv",          # repeated k-fold cross-validation
  number = 5,                     # 5 folds ...
  repeats = 2,                    # ... repeated twice
  classProbs = TRUE,              # needed for type = "prob" predictions / ROC
  verboseIter = TRUE,             # print training progress
  savePredictions = TRUE,         # keep hold-out predictions of each resample
  sampling = "down",              # down-sample to balance the outcome classes
  selectionFunction = "oneSE"     # pick tuning value via the one-SE rule
)

# "New" model: PLS classifier for `correct` using every column of the
# training data as a predictor except a, al, dp, cr and rt.
# Predictors are filtered for zero variance, then centered and scaled; up to
# 20 candidate tuning values are evaluated under the shared `control` setup.
pls.new_model <- train(
  correct ~ . - a - al - dp - cr - rt,
  data = train_data,
  method = "pls",
  preProcess = c("zv", "center", "scale"),
  tuneLength = 20,
  metric = "Accuracy",
  trControl = control
)

# Tuning profile of the fitted model.
plot(pls.new_model)

# Variable-importance plot for the fitted model.
plot(
  varImp(pls.new_model),
  main = "Variables Importance for Correctness (New Model)"
)

# "Common" model: same PLS setup as the new model, but additionally
# excluding tl, ul, sl, s, ll, vl and l from the predictors (on top of
# a, al, dp, cr and rt).
pls.common_model <- train(
  correct ~ . - a - al - dp - cr - rt - tl - ul - sl - s - ll - vl - l,
  data = train_data,
  method = "pls",
  preProcess = c("zv", "center", "scale"),
  tuneLength = 20,
  metric = "Accuracy",
  trControl = control
)

# Tuning profile of the fitted model.
plot(pls.common_model)

# Variable-importance plot for the fitted model.
plot(
  varImp(pls.common_model),
  main = "Variable Importance for Correctness (Common Model)"
)


# Disabled: density plots of the resampling results for each model.
#trellis.par.set(caretTheme())
#densityplot(pls.new_model, pch = "|")
#densityplot(pls.common_model, pch = "|")

# Gather the resampling results of both models into one object so their
# performance can be compared side by side.
pls.models <- resamples(
  list(
    new = pls.new_model,
    common = pls.common_model
  )
)
#DEBUG summary(pls.models)
#DEBUG dotplot(pls.models)
#DEBUG diffValues <- diff(pls.models)

# Box-and-whisker plot of resampled accuracy, one panel.
bwplot(
  pls.models,
  metric = "Accuracy",
  layout = c(1, 1),
  main = "Correctness Model Performance"
)


# Predict on the held-out test set with both models: class labels
# (type = "raw") and class probabilities (type = "prob").
pls.new_predicted <- predict(
  pls.new_model,
  newdata = test_data,
  type = "raw"
)
pls.new_predicted_prob <- predict(
  pls.new_model,
  newdata = test_data,
  type = "prob"
)
pls.common_predicted <- predict(
  pls.common_model,
  newdata = test_data,
  type = "raw"
)
pls.common_predicted_prob <- predict(
  pls.common_model,
  newdata = test_data,
  type = "prob"
)

# Confusion matrices of predicted vs. observed labels on the test set.
# NOTE(review): no `positive` class is given, so caret's default applies --
# confirm that this is the intended reference class for the reported metrics.
confusionMatrix(data = pls.new_predicted, reference = test_data$correct)
confusionMatrix(data = pls.common_predicted, reference = test_data$correct)

library(pROC)

# Use a square plotting region for the ROC plot.
par(pty = "s")

# ROC curve of the common model on the test set, using the predicted
# probability of the "CORRECT" class as the score. This call opens the plot
# (dark grey, lty = 3); the new model's curve is added on top afterwards.
# `TRUE` is spelled out (instead of `T`) because `T` can be reassigned.
roc(test_data$correct,
    pls.common_predicted_prob$CORRECT,
    plot = TRUE,
    legacy.axes = TRUE,
    lwd = 2,
    col = "darkgrey",
    lty = 3,
    print.auc = TRUE,      # print the AUC on the plot ...
    print.auc.y = 45,      # ... at these coordinates
    print.auc.x = 55,
    percent = TRUE,        # report values in percent
    ci = TRUE,             # also compute a confidence interval
    boot.n = 100           # forwarded to the CI computation
    )



# Disabled experiment: ROC curves over 10 repeated 90% sub-samples of the
# test set. The loop is commented out (and its plot.roc() call is left
# incomplete), so only the single roc() call further down is executed.
# roc_test_indices <- createDataPartition(test_data$correct,
#                                      times = 10,
#                                      p = 0.9,
#                                      list = F)

#for (i in 1:ncol(roc_test_indices)) {
#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
  
#  plot.roc(test_sample_correct,
#           predprob_sample_correct,

# ROC curve of the new model on the full test set, using the predicted
# probability of the "CORRECT" class as the score. `add = TRUE` draws it
# (black, lty = 1) onto the plot already opened for the common model.
# `TRUE` is spelled out (instead of `T`) because `T` can be reassigned.
roc(test_data$correct,
    pls.new_predicted_prob$CORRECT,
    legacy.axes = TRUE,
    plot = TRUE,
    lwd = 2,
    col = "black",
    print.auc = TRUE,      # print the AUC on the plot ...
    percent = TRUE,        # report values in percent
    print.auc.y = 40,      # ... just below the common model's AUC label
    print.auc.x = 55,
    lty = 1,
    add = TRUE,
    of = "se",             # NOTE(review): forwarded to the CI computation;
                           # presumably requests a sensitivity CI -- confirm
    boot.n = 100,
    ci = TRUE)

#}

# Legend for the two ROC curves. Line types must match how the curves were
# drawn: the new model uses lty = 1 (black) and the common model uses
# lty = 3 (dark grey), so the legend lists c(1, 3) rather than two solid lines.
legend(100, 100,
       legend = c("New Model", "Common Model"),
       col = c("black", "darkgray"),
       lty = c(1, 3),
       lwd = 2,
       cex = 0.9)

# requires plotROC package
#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
#DEBUG   geom_roc()