# Newer
# Older
# notebooks / ccn2019-accuracy.R
#==================================================#
# model the "accuracy" column (a for global, and al for local accuracy)

library(here)
library(tidyverse)
library(caret)
library(inspectdf)
library(ROSE)

# Load the saved n-back sequence data. The statements below use an object
# called `seqs`, which is presumably restored from this file.
load(here("notebooks/data/nback_seqs.Rd"))

# Seed the RNG once so the random steps further down are repeatable.
set.seed(42)

# Drop rows with a missing `correct` or `rt`, then recode `correct` as a
# factor labelled INCORRECT / CORRECT.
seqs.imputed <- mutate(
  filter(seqs, !is.na(correct), !is.na(rt)),
  correct = factor(correct, labels = c("INCORRECT", "CORRECT"))
)

# Summarise the numeric columns of the cleaned data.
inspect_num(seqs.imputed)

# Dummy-encode every column of the cleaned data.
seqs.dummy <- predict(
  dummyVars(~ ., data = seqs.imputed),
  seqs.imputed
)


# Split the cleaned data 70/30 into training and test rows, partitioning on
# the `correct` factor. `list = FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
train_indexes <- createDataPartition(
  seqs.imputed$correct,
  times = 1,
  p = 0.7,
  list = FALSE
)

train_data <- seqs.imputed[train_indexes, ]
test_data <- seqs.imputed[-train_indexes, ]

# Resample the training rows with ROSE, using `correct` as the class label.
# NOTE(review): ROSE is normally used to balance classes, so the name
# `train_data.imbalanced` looks misleading -- confirm before relying on it.
train_data.imbalanced <- ROSE(
  correct ~ .,
  data = train_data,
  seed = 1
)$data

# Resampling scheme shared by both models: 5-fold cross-validation repeated
# twice, with progress logging. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
control <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  verboseIter = TRUE
)

# "New" model: PLS regression of `a` on every column except al, dp, cr, rt
# and correct, fitted on the ROSE-resampled training data with centred and
# scaled predictors.
pls.new_model <- train(
  a ~ . - al - dp - cr - rt - correct,
  data = train_data.imbalanced,
  method = "pls",
  preProcess = c("center", "scale"),
  trControl = control
)

# Variable-importance plot for the new model.
plot(
  varImp(pls.new_model),
  main = "Variable Importance for Accuracy"
)


# "Old" (baseline) model: PLS regression of `a` on t, n and v only, fitted
# on the plain training split with the same preprocessing and resampling
# settings as the new model.
pls.old_model <- train(
  a ~ t + n + v,
  data = train_data,
  method = "pls",
  preProcess = c("center", "scale"),
  trControl = control
)


# Print both fitted models and plot the baseline's variable importance.
pls.old_model
pls.new_model
plot(varImp(pls.old_model))

# Density plots of the resampled performance for each model, using caret's
# lattice theme.
trellis.par.set(caretTheme())
densityplot(pls.new_model, pch = "|")
densityplot(pls.old_model, pch = "|")

# Collect the resampling results of both models and compare them.
# NOTE(review): the two models were trained on different data (ROSE output
# vs. the plain split), so the paired differences from diff() may not be
# valid -- confirm.
resamps <- resamples(
  list(
    old = pls.old_model,
    new = pls.new_model
  )
)
summary(resamps)
dotplot(resamps, metric = "Rsquared")

difValues <- diff(resamps)
bwplot(difValues, layout = c(1, 3))


# Test-set predictions from both models.
pls.old_predicted <- predict(pls.old_model, test_data, type = "raw")
pls.new_predicted <- predict(pls.new_model, test_data, type = "raw")

# Predictions on the plain training split from both models.
pls.old_train_predicted <- predict(pls.old_model, train_data, type = "raw")
pls.new_train_predicted <- predict(pls.new_model, train_data, type = "raw")


# Summary of the new model.
summary(pls.new_model)


# SSE, R-squared and RMSE on the test set, for both models

# Compute test-set error metrics for one vector of predictions.
#   observed      - observed outcome values on the test rows
#   predicted     - model predictions for the same rows
#   baseline_mean - mean of the outcome in the TRAINING data; the total sum
#                   of squares is measured against this value
# Returns a one-row data frame with columns SSE, R_square and RMSE.
holdout_metrics <- function(observed, predicted, baseline_mean) {
  stopifnot(length(observed) == length(predicted))
  sse <- sum((observed - predicted)^2)    # sum of squared errors
  sst <- sum((observed - baseline_mean)^2) # total sum of squares
  data.frame(
    SSE = sse,
    R_square = 1 - sse / sst,
    RMSE = sqrt(sse / length(predicted))
  )
}

# total sum of squares, remember to use training data here
SST <- sum((test_data$a - mean(train_data$a))^2)

# One row per model, so the new model's metrics are no longer overwritten
# by the old model's.
pls.test_metrics <- rbind(
  new = holdout_metrics(test_data$a, pls.new_predicted, mean(train_data$a)),
  old = holdout_metrics(test_data$a, pls.old_predicted, mean(train_data$a))
)

# The original script left these three variables holding the OLD model's
# values; keep that end state for any code that reads them later.
SSE <- pls.test_metrics["old", "SSE"]
R_square <- pls.test_metrics["old", "R_square"]
RMSE <- pls.test_metrics["old", "RMSE"]

# Display the comparison.
pls.test_metrics


# Scatter plot of the old model's test-set predictions against the observed
# values of `a`, with a linear trend line and no confidence band.
data.frame(
  predicted = pls.old_predicted,
  observed = test_data$a
) %>%
  ggplot(aes(predicted, observed)) +
  # NOTE(review): the view is zoomed to a hard-coded 20-30 window on both
  # axes; confirm this matches the actual range of `a`.
  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
  geom_point(alpha = 0.1, shape = 16) +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Accuracy: Predicted vs Actual") +
  xlab("Predicted") +
  ylab("Observed")