#==================================================#
# model the "accuract" column (a for global, and al for local accuracy)
library(here)
library(tidyverse)
library(caret)
library(inspectdf)
library(skimr)
library(ROSE)
load(here("notebooks/data/nback_seqs.Rd"))
set.seed(42)
seqs.imputed <- seqs %>%
filter(!is.na(correct), !is.na(rt)) %>%
mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
#DEBUG inspect_num(seqs.imputed)
#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
train_indexes <- createDataPartition(seqs.imputed$correct,
times = 1,
p = 0.7,
list = F)
train_data <- seqs.imputed[train_indexes,]
test_data <- seqs.imputed[-train_indexes,]
train_data.imbalanced <- ROSE(correct ~ .,
data = train_data,
seed = 1)$data
# VIsualize split
train_data.imbalanced$grp <- "train"
test_data$grp <- "test"
rbind(train_data.imbalanced, test_data) %>%
ggplot(aes(x=correct, fill=grp)) +
geom_histogram(stat="count", position='dodge') +
labs(title="Imbalanced Split")
rbind(train_data.imbalanced, test_data) %>%
ggplot(aes(x=cr, fill=grp)) +
geom_density(stat="count", position='dodge')
control <- trainControl(
method = "cv",
number = 5,
verboseIter = T
)
pls.new_model <- train(
cr ~ t+l+s+n+tl+ll+sl+ul+vl,
data = train_data,
method = "pls",
preProcess = c("zv","center","scale"),
trControl = control
)
pls.old_model <- train(
dp ~ n*t,
data = train_data,
method = "pls",
preProcess = c("center","scale"),
trControl = control
)
summary(pls.old_model)
summary(pls.new_model)
plot(varImp(pls.old_model))
plot(varImp(pls.new_model), main="Criterion - Variable Importance")
trellis.par.set(caretTheme())
densityplot(pls.new_model, pch = "|")
densityplot(pls.old_model, pch = "|")
resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
summary(resamps)
dotplot(resamps, metric = "Rsquared")
difValues <- diff(resamps)
bwplot(difValues, layout=c(1,3))
pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
summary(pls.new_model)
summary(pls.old_model)
# SSE and RMSE
SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
R_square <- 1 - SSE/SST
SSE <- sum((test_data$cr - pls.new_predicted)^2)
RMSE <- sqrt(SSE/length(pls.new_predicted))
SSE <- sum((test_data$cr - pls.old_predicted)^2)
R_square <- 1 - SSE/SST
SSE <- sum((test_data$cr - pls.old_predicted)^2)
RMSE <- sqrt(SSE/length(pls.old_predicted))
as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$cr)) %>%
ggplot(aes(predicted, observed)) +
#coord_cartesian(xlim = c(-5, -2.8), ylim = c(-7, 0)) +
geom_point(alpha = 0.1,shape=16) +
geom_smooth(method=lm,se=F) +
ggtitle("Criterion: Predicted vs Actual") +
xlab("Predecited") +
ylab("Observed")