#==================================================# # model the "accuract" column (a for global, and al for local accuracy) library(here) library(tidyverse) library(caret) library(inspectdf) library(ROSE) load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] train_data.imbalanced <- ROSE(correct ~ ., data = train_data, seed = 1)$data control <- trainControl( method = "repeatedcv", number = 5, repeats = 2, verboseIter = T ) pls.new_model <- train( a ~ .-al-dp-cr-rt-correct, data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) plot(varImp(pls.new_model), main="Variable Importance for Accuracy") pls.old_model <- train( a ~ t + n + v, data = train_data, method = "pls", preProcess = c("center","scale"), trControl = control ) pls.old_model pls.new_model plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") densityplot(pls.old_model, pch = "|") resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) summary(resamps) dotplot(resamps, metric = "Rsquared") difValues <- diff(resamps) bwplot(difValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") pls.old_predicted <- predict(pls.old_model, test_data, type="raw") summary(pls.new_model) # SSE and RMSE SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here R_square <- 1 - SSE/SST SSE <- sum((test_data$a - pls.new_predicted)^2) RMSE <- sqrt(SSE/length(pls.new_predicted)) SSE <- sum((test_data$a - pls.old_predicted)^2) R_square <- 1 - SSE/SST SSE <- sum((test_data$a - pls.old_predicted)^2) RMSE <- sqrt(SSE/length(pls.old_predicted)) as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% ggplot(aes(predicted, observed)) + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + geom_point(alpha = 0.1,shape=16) + geom_smooth(method=lm,se=F) + ggtitle("Accuracy: Predicted vs Actual") + xlab("Predecited") + ylab("Observed")