#==================================================# # model the "RT" column library(here) library(tidyverse) library(caret) library(inspectdf) library(skimr) library(ROSE) load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) #DEBUG inspect_num(seqs.imputed) #seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) #DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] train_data.imbalanced <- ROSE(correct ~ ., data = train_data, seed = 1)$data # VIsualize split train_data.imbalanced$grp <- "train" test_data$grp <- "test" rbind(train_data.imbalanced, test_data) %>% ggplot(aes(x=correct, fill=grp)) + geom_histogram(stat="count", position='dodge') + labs(title="Imbalanced Split") control <- trainControl( method = "repeatedcv", number = 5, repeats = 2, verboseIter = T, savePredictions = T ) train_data <- train_data.imbalanced %>% select(-grp) pls.new_model <- train( rt ~ .-a-al-cr-dp-rt-correct, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) plot(pls.new_model) summary(pls.new_model) plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") pls.common_model <- train( cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) summary(pls.common_model) plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") densityplot(pls.common_model, pch = "|") pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) summary(pls.models) dotplot(pls.models, metric = "Rsquared") diffValues <- diff(pls.models) bwplot(diffValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") pls.common_predicted <- predict(pls.common_model, test_data, type="raw") # SSE and RMSE # # SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors # SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here # R_square <- 1 - SSE/SST # SSE <- sum((test_data$cr - pls.new_predicted)^2) # RMSE <- sqrt(SSE/length(pls.new_predicted)) # # # SSE <- sum((test_data$cr - pls.common_predicted)^2) # R_square <- 1 - SSE/SST # SSE <- sum((test_data$cr - pls.common_predicted)^2) # RMSE <- sqrt(SSE/length(pls.common_predicted)) # as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% ggplot(aes(predicted, observed)) + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + geom_point(alpha = 0.1,shape=16) + geom_smooth(method=glm) + ggtitle("Criterion: Predicted vs Actual") + xlab("Predecited") + ylab("Observed")