diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-correct.R b/ccn2019-correct.R index a07e4e5..9b9d9fb 100644 --- a/ccn2019-correct.R +++ b/ccn2019-correct.R @@ -95,28 +95,53 @@ par(pty="s") roc(test_data$correct, - pls.new_predicted_prob$CORRECT, + pls.common_predicted_prob$CORRECT, plot = T, legacy.axes=T, - lwd=4, - col="black", + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, print.auc.y = 45, + print.auc.x = 55, percent = T, - print.auc=T) + ci = T, + boot.n = 100 + ) -plot.roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - legacy.axes=T, - lwd=4, - col="darkgray", - print.auc=T, - percent = T, - print.auc.y = 40, - lty = 3, - add=T) + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,3),lwd=3, cex=0.8) + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) # requires plotROC package #DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-correct.R b/ccn2019-correct.R index a07e4e5..9b9d9fb 100644 --- a/ccn2019-correct.R +++ b/ccn2019-correct.R @@ -95,28 +95,53 @@ par(pty="s") roc(test_data$correct, - pls.new_predicted_prob$CORRECT, + pls.common_predicted_prob$CORRECT, plot = T, legacy.axes=T, - lwd=4, - col="black", + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, print.auc.y = 45, + print.auc.x = 55, percent = T, - print.auc=T) + ci = T, + boot.n = 100 + ) -plot.roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - legacy.axes=T, - lwd=4, - col="darkgray", - print.auc=T, - percent = T, - print.auc.y = 40, - lty = 3, - add=T) + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,3),lwd=3, cex=0.8) + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) # requires plotROC package #DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R index 66c66a0..5bcefba 100644 --- a/ccn2019-criterion.R +++ b/ccn2019-criterion.R @@ -43,77 +43,77 @@ geom_histogram(stat="count", position='dodge') + labs(title="Imbalanced Split") -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=cr, fill=grp)) + - geom_density(stat="count", position='dodge') - control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, - verboseIter = T + repeats = 2, + verboseIter = T, + savePredictions = T ) +train_data <- train_data.imbalanced %>% select(-grp) + pls.new_model <- train( - cr ~ t+l+s+n+tl+ll+sl+ul+vl, + cr ~ .-a-al-dp-rt-correct, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) -pls.old_model <- train( - dp ~ n*t, +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, data = train_data, method = "pls", - preProcess = c("center","scale"), + preProcess = c("zv","center","scale"), trControl = control ) -summary(pls.old_model) -summary(pls.new_model) -plot(varImp(pls.old_model)) -plot(varImp(pls.new_model), main="Criterion - Variable Importance") +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") +densityplot(pls.common_model, pch = "|") -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) -summary(pls.old_model) +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") # SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# -SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$cr - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$cr)) %>% +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% ggplot(aes(predicted, observed)) + - #coord_cartesian(xlim = c(-5, -2.8), ylim = c(-7, 0)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + + geom_smooth(method=glm) + ggtitle("Criterion: Predicted vs Actual") + xlab("Predecited") + ylab("Observed") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-correct.R b/ccn2019-correct.R index a07e4e5..9b9d9fb 100644 --- a/ccn2019-correct.R +++ b/ccn2019-correct.R @@ -95,28 +95,53 @@ par(pty="s") roc(test_data$correct, - pls.new_predicted_prob$CORRECT, + pls.common_predicted_prob$CORRECT, plot = T, legacy.axes=T, - lwd=4, - col="black", + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, print.auc.y = 45, + print.auc.x = 55, percent = T, - print.auc=T) + ci = T, + boot.n = 100 + ) -plot.roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - legacy.axes=T, - lwd=4, - col="darkgray", - print.auc=T, - percent = T, - print.auc.y = 40, - lty = 3, - add=T) + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,3),lwd=3, cex=0.8) + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) # requires plotROC package #DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R index 66c66a0..5bcefba 100644 --- a/ccn2019-criterion.R +++ b/ccn2019-criterion.R @@ -43,77 +43,77 @@ geom_histogram(stat="count", position='dodge') + labs(title="Imbalanced Split") -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=cr, fill=grp)) + - geom_density(stat="count", position='dodge') - control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, - verboseIter = T + repeats = 2, + verboseIter = T, + savePredictions = T ) +train_data <- train_data.imbalanced %>% select(-grp) + pls.new_model <- train( - cr ~ t+l+s+n+tl+ll+sl+ul+vl, + cr ~ .-a-al-dp-rt-correct, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) -pls.old_model <- train( - dp ~ n*t, +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, data = train_data, method = "pls", - preProcess = c("center","scale"), + preProcess = c("zv","center","scale"), trControl = control ) -summary(pls.old_model) -summary(pls.new_model) -plot(varImp(pls.old_model)) -plot(varImp(pls.new_model), main="Criterion - Variable Importance") +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") +densityplot(pls.common_model, pch = "|") -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) -summary(pls.old_model) +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") # SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# -SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$cr - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$cr)) %>% +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% ggplot(aes(predicted, observed)) + - #coord_cartesian(xlim = c(-5, -2.8), ylim = c(-7, 0)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + + geom_smooth(method=glm) + ggtitle("Criterion: Predicted vs Actual") + xlab("Predecited") + ylab("Observed") diff --git a/ccn2019-rt.R b/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-correct.R b/ccn2019-correct.R index a07e4e5..9b9d9fb 100644 --- a/ccn2019-correct.R +++ b/ccn2019-correct.R @@ -95,28 +95,53 @@ par(pty="s") roc(test_data$correct, - pls.new_predicted_prob$CORRECT, + pls.common_predicted_prob$CORRECT, plot = T, legacy.axes=T, - lwd=4, - col="black", + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, print.auc.y = 45, + print.auc.x = 55, percent = T, - print.auc=T) + ci = T, + boot.n = 100 + ) -plot.roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - legacy.axes=T, - lwd=4, - col="darkgray", - print.auc=T, - percent = T, - print.auc.y = 40, - lty = 3, - add=T) + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,3),lwd=3, cex=0.8) + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) # requires plotROC package #DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R index 66c66a0..5bcefba 100644 --- a/ccn2019-criterion.R +++ b/ccn2019-criterion.R @@ -43,77 +43,77 @@ geom_histogram(stat="count", position='dodge') + labs(title="Imbalanced Split") -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=cr, fill=grp)) + - geom_density(stat="count", position='dodge') - control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, - verboseIter = T + repeats = 2, + verboseIter = T, + savePredictions = T ) +train_data <- train_data.imbalanced %>% select(-grp) + pls.new_model <- train( - cr ~ t+l+s+n+tl+ll+sl+ul+vl, + cr ~ .-a-al-dp-rt-correct, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) -pls.old_model <- train( - dp ~ n*t, +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, data = train_data, method = "pls", - preProcess = c("center","scale"), + preProcess = c("zv","center","scale"), trControl = control ) -summary(pls.old_model) -summary(pls.new_model) -plot(varImp(pls.old_model)) -plot(varImp(pls.new_model), main="Criterion - Variable Importance") +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") +densityplot(pls.common_model, pch = "|") -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) -summary(pls.old_model) +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") # SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# -SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$cr - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$cr)) %>% +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% ggplot(aes(predicted, observed)) + - #coord_cartesian(xlim = c(-5, -2.8), ylim = c(-7, 0)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + + geom_smooth(method=glm) + ggtitle("Criterion: Predicted vs Actual") + xlab("Predecited") + ylab("Observed") diff --git a/ccn2019-rt.R b/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd index c0f15cd..1125fa1 100644 --- a/ccn2019.rev3.Rmd +++ b/ccn2019.rev3.Rmd @@ -20,7 +20,7 @@ #! =============================================== #! load data set and set running window size -load(here('data/CL2015.RData')) +load(here('notebooks/data/CL2015.RData')) window_size <- 8 ``` @@ -71,7 +71,7 @@ mutate(cr = map_dbl(local_stats, ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% @@ -80,7 +80,7 @@ #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) -save(seqs,file=here("data/nback_seqs.Rd")) +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} @@ -91,114 +91,3 @@ #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` - -```{r models} -#! =============================================== -#! prepare data for modeling (remove na, etc) -seqs <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(as.numeric(correct),labels=c("INCORRECT","CORRECT"))) - #mutate(correct=as.numeric(correct)) - -#FIXME remove outcomes before dummy out the data and imputing missing values - -# replace factors with dummy data -#seqs.dummy <- predict(dummyVars(~.,data=seqs[,-1]),seqs[,-1]) - -# impute missing values -#seqs.imputed <- predict(preProcess(seqs.dummy, "bagImpute"), seqs.dummy) -#DEBUG View(seqs.imputed) - -#! =============================================== -#! split into train/test (consider class-inbalence for the correct) -set.seed(42) -train_indexes <- createDataPartition(seqs$correct, - times = 1, - p = 0.7, - list = F) - -seqs.train <- seqs[train_indexes,] -seqs.test <- seqs[-train_indexes,] - -#! =============================================== -#! training parameters for the PLS models -plsTrControl <- trainControl( - method = "cv", - number = 5, - verboseIter = TRUE - ) - -#==================================================# -# Train PLS model (block-level accuracy) -pls.fit.accuracy <- train( - a ~ .-rt-al-correct-dp-cr, - data = seqs.train, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.accuracy) - -# PLS variable importance -plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") - -pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") - -ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + - geom_point() - - -#==================================================# -# Train PLS model (rt) -train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt -train_data_y <- (seqs.train %>% select(rt))$rt # only rt - -pls.fit.rt <- train( - train_data_x, - train_data_y, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.rt) - -# PLS variable importance -plot(varImp(pls.fit.rt), main="RT - Variable Importance") - -pls.predicted.rt <- predict(pls.fit.rt, seqs.test) - -#FIXME measure performance with accuracy, etc. -#confusionMatrix(pls.predicted.rt, seqs.test$rt) -#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) - -#==================================================# -## OLD MODEL (only global features) - -pls.fit.accuracy.old <- train( - a ~ n+t+v, - data = seqs.train, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.accuracy.old) - -# PLS variable importance -plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") - -pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") - -ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + - geom_point() - -#FIXME -confusionMatrix(glm.fit.predicted.old, test_data$correct, ) -colAUC(glm.fit.predicted.old, test_data$correct, plotROC=T) -#TODO use pROC to viz AUX roc(test_data$correct,glm.fit.predicted.old) -``` diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-correct.R b/ccn2019-correct.R index a07e4e5..9b9d9fb 100644 --- a/ccn2019-correct.R +++ b/ccn2019-correct.R @@ -95,28 +95,53 @@ par(pty="s") roc(test_data$correct, - pls.new_predicted_prob$CORRECT, + pls.common_predicted_prob$CORRECT, plot = T, legacy.axes=T, - lwd=4, - col="black", + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, print.auc.y = 45, + print.auc.x = 55, percent = T, - print.auc=T) + ci = T, + boot.n = 100 + ) -plot.roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - legacy.axes=T, - lwd=4, - col="darkgray", - print.auc=T, - percent = T, - print.auc.y = 40, - lty = 3, - add=T) + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,3),lwd=3, cex=0.8) + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) # requires plotROC package #DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R index 66c66a0..5bcefba 100644 --- a/ccn2019-criterion.R +++ b/ccn2019-criterion.R @@ -43,77 +43,77 @@ geom_histogram(stat="count", position='dodge') + labs(title="Imbalanced Split") -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=cr, fill=grp)) + - geom_density(stat="count", position='dodge') - control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, - verboseIter = T + repeats = 2, + verboseIter = T, + savePredictions = T ) +train_data <- train_data.imbalanced %>% select(-grp) + pls.new_model <- train( - cr ~ t+l+s+n+tl+ll+sl+ul+vl, + cr ~ .-a-al-dp-rt-correct, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) -pls.old_model <- train( - dp ~ n*t, +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, data = train_data, method = "pls", - preProcess = c("center","scale"), + preProcess = c("zv","center","scale"), trControl = control ) -summary(pls.old_model) -summary(pls.new_model) -plot(varImp(pls.old_model)) -plot(varImp(pls.new_model), main="Criterion - Variable Importance") +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") +densityplot(pls.common_model, pch = "|") -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) -summary(pls.old_model) +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") # SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# -SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$cr - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$cr)) %>% +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% ggplot(aes(predicted, observed)) + - #coord_cartesian(xlim = c(-5, -2.8), ylim = c(-7, 0)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + + geom_smooth(method=glm) + ggtitle("Criterion: Predicted vs Actual") + xlab("Predecited") + ylab("Observed") diff --git a/ccn2019-rt.R b/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd index c0f15cd..1125fa1 100644 --- a/ccn2019.rev3.Rmd +++ b/ccn2019.rev3.Rmd @@ -20,7 +20,7 @@ #! =============================================== #! load data set and set running window size -load(here('data/CL2015.RData')) +load(here('notebooks/data/CL2015.RData')) window_size <- 8 ``` @@ -71,7 +71,7 @@ mutate(cr = map_dbl(local_stats, ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% @@ -80,7 +80,7 @@ #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) -save(seqs,file=here("data/nback_seqs.Rd")) +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} @@ -91,114 +91,3 @@ #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` - -```{r models} -#! =============================================== -#! prepare data for modeling (remove na, etc) -seqs <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(as.numeric(correct),labels=c("INCORRECT","CORRECT"))) - #mutate(correct=as.numeric(correct)) - -#FIXME remove outcomes before dummy out the data and imputing missing values - -# replace factors with dummy data -#seqs.dummy <- predict(dummyVars(~.,data=seqs[,-1]),seqs[,-1]) - -# impute missing values -#seqs.imputed <- predict(preProcess(seqs.dummy, "bagImpute"), seqs.dummy) -#DEBUG View(seqs.imputed) - -#! =============================================== -#! split into train/test (consider class-inbalence for the correct) -set.seed(42) -train_indexes <- createDataPartition(seqs$correct, - times = 1, - p = 0.7, - list = F) - -seqs.train <- seqs[train_indexes,] -seqs.test <- seqs[-train_indexes,] - -#! =============================================== -#! training parameters for the PLS models -plsTrControl <- trainControl( - method = "cv", - number = 5, - verboseIter = TRUE - ) - -#==================================================# -# Train PLS model (block-level accuracy) -pls.fit.accuracy <- train( - a ~ .-rt-al-correct-dp-cr, - data = seqs.train, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.accuracy) - -# PLS variable importance -plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") - -pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") - -ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + - geom_point() - - -#==================================================# -# Train PLS model (rt) -train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt -train_data_y <- (seqs.train %>% select(rt))$rt # only rt - -pls.fit.rt <- train( - train_data_x, - train_data_y, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.rt) - -# PLS variable importance -plot(varImp(pls.fit.rt), main="RT - Variable Importance") - -pls.predicted.rt <- predict(pls.fit.rt, seqs.test) - -#FIXME measure performance with accuracy, etc. -#confusionMatrix(pls.predicted.rt, seqs.test$rt) -#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) - -#==================================================# -## OLD MODEL (only global features) - -pls.fit.accuracy.old <- train( - a ~ n+t+v, - data = seqs.train, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.accuracy.old) - -# PLS variable importance -plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") - -pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") - -ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + - geom_point() - -#FIXME -confusionMatrix(glm.fit.predicted.old, test_data$correct, ) -colAUC(glm.fit.predicted.old, test_data$correct, plotROC=T) -#TODO use pROC to viz AUX roc(test_data$correct,glm.fit.predicted.old) -``` diff --git a/data/nback_seqs.Rd b/data/nback_seqs.Rd index a355010..d2911cc 100644 --- a/data/nback_seqs.Rd +++ b/data/nback_seqs.Rd Binary files differ diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R index 0a2d999..adfb442 100644 --- a/ccn2019-accuracy.R +++ b/ccn2019-accuracy.R @@ -5,22 +5,22 @@ library(tidyverse) library(caret) library(inspectdf) +library(ROSE) - -load(here("data/nback_seqs.Rd")) +load(here("notebooks/data/nback_seqs.Rd")) set.seed(42) seqs.imputed <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - select(-correct) + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) inspect_num(seqs.imputed) seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) -train_indexes <- createDataPartition(seqs.imputed$a, +train_indexes <- createDataPartition(seqs.imputed$correct, times = 1, p = 0.7, list = F) @@ -28,20 +28,28 @@ train_data <- seqs.imputed[train_indexes,] test_data <- seqs.imputed[-train_indexes,] +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, + repeats = 2, verboseIter = T ) pls.new_model <- train( - a ~ t + l + s + v + n + tl + ll + sl + ul + vl, - data = train_data, + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, method = "pls", preProcess = c("center","scale"), trControl = control ) +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + pls.old_model <- train( a ~ t + n + v, data = train_data, @@ -53,8 +61,7 @@ pls.old_model pls.new_model -varImp(pls.old_model) -varImp(pls.new_model) +plot(varImp(pls.old_model)) trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") diff --git a/ccn2019-correct.R b/ccn2019-correct.R index a07e4e5..9b9d9fb 100644 --- a/ccn2019-correct.R +++ b/ccn2019-correct.R @@ -95,28 +95,53 @@ par(pty="s") roc(test_data$correct, - pls.new_predicted_prob$CORRECT, + pls.common_predicted_prob$CORRECT, plot = T, legacy.axes=T, - lwd=4, - col="black", + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, print.auc.y = 45, + print.auc.x = 55, percent = T, - print.auc=T) + ci = T, + boot.n = 100 + ) -plot.roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - legacy.axes=T, - lwd=4, - col="darkgray", - print.auc=T, - percent = T, - print.auc.y = 40, - lty = 3, - add=T) + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,3),lwd=3, cex=0.8) + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) # requires plotROC package #DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R index 66c66a0..5bcefba 100644 --- a/ccn2019-criterion.R +++ b/ccn2019-criterion.R @@ -43,77 +43,77 @@ geom_histogram(stat="count", position='dodge') + labs(title="Imbalanced Split") -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=cr, fill=grp)) + - geom_density(stat="count", position='dodge') - control <- trainControl( - method = "cv", + method = "repeatedcv", number = 5, - verboseIter = T + repeats = 2, + verboseIter = T, + savePredictions = T ) +train_data <- train_data.imbalanced %>% select(-grp) + pls.new_model <- train( - cr ~ t+l+s+n+tl+ll+sl+ul+vl, + cr ~ .-a-al-dp-rt-correct, data = train_data, method = "pls", preProcess = c("zv","center","scale"), trControl = control ) -pls.old_model <- train( - dp ~ n*t, +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, data = train_data, method = "pls", - preProcess = c("center","scale"), + preProcess = c("zv","center","scale"), trControl = control ) -summary(pls.old_model) -summary(pls.new_model) -plot(varImp(pls.old_model)) -plot(varImp(pls.new_model), main="Criterion - Variable Importance") +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") trellis.par.set(caretTheme()) densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") +densityplot(pls.common_model, pch = "|") -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) -summary(pls.old_model) +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") # SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# -SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$cr - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$cr - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$cr)) %>% +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% ggplot(aes(predicted, observed)) + - #coord_cartesian(xlim = c(-5, -2.8), ylim = c(-7, 0)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + + geom_smooth(method=glm) + ggtitle("Criterion: Predicted vs Actual") + xlab("Predecited") + ylab("Observed") diff --git a/ccn2019-rt.R b/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd index c0f15cd..1125fa1 100644 --- a/ccn2019.rev3.Rmd +++ b/ccn2019.rev3.Rmd @@ -20,7 +20,7 @@ #! =============================================== #! load data set and set running window size -load(here('data/CL2015.RData')) +load(here('notebooks/data/CL2015.RData')) window_size <- 8 ``` @@ -71,7 +71,7 @@ mutate(cr = map_dbl(local_stats, ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% @@ -80,7 +80,7 @@ #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) -save(seqs,file=here("data/nback_seqs.Rd")) +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} @@ -91,114 +91,3 @@ #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` - -```{r models} -#! =============================================== -#! prepare data for modeling (remove na, etc) -seqs <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(as.numeric(correct),labels=c("INCORRECT","CORRECT"))) - #mutate(correct=as.numeric(correct)) - -#FIXME remove outcomes before dummy out the data and imputing missing values - -# replace factors with dummy data -#seqs.dummy <- predict(dummyVars(~.,data=seqs[,-1]),seqs[,-1]) - -# impute missing values -#seqs.imputed <- predict(preProcess(seqs.dummy, "bagImpute"), seqs.dummy) -#DEBUG View(seqs.imputed) - -#! =============================================== -#! split into train/test (consider class-inbalence for the correct) -set.seed(42) -train_indexes <- createDataPartition(seqs$correct, - times = 1, - p = 0.7, - list = F) - -seqs.train <- seqs[train_indexes,] -seqs.test <- seqs[-train_indexes,] - -#! =============================================== -#! training parameters for the PLS models -plsTrControl <- trainControl( - method = "cv", - number = 5, - verboseIter = TRUE - ) - -#==================================================# -# Train PLS model (block-level accuracy) -pls.fit.accuracy <- train( - a ~ .-rt-al-correct-dp-cr, - data = seqs.train, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.accuracy) - -# PLS variable importance -plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") - -pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") - -ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + - geom_point() - - -#==================================================# -# Train PLS model (rt) -train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt -train_data_y <- (seqs.train %>% select(rt))$rt # only rt - -pls.fit.rt <- train( - train_data_x, - train_data_y, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.rt) - -# PLS variable importance -plot(varImp(pls.fit.rt), main="RT - Variable Importance") - -pls.predicted.rt <- predict(pls.fit.rt, seqs.test) - -#FIXME measure performance with accuracy, etc. -#confusionMatrix(pls.predicted.rt, seqs.test$rt) -#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) - -#==================================================# -## OLD MODEL (only global features) - -pls.fit.accuracy.old <- train( - a ~ n+t+v, - data = seqs.train, - method = "pls", - tuneLength = 20, - trControl = plsTrControl, - preProc = c("center","scale")) - -# Check CV profile -plot(pls.fit.accuracy.old) - -# PLS variable importance -plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") - -pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") - -ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + - geom_point() - -#FIXME -confusionMatrix(glm.fit.predicted.old, test_data$correct, ) -colAUC(glm.fit.predicted.old, test_data$correct, plotROC=T) -#TODO use pROC to viz AUX roc(test_data$correct,glm.fit.predicted.old) -``` diff --git a/data/nback_seqs.Rd b/data/nback_seqs.Rd index a355010..d2911cc 100644 --- a/data/nback_seqs.Rd +++ b/data/nback_seqs.Rd Binary files differ diff --git a/dummy-vars-playground.R b/dummy-vars-playground.R new file mode 100644 index 0000000..3db3bc1 --- /dev/null +++ b/dummy-vars-playground.R @@ -0,0 +1,24 @@ +library(tidyverse) +library(caret) +library(here) + + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.dummy <- data.frame(predict(dummyVars(~.,seqs,fullRank = T),seqs)) %>% + filter(!is.na(correctTRUE), !is.na(rt)) + +inspect_num(seqs.dummy) +inspect_na(seqs.dummy,show_plot = T) + +#inspect_cor(seqs.dummy,show_plot = T) + +cor_matrix <- cor(seqs.dummy) +cor_high <- findCorrelation(cor_matrix, 0.5) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs.dummy %>% select(-high_cor_remove) +``` \ No newline at end of file