diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd index 01ebefb..468f63b 100644 --- a/ccn2019.rev3.Rmd +++ b/ccn2019.rev3.Rmd @@ -14,17 +14,18 @@ library(inspectdf) library(caTools) library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('data/CL2015.RData')) +window_size <- 8 + ``` ```{r preprocessing} #! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -#! =============================================== #! A function to mark lures in a sequence with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { @@ -61,7 +62,14 @@ mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(dp = map_dbl(local_stats, + ~length(which(.x$stimulus_type=="target" & .x$correct)) - + length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + mutate(cr = map_dbl(local_stats, + ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% unnest(local_stats) @@ -69,6 +77,7 @@ #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) +save(seqs,file=here("data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} @@ -85,8 +94,8 @@ #! prepare data for modeling (remove na, etc) seqs <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - #mutate(correct=factor(as.numeric(correct),labels=c("I","C"))) - mutate(correct=as.numeric(correct)) + mutate(correct=factor(as.numeric(correct),labels=c("I","C"))) + #mutate(correct=as.numeric(correct)) #FIXME remove outcomes before dummy out the data and imputing missing values @@ -112,14 +121,15 @@ #! training parameters for the PLS models plsTrControl <- trainControl( method = "cv", - number = 5 + number = 5, + verboseIter = TRUE ) #==================================================# # Train PLS model (block-level accuracy) pls.fit.accuracy <- train( - a ~ .-rt-al-correct, - data = train_data, + a ~ .-rt-al-correct-dp-cr, + data = seqs.train, method = "pls", tuneLength = 20, trControl = plsTrControl, @@ -131,11 +141,16 @@ # PLS variable importance plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") +pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") + +ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + + geom_point() + #==================================================# # Train PLS model (rt) -train_data_x <- train_data %>% select(-rt,-a,-correct) # all except rt -train_data_y <- (train_data %>% select(rt))$rt # only rt +train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt +train_data_y <- (seqs.train %>% select(rt))$rt # only rt pls.fit.rt <- train( train_data_x, @@ -151,52 +166,33 @@ # PLS variable importance plot(varImp(pls.fit.rt), main="RT - Variable Importance") -pls.predicted.rt <- predict(pls.fit.rt, test_data) +pls.predicted.rt <- predict(pls.fit.rt, seqs.test) -#FIXME -confusionMatrix(pls.predicted.rt,test_data$rt) -colAUC(pls.predicted.rt,test_data$rt, plotROC=T) - -#==================================================# -# training control params for "correct" column -glmTrControl <- trainControl( - method = "cv", - number = 5, - classProbs = T, - summaryFunction = twoClassSummary - ) - -glm.fit.correct <- train( - correct ~ .-rt-a-al, - data = train_data, - method = "glm", - family = "binomial", - trControl = glmTrControl -) - -glm.fit.correct -varImp(glm.fit.correct) - - -glm.predicted.correct <- predict(glm.fit.correct, test_data, type="prob") - -#FIXME -confusionMatrix(glm.predicted.correct, test_data$correct) -colAUC(glm.predicted.correct, test_data$correct, plotROC=T) +#FIXME measure performance with accuracy, etc. +#confusionMatrix(pls.predicted.rt, seqs.test$rt) +#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) #==================================================# ## OLD MODEL (only global features) -glm.fit.correct.old <- train( - correct ~ n+t+v, - data = train_data, - method = "glm", - family = "binomial", - trControl = trControl -) -glm.fit.correct.old +pls.fit.accuracy.old <- train( + a ~ n+t+v, + data = seqs.train, + method = "pls", + tuneLength = 20, + trControl = plsTrControl, + preProc = c("center","scale")) + +# Check CV profile +plot(pls.fit.accuracy.old) -glm.fit.predicted.old <- predict(glm.fit.correct.old, test_data, type="prob") +# PLS variable importance +plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") + +pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") + +ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + + geom_point() #FIXME confusionMatrix(glm.fit.predicted.old, test_data$correct, ) diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd index 01ebefb..468f63b 100644 --- a/ccn2019.rev3.Rmd +++ b/ccn2019.rev3.Rmd @@ -14,17 +14,18 @@ library(inspectdf) library(caTools) library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('data/CL2015.RData')) +window_size <- 8 + ``` ```{r preprocessing} #! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -#! =============================================== #! A function to mark lures in a sequence with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { @@ -61,7 +62,14 @@ mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(dp = map_dbl(local_stats, + ~length(which(.x$stimulus_type=="target" & .x$correct)) - + length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + mutate(cr = map_dbl(local_stats, + ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% unnest(local_stats) @@ -69,6 +77,7 @@ #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) +save(seqs,file=here("data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} @@ -85,8 +94,8 @@ #! prepare data for modeling (remove na, etc) seqs <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - #mutate(correct=factor(as.numeric(correct),labels=c("I","C"))) - mutate(correct=as.numeric(correct)) + mutate(correct=factor(as.numeric(correct),labels=c("I","C"))) + #mutate(correct=as.numeric(correct)) #FIXME remove outcomes before dummy out the data and imputing missing values @@ -112,14 +121,15 @@ #! training parameters for the PLS models plsTrControl <- trainControl( method = "cv", - number = 5 + number = 5, + verboseIter = TRUE ) #==================================================# # Train PLS model (block-level accuracy) pls.fit.accuracy <- train( - a ~ .-rt-al-correct, - data = train_data, + a ~ .-rt-al-correct-dp-cr, + data = seqs.train, method = "pls", tuneLength = 20, trControl = plsTrControl, @@ -131,11 +141,16 @@ # PLS variable importance plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") +pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") + +ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + + geom_point() + #==================================================# # Train PLS model (rt) -train_data_x <- train_data %>% select(-rt,-a,-correct) # all except rt -train_data_y <- (train_data %>% select(rt))$rt # only rt +train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt +train_data_y <- (seqs.train %>% select(rt))$rt # only rt pls.fit.rt <- train( train_data_x, @@ -151,52 +166,33 @@ # PLS variable importance plot(varImp(pls.fit.rt), main="RT - Variable Importance") -pls.predicted.rt <- predict(pls.fit.rt, test_data) +pls.predicted.rt <- predict(pls.fit.rt, seqs.test) -#FIXME -confusionMatrix(pls.predicted.rt,test_data$rt) -colAUC(pls.predicted.rt,test_data$rt, plotROC=T) - -#==================================================# -# training control params for "correct" column -glmTrControl <- trainControl( - method = "cv", - number = 5, - classProbs = T, - summaryFunction = twoClassSummary - ) - -glm.fit.correct <- train( - correct ~ .-rt-a-al, - data = train_data, - method = "glm", - family = "binomial", - trControl = glmTrControl -) - -glm.fit.correct -varImp(glm.fit.correct) - - -glm.predicted.correct <- predict(glm.fit.correct, test_data, type="prob") - -#FIXME -confusionMatrix(glm.predicted.correct, test_data$correct) -colAUC(glm.predicted.correct, test_data$correct, plotROC=T) +#FIXME measure performance with accuracy, etc. +#confusionMatrix(pls.predicted.rt, seqs.test$rt) +#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) #==================================================# ## OLD MODEL (only global features) -glm.fit.correct.old <- train( - correct ~ n+t+v, - data = train_data, - method = "glm", - family = "binomial", - trControl = trControl -) -glm.fit.correct.old +pls.fit.accuracy.old <- train( + a ~ n+t+v, + data = seqs.train, + method = "pls", + tuneLength = 20, + trControl = plsTrControl, + preProc = c("center","scale")) + +# Check CV profile +plot(pls.fit.accuracy.old) -glm.fit.predicted.old <- predict(glm.fit.correct.old, test_data, type="prob") +# PLS variable importance +plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") + +pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") + +ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + + geom_point() #FIXME confusionMatrix(glm.fit.predicted.old, test_data$correct, ) diff --git a/cnn2019-correct.R b/cnn2019-correct.R new file mode 100644 index 0000000..a4e0b4c --- /dev/null +++ b/cnn2019-correct.R @@ -0,0 +1,83 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) + + +load(here("data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "cv", + number = 5, + classProbs = T, + verboseIter = T, + summaryFunction = twoClassSummary +) + +pls.model1 <- train( + correct ~ t + l + s + v + n + tl + ll + sl + ul + vl, + data = train_data, + method = "pls", + trControl = control +) + +pls.model2 <- train( + correct ~ t + n + v, + data = train_data, + method = "pls", + trControl = control +) + +pls.model1 +pls.model2 +varImp(pls.model1) +varImp(pls.model2) + +trellis.par.set(caretTheme()) +densityplot(pls.model1, pch = "|") +densityplot(pls.model2, pch = "|") + +resamps <- resamples(list(model1 = pls.model1, model2 = pls.model2)) +summary(resamps) +dotplot(resamps, metric = "ROC") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.train_predicted1 <- predict(pls.model1, train_data, type="prob") +pls.train_predicted2 <- predict(pls.model2, train_data, type="raw") +pls.predicted1 <- predict(pls.model1, test_data, type="raw") +pls.predicted2 <- predict(pls.model2, test_data, type="raw") + +#FIXME +confusionMatrix(pls.train_predicted1, train_data$correct) +confusionMatrix(pls.train_predicted2, train_data$correct) +confusionMatrix(pls.predicted1, test_data$correct) +confusionMatrix(pls.predicted2, test_data$correct) + +colAUC(pls.predicted1, test_data$correct, plotROC=T) +colAUC(pls.predicted2, test_data$correct, plotROC=T) + diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd index 01ebefb..468f63b 100644 --- a/ccn2019.rev3.Rmd +++ b/ccn2019.rev3.Rmd @@ -14,17 +14,18 @@ library(inspectdf) library(caTools) library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('data/CL2015.RData')) +window_size <- 8 + ``` ```{r preprocessing} #! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -#! =============================================== #! A function to mark lures in a sequence with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { @@ -61,7 +62,14 @@ mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(dp = map_dbl(local_stats, + ~length(which(.x$stimulus_type=="target" & .x$correct)) - + length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + mutate(cr = map_dbl(local_stats, + ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% unnest(local_stats) @@ -69,6 +77,7 @@ #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) +save(seqs,file=here("data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} @@ -85,8 +94,8 @@ #! prepare data for modeling (remove na, etc) seqs <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% - #mutate(correct=factor(as.numeric(correct),labels=c("I","C"))) - mutate(correct=as.numeric(correct)) + mutate(correct=factor(as.numeric(correct),labels=c("I","C"))) + #mutate(correct=as.numeric(correct)) #FIXME remove outcomes before dummy out the data and imputing missing values @@ -112,14 +121,15 @@ #! training parameters for the PLS models plsTrControl <- trainControl( method = "cv", - number = 5 + number = 5, + verboseIter = TRUE ) #==================================================# # Train PLS model (block-level accuracy) pls.fit.accuracy <- train( - a ~ .-rt-al-correct, - data = train_data, + a ~ .-rt-al-correct-dp-cr, + data = seqs.train, method = "pls", tuneLength = 20, trControl = plsTrControl, @@ -131,11 +141,16 @@ # PLS variable importance plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") +pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") + +ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + + geom_point() + #==================================================# # Train PLS model (rt) -train_data_x <- train_data %>% select(-rt,-a,-correct) # all except rt -train_data_y <- (train_data %>% select(rt))$rt # only rt +train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt +train_data_y <- (seqs.train %>% select(rt))$rt # only rt pls.fit.rt <- train( train_data_x, @@ -151,52 +166,33 @@ # PLS variable importance plot(varImp(pls.fit.rt), main="RT - Variable Importance") -pls.predicted.rt <- predict(pls.fit.rt, test_data) +pls.predicted.rt <- predict(pls.fit.rt, seqs.test) -#FIXME -confusionMatrix(pls.predicted.rt,test_data$rt) -colAUC(pls.predicted.rt,test_data$rt, plotROC=T) - -#==================================================# -# training control params for "correct" column -glmTrControl <- trainControl( - method = "cv", - number = 5, - classProbs = T, - summaryFunction = twoClassSummary - ) - -glm.fit.correct <- train( - correct ~ .-rt-a-al, - data = train_data, - method = "glm", - family = "binomial", - trControl = glmTrControl -) - -glm.fit.correct -varImp(glm.fit.correct) - - -glm.predicted.correct <- predict(glm.fit.correct, test_data, type="prob") - -#FIXME -confusionMatrix(glm.predicted.correct, test_data$correct) -colAUC(glm.predicted.correct, test_data$correct, plotROC=T) +#FIXME measure performance with accuracy, etc. +#confusionMatrix(pls.predicted.rt, seqs.test$rt) +#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) #==================================================# ## OLD MODEL (only global features) -glm.fit.correct.old <- train( - correct ~ n+t+v, - data = train_data, - method = "glm", - family = "binomial", - trControl = trControl -) -glm.fit.correct.old +pls.fit.accuracy.old <- train( + a ~ n+t+v, + data = seqs.train, + method = "pls", + tuneLength = 20, + trControl = plsTrControl, + preProc = c("center","scale")) + +# Check CV profile +plot(pls.fit.accuracy.old) -glm.fit.predicted.old <- predict(glm.fit.correct.old, test_data, type="prob") +# PLS variable importance +plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") + +pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") + +ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + + geom_point() #FIXME confusionMatrix(glm.fit.predicted.old, test_data$correct, ) diff --git a/cnn2019-correct.R b/cnn2019-correct.R new file mode 100644 index 0000000..a4e0b4c --- /dev/null +++ b/cnn2019-correct.R @@ -0,0 +1,83 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) + + +load(here("data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "cv", + number = 5, + classProbs = T, + verboseIter = T, + summaryFunction = twoClassSummary +) + +pls.model1 <- train( + correct ~ t + l + s + v + n + tl + ll + sl + ul + vl, + data = train_data, + method = "pls", + trControl = control +) + +pls.model2 <- train( + correct ~ t + n + v, + data = train_data, + method = "pls", + trControl = control +) + +pls.model1 +pls.model2 +varImp(pls.model1) +varImp(pls.model2) + +trellis.par.set(caretTheme()) +densityplot(pls.model1, pch = "|") +densityplot(pls.model2, pch = "|") + +resamps <- resamples(list(model1 = pls.model1, model2 = pls.model2)) +summary(resamps) +dotplot(resamps, metric = "ROC") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.train_predicted1 <- predict(pls.model1, train_data, type="prob") +pls.train_predicted2 <- predict(pls.model2, train_data, type="raw") +pls.predicted1 <- predict(pls.model1, test_data, type="raw") +pls.predicted2 <- predict(pls.model2, test_data, type="raw") + +#FIXME +confusionMatrix(pls.train_predicted1, train_data$correct) +confusionMatrix(pls.train_predicted2, train_data$correct) +confusionMatrix(pls.predicted1, test_data$correct) +confusionMatrix(pls.predicted2, test_data$correct) + +colAUC(pls.predicted1, test_data$correct, plotROC=T) +colAUC(pls.predicted2, test_data$correct, plotROC=T) + diff --git a/data/nback_seqs.Rd b/data/nback_seqs.Rd new file mode 100644 index 0000000..a355010 --- /dev/null +++ b/data/nback_seqs.Rd Binary files differ