diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R
deleted file mode 100644
index adfb442..0000000
--- a/ccn2019-accuracy.R
+++ /dev/null
@@ -1,108 +0,0 @@
-#==================================================#
-# model the "accuracy" column (a for global, and al for local accuracy)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-inspect_num(seqs.imputed)
-
-seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T
-)
-
-pls.new_model <- train(
-  a ~ .-al-dp-cr-rt-correct,
-  data = train_data.imbalanced,
-  method = "pls",
-  preProcess = c("center","scale"),
-  trControl = control
-)
-
-plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
-
-
-pls.old_model <- train(
-  a ~ t + n + v,
-  data = train_data,
-  method = "pls",
-  preProcess = c("center","scale"),
-  trControl = control
-)
-
-
-pls.old_model
-pls.new_model
-plot(varImp(pls.old_model))
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.old_model, pch = "|")
-
-resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
-summary(resamps)
-dotplot(resamps, metric = "Rsquared")
-difValues <- diff(resamps)
-bwplot(difValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
-
-
-summary(pls.new_model)
-
-
-# SSE and RMSE
-
-SSE <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
-SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; note the training-set mean
-R_square <- 1 - SSE/SST
-SSE <- sum((test_data$a - pls.new_predicted)^2)
-RMSE <- sqrt(SSE/length(pls.new_predicted))
-
-
-SSE <- sum((test_data$a - pls.old_predicted)^2)
-R_square <- 1 - SSE/SST
-SSE <- sum((test_data$a - pls.old_predicted)^2)
-RMSE <- sqrt(SSE/length(pls.old_predicted))
-
-
-as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
-  geom_point(alpha = 0.1, shape = 16) +
-  geom_smooth(method = lm, se = F) +
-  ggtitle("Accuracy: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
diff --git a/ccn2019-correct.R b/ccn2019-correct.R
deleted file mode 100644
index 9b9d9fb..0000000
--- a/ccn2019-correct.R
+++ /dev/null
@@ -1,148 +0,0 @@
-#==================================================#
-# model the "correct" column
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(pls)
-
-#devtools::install_github("sachsmc/plotROC")
-library(plotROC)
-
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-inspect_cat(seqs.imputed)
-inspect_num(seqs.imputed)
-
-seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  classProbs = T,
-  verboseIter = T,
-  savePredictions = T,
-  sampling = "down",
-  selectionFunction = "oneSE"
-)
-
-pls.new_model <- train(
-  correct ~ .-a-al-dp-cr-rt,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
-
-pls.common_model <- train(
-  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.common_model)
-plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
-
-
-#trellis.par.set(caretTheme())
-#densityplot(pls.new_model, pch = "|")
-#densityplot(pls.common_model, pch = "|")
-
-# Compile models and compare performance
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-#DEBUG summary(pls.models)
-#DEBUG dotplot(pls.models)
-#DEBUG diffValues <- diff(pls.models)
-bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
-
-
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
-
-confusionMatrix(pls.new_predicted, test_data$correct)
-confusionMatrix(pls.common_predicted, test_data$correct)
-
-library(pROC)
-par(pty="s")
-
-roc(test_data$correct,
-    pls.common_predicted_prob$CORRECT,
-    plot = T,
-    legacy.axes=T,
-    lwd=2,
-    col="darkgrey",
-    lty = 3,
-    print.auc = T,
-    print.auc.y = 45,
-    print.auc.x = 55,
-    percent = T,
-    ci = T,
-    boot.n = 100
-    )
-
-
-
-# roc_test_indices <- createDataPartition(test_data$correct,
-#                                         times = 10,
-#                                         p = 0.9,
-#                                         list = F)
-
-#for (i in 1:ncol(roc_test_indices)) {
-#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
-#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
-
-#  plot.roc(test_sample_correct,
-#           predprob_sample_correct,
-roc(test_data$correct,
-    pls.new_predicted_prob$CORRECT,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    add=T,
-    of = "se",
-    boot.n = 100,
-    ci = T)
-
-#}
-
-legend(100,100, legend=c("New Model", "Common Model"),
-       col=c("black", "darkgray"), lty=c(1,1), lwd=2, cex=0.9)
-
-# requires plotROC package
-#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
-#DEBUG   geom_roc()
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R
deleted file mode 100644
index 5bcefba..0000000
--- a/ccn2019-criterion.R
+++ /dev/null
@@ -1,119 +0,0 @@
-#==================================================#
-# model the "cr" column (decision criterion)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  cr ~ .-a-al-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-
-ggplot(varImp(pls.new_model)) +
-  labs(title="Criterion - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
-# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; note the training-set mean
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1, shape = 16) +
-  geom_smooth(method = glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) 
diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R
deleted file mode 100644
index ae5dc98..0000000
--- a/ccn2019-featsel.R
+++ /dev/null
@@ -1,38 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- ROSE(f, data = seqs.train.balanced)$data
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-
-ctrl <- rfeControl(functions = lrFuncs,
-                   method = "cv",
-                   number = 10,
-                   verbose = T)
-
-rmProfile <- rfe(seqs.train.x, seqs.train.y,
-                 preProcess = c("nzv"),
-                 rfeControl = ctrl)
-
-rmProfile
type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - 
-library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, 
train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = 
diff --git a/ccn2019-rt.R b/ccn2019-rt.R
deleted file mode 100644
index 4f6defb..0000000
--- a/ccn2019-rt.R
+++ /dev/null
@@ -1,117 +0,0 @@
-#==================================================#
-# model the "RT" column
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  rt ~ .-a-al-cr-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
-# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; note the training-set mean
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1, shape = 16) +
-  geom_smooth(method = glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
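Editor's note: ccn2019-rt.R above rebalances the training set with ROSE on `correct` and then fits regressions for `rt` and `cr` on the synthetic rows; whether synthetic balancing is desirable for a continuous outcome is debatable. A small sketch of what the ROSE step actually does, assuming the `train_data` frame built in the script:

```r
library(ROSE)

table(train_data$correct)  # imbalanced: many more CORRECT than INCORRECT rows

# ROSE draws a synthetic, roughly class-balanced sample conditioned on `correct`
train_data.balanced <- ROSE(correct ~ ., data = train_data, seed = 1)$data

table(train_data.balanced$correct)  # now approximately 50/50
```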
diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
                     number=10,
-#                    repeats = 1, #repeatedcv
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-tune <- expand.grid(C = seq(0,5,by=0.25))
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc = c("center", "scale"),
-               tuneLength = 10,
-               tuneGrid = tune,
-               trControl = ctrl)
-
-model$bestTune
-plot(model)
-
-seqs.test.y <- model %>% predict(seqs.test.x)
-seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
-
-confusionMatrix(seqs.test.y, seqs.test.observed_y)
-
-plot(varImp(model, scale = F, useModel = F))
-
-library(pROC)
-
-roc(seqs.test.observed_y,
-    seqs.test.y_prob$YES,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    of = "se",
-    boot.n = 200,
-    ci = T)
-
-
-# RT
-# data.frame(
-#   RMSE = RMSE(y.test, seqs.test$correct),
-#   Rsquare = R2(y.test, seqs.test$correct)
-# )
-
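Editor's note: in the deleted ccn2019-svm.R, `ctrl` is built without `classProbs = TRUE`, yet the script later calls `predict(..., type = "prob")` and feeds `$YES` into `roc()`; with caret, that predict call fails unless the model was trained with class probabilities enabled (and the outcome levels are valid R names such as `YES`/`NO`, as the ROC call implies). A sketch of a control object that would support the ROC step:

```r
library(caret)

ctrl <- trainControl(method = "cv",
                     number = 10,
                     sampling = "up",
                     classProbs = TRUE,   # required before predict(..., type = "prob")
                     savePredictions = TRUE,
                     verbose = TRUE)
```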
diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd
deleted file mode 100644
index 9220ff4..0000000
--- a/ccn2019.rev0.Rmd
+++ /dev/null
@@ -1,581 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: inline
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)
-  - local vs. global properties: which one matters the most?
-  - Local: lumpiness, short sequence patterns -> could be exploited by "reactive" automaticity
-  - Global: no lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n-N,\,sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t,\,sd=4)| \\
-x_{tl} &= {T \times w \over trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - {T \over trials} \times w,\,sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= {trials \over |V|} \\
-x_{sl} &= \max(1, {w \over |V|}) \\
-x_{g} &= {trials \over w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
-
-```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(plsRglm)
-library(plsdof)
-library(caret)
-```
-
-```{r params}
-load('./data/CL2015.RData')
-
-window_size <- 8
-```
-
-
-
-```{r history}
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-with_history <- function(stimuli, length=16, fixed=F) {
-  seq <- paste(stimuli, collapse = '')
-
-  sapply(1:length(stimuli), function(i) {
-    stri_reverse(str_sub(seq, max(1,i-length+1), i))
-  })
-  #ifelse(fixed, h[str_length(h)==size], h)
-}
-
-# $x_{s,local}$
-with_skewness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    sum(sort(freqs, decreasing = T)[1:2]) - 1
-  })
-}
-
-# $x_{u,local}$
-with_lumpiness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    max(freqs) - 1
-  })
-}
-
-
-with_targets_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="target"]) / length(trials)
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="lure"]) / length(trials)
-  })
-}
-
-#TODO change to a list-column workflow with broom for model fitting and evaluating the fits;
-# we are using a list nested inside a tibble, so put all new columns in a new list column
-# instead of adding a new column for each.
-NB2 <- NB %>%
-  group_by(participant, condition, block) %>%
-  nest() %>% unnest(data) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(history = with_history(stimulus, window_size)) %>%
-  mutate(x_sl = with_skewness(history)) %>%
-  mutate(x_ul = with_lumpiness(history)) %>%
-  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
-  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
-  ungroup()
-
-pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
-NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# caret
-library(caret)
-# Compile cross-validation settings
-
-
-any(is.na(NB2))
-NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], 3)
-plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], 3)
-
-
-plsResult
-```
-
-
-
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ skewness score
-- $U$ uniformity (!repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
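Editor's note: a quick numeric check of the `dnorm`-based weights defined in the CSP section above. The weight is smallest when a constraint is exactly satisfied and approaches 1 as the deviation grows, which matches the "to be minimized" framing; a sketch (the helper `w` is hypothetical):

```r
# W = 1 - |10 * dnorm(delta, sd = 4)|, where delta is the deviation from
# the constrained value (e.g. x_t - T*trials for the target-count weight).
w <- function(delta) 1 - abs(10 * dnorm(delta, sd = 4))

w(0)   # ~0.003 : constraint exactly satisfied, minimal cost
w(4)   # ~0.395 : deviation of one sd
w(12)  # ~0.989 : large deviations saturate near 1
```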
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # Python sketch still to be ported:
-  #   trials = len(seq)
-  #   freqs = [float(seq.count(c)) for c in choices]
-  #   ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #   return ralph_skewed
-  NA
-}
-
-#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''), from=-size, length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct, na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-
-fit <- lm(correct ~ t * s * u * l * d, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN clustering (RT + accuracy against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 16 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(t,s,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=u)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(t),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(u),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-        constraint3=fitness(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
-
-
-
-
-```{python}
-a=2
-p
-```
-
-
-
----
-title: "PLS Training"
-output: html_notebook
----
-
-PLS:
-
-
-```{r}
-#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
-library(tidyverse)
-library(pls)
-
-## 1. load sample data
-#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
-
-rm(NB)
-load("./data/CL2015.RData")
-data <- NB
-str(data)
-
-## 2. clean data (drop unused columns)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-#   rename(
-#     ev.participant=participant,
-#     ev.n=n,
-#     ev.block=block,
-#     ev.stimulus_type=stimulus_type,
-#     rv.choice=choice,
-#     rv.rt=rt,
-#     rv.correct=correct
-#   )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with the lowest CV error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. sort and visualize top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct,accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV; nt is the maximum number of components
-#cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
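Editor's note: the "PLS Training" notebook above picks `ncomp` by hand from the adjusted-CV RMSEP curve. For single-response models, the pls package also provides `selectNcomp()`, which formalizes the same choice with a one-sigma (or randomization) rule; a minimal sketch on simulated data, mirroring the notebook's `simulate` chunk but with a single response:

```r
library(pls)

set.seed(1)
X <- matrix(rnorm(1100), 100, 11)
y <- rnorm(100)

fit <- plsr(y ~ X, validation = "CV")

# smallest ncomp whose CV error is within one SE of the minimum
best_dims <- selectNcomp(fit, method = "onesigma")
fit <- plsr(y ~ X, ncomp = max(best_dims, 1))  # selectNcomp() can return 0
```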
-densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model 
Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), 
main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = 
diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R
deleted file mode 100644
index ae5dc98..0000000
--- a/ccn2019-featsel.R
+++ /dev/null
@@ -1,38 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-seqs <- seqs %>% drop_na(rt, correct, tl,sl)
-
-f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- ROSE(f, data = seqs.train.balanced)$data
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-
-ctrl <- rfeControl(functions = lrFuncs,
-                   method = "cv",
-                   number = 10,
-                   verbose = T)
-
-rmProfile <- rfe(seqs.train.x, seqs.train.y,
-                 preProcess = c("nzv"),
-                 rfeControl = ctrl)
-
-rmProfile
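`rfe` returns more than the default printout: the winning variable subset and the performance-vs-subset-size profile are worth pulling out explicitly. A sketch against the `rmProfile` object above (`predictors()` and the `rfe` plot method are standard caret accessors):

```r
# Sketch: examine the RFE profile.
predictors(rmProfile)            # variables in the selected subset
rmProfile$optsize                # chosen subset size

# Resampled performance across candidate subset sizes
plot(rmProfile, type = c("g", "o"))
```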
diff --git a/ccn2019-rt.R b/ccn2019-rt.R
deleted file mode 100644
index 4f6defb..0000000
--- a/ccn2019-rt.R
+++ /dev/null
@@ -1,117 +0,0 @@
-#==================================================#
-# model the "rt" column (reaction time)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize the split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_bar(position='dodge') +
-  labs(title="Class balance: ROSE-balanced train vs. test")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  rt ~ .-a-al-cr-dp-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
-
-# NOTE: the original script trained the common model on `cr` (criterion),
-# apparently a copy-paste slip in an RT script; corrected to `rt` here.
-pls.common_model <- train(
-  rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE, R^2, and RMSE
-#
-# SSE <- sum((test_data$rt - pls.new_predicted)^2)    # sum of squared errors
-# SST <- sum((test_data$rt - mean(train_data$rt))^2)  # total sum of squares, computed with the training mean
-# R_square <- 1 - SSE/SST
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-# SSE <- sum((test_data$rt - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-# axis limits from the criterion script dropped; set them to the RT scale if needed
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
-  ggplot(aes(predicted, observed)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=lm, se=F) +
-  ggtitle("Reaction Time: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")

diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl,sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
-                     number=10,
-#                    repeats = 1, #repeatedcv
-                     classProbs = T, # required for the type="prob" predictions below
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-tune <- expand.grid(C = seq(0.25, 5, by=0.25)) # the SVM cost C must be positive, so start above zero
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc = c("center", "scale"),
-               tuneGrid = tune,
-               trControl = ctrl)
-
-model$bestTune
-plot(model)
-
-seqs.test.y <- model %>% predict(seqs.test.x)
-seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
-
-confusionMatrix(seqs.test.y, seqs.test.observed_y)
-
-plot(varImp(model, scale = F, useModel = F))
-
-library(pROC)
-
-roc(seqs.test.observed_y,
-    seqs.test.y_prob$YES,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    of = "se",
-    boot.n = 200,
-    ci = T)
-
-
-# RT
-# data.frame(
-#   RMSE = RMSE(y.test, seqs.test$correct),
-#   Rsquare = R2(y.test, seqs.test$correct)
-# )
-
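A linear kernel is a reasonable first pass; if the decision boundary is nonlinear, caret's `svmRadial` adds a kernel-width parameter tuned jointly with the cost. A hedged sketch reusing the `ctrl` object above (the `sigma`/`C` grid is illustrative, not tuned):

```r
# Sketch: radial-basis SVM over a joint sigma/C grid.
tune.rbf <- expand.grid(sigma = c(0.001, 0.01, 0.1),
                        C     = c(0.25, 0.5, 1, 2, 4))

model.rbf <- train(seqs.train.x,
                   seqs.train.y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneGrid = tune.rbf,
                   trControl = ctrl)

model.rbf$bestTune
```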
diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd
deleted file mode 100644
index 9220ff4..0000000
--- a/ccn2019.rev0.Rmd
+++ /dev/null
@@ -1,581 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: inline
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behavior. Under a specified structure, these biases allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
-  - Local vs. global properties: which matters most?
-    - Local: lumpiness, short sequence patterns -> could be exploited by "reactive"/automatic strategies
-    - Global: no lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_n, x_t, x_{tl}, x_l, x_{ll}, x_v, x_{ul}, x_s, x_{sl}, x_g, x_{vl}\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times \mathrm{dnorm}(x_n - N,\, sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times \mathrm{dnorm}(T \times trials - x_t,\, sd=4)| \\
-x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times \mathrm{dnorm}(x_{tl} - \tfrac{T}{trials} \times w,\, sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= \frac{trials}{|V|} \\
-x_{sl} &= \max\!\left(1, \frac{w}{|V|}\right) \\
-x_{g} &= \frac{trials}{w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
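To make the weighting concrete: each soft constraint scores a candidate sequence by how far a statistic falls from its target, with a Gaussian cost that is near zero at the optimum and approaches one far from it (these are costs to be minimized). A small sketch of the target-count cost $W_t$; the `dnorm` form and the `sd = 4` width come from the formulas above, the argument values are purely illustrative.

```r
# Sketch: Gaussian cost for the target count (near 0 at the target, toward 1 away from it).
# x_t: observed number of targets; T: desired target ratio; trials: sequence length.
target_cost <- function(x_t, T, trials) {
  1 - abs(10 * dnorm(T * trials - x_t, sd = 4))
}

target_cost(x_t = 8, T = 1/3, trials = 24)  # at the optimum: cost near 0
target_cost(x_t = 2, T = 1/3, trials = 24)  # six targets short: cost rises toward 1
```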
-
-```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(plsRglm)
-library(plsdof)
-library(caret)
-```
-
-```{r params}
-load('./data/CL2015.RData')
-
-window_size <- 8
-```
-
-
-
-```{r history}
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-with_history <- function(stimuli, length=16, fixed=F) {
-  seq <- paste(stimuli, collapse = '')
-
-  sapply(1:length(stimuli), function(i) {
-    stri_reverse(str_sub(seq, max(1,i-length+1), i))
-  })
-  #ifelse(fixed, h[str_length(h)==size], h)
-}
-
-# $x_{s,local}$
-with_skewness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    sum(sort(freqs, decreasing = T)[1:2]) - 1
-  })
-}
-
-# $x_{u,local}$
-with_lumpiness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    max(freqs) - 1
-  })
-}
-
-
-with_targets_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="target"]) / length(trials)
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="lure"]) / length(trials)
-  })
-}
-
-#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
-# (we are already using lists nested inside a tibble, so put all new columns in a single
-# list column instead of adding a new column for each; see the sketch after this chunk)
-NB2 <- NB %>%
-  group_by(participant, condition, block) %>%
-  nest() %>% unnest(data) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(history = with_history(stimulus, window_size)) %>%
-  mutate(x_sl = with_skewness(history)) %>%
-  mutate(x_ul = with_lumpiness(history)) %>%
-  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
-  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
-  ungroup()
-
-pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
-NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-any(is.na(NB2))
-NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-plsResult.rt <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], nt=3)
-plsResult.correct <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], nt=3)
-
-plsResult.rt
-plsResult.correct
-```
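The TODO above asks for a list-column workflow. A minimal sketch of what that could look like with tidyr/purrr/broom: fit one model per participant and condition, keeping the tidied fits alongside the nested data. Column names follow `NB2`; the `lm` formula is illustrative only.

```r
# Sketch: per-group model fits via list columns (tidyr + purrr + broom).
library(broom)

NB2.fits <- NB2 %>%
  group_by(participant, condition) %>%
  nest() %>%                                                      # one row per group, data in a list column
  mutate(fit    = map(data, ~ lm(rt ~ x_sl + x_ul + x_t + x_l, data = .x)),
         tidied = map(fit, tidy)) %>%
  unnest(tidied)                                                  # coefficient table per group
```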
-
-
-
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ skewness score
-- $U$ uniformity (the inverse of repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # Python reference implementation, not yet ported:
-  #   trials = len(seq)
-  #   freqs = [float(seq.count(c)) for c in choices]
-  #   ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #   return ralph_skewed
-  NA
-}
-
-# NOTE: ga() maximizes its fitness, so pass the negated cost
-#merged_cost <- function(x) targets_cost(x) + lures_cost(x) + skewness_cost(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # TODO find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: t, s, u, l, and d are not columns of NB_modified; using the computed scores instead
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN clustering (RT + accuracy against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
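`eps = 0.3` is a guess; the dbscan package's recommended heuristic is to read `eps` off the knee of a k-nearest-neighbor distance plot. A short sketch, assuming the dbscan package as loaded above (`minPts = 3` implies `k = 3`):

```r
# Sketch: pick eps from the knee of the k-NN distance curve.
library(dbscan)

kNNdistplot(cbind(NB_avg$correct, NB_avg$rt), k = 3)
abline(h = 0.3, lty = 2)  # the eps used above, for comparison
```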
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot)
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-## (cv.modpls is needed below by cvtable, so it is no longer commented out)
-cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8)
-
-modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-

diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ skewness score
-- $U$ uniformity (the inverse of repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # Python reference implementation, not yet ported:
-  #   trials = len(seq)
-  #   freqs = [float(seq.count(c)) for c in choices]
-  #   ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #   return ralph_skewed
-  NA
-}
-
-# NOTE: ga() maximizes its fitness, so pass the negated cost
-#merged_cost <- function(x) targets_cost(x) + lures_cost(x) + skewness_cost(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # TODO find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: t, s, u, l, and d are not columns of NB_modified; using the computed scores instead
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN clustering (RT + accuracy against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-## NOTE: the original used t/s/u/l shorthand, which are not columns of NB_modified;
-## replaced with the computed score columns (targets, skewness, lumpiness, lures)
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-   - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-     constraint3=fitness3(history))
-   - kmeans(NB) (see the sketch below)
-   - ggplot(kmeans_clusters$accuracy)
-   - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a = 2
-print(a)
-```
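The TODO above asks for a k-means pass analogous to the DBSCAN clustering. A hedged sketch on the participant-level averages; `centers = 3` is arbitrary, and the inputs are scaled first because `rt` and `correct` are on different units.

```r
# Sketch: k-means on the participant-level averages (k chosen arbitrarily).
set.seed(42)

km <- kmeans(scale(NB_avg[, c("correct", "rt")]), centers = 3)

NB_avg %>%
  mutate(kcluster = factor(km$cluster)) %>%
  ggplot(aes(skewness, correct, color = kcluster)) +
  geom_point(alpha = 0.3) +
  facet_wrap(~condition)
```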
-inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, 
- method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - 
seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - 
summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = 
"|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical 
properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. 
-NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a
-## single subject in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct, na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-
-# NOTE: the t/s/u/l/d aliases were stale and d' is never computed in this
-# notebook, so fit on the columns that actually exist in NB_modified
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(paste0("Clusters (window = ", window_size, " trials)"), "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets, skewness, color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt, correct, color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, pc2, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, rt, color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, correct, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-    constraint3=fitness3(history))
-  - kmeans(NB) (see the sketch below)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a=2
-```
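-
-A minimal sketch of the k-means idea from the TODO list (illustration only; it assumes the participant-level `NB_avg` frame computed above, and `centers = 2` is an arbitrary choice):
-
-```{r, eval=F}
-set.seed(1)
-# cluster participants on standardized accuracy/RT, then color points by cluster
-km <- kmeans(scale(NB_avg[, c("rt", "correct")]), centers = 2)
-
-NB_avg %>%
-  mutate(cluster = factor(km$cluster)) %>%
-  ggplot(aes(rt, correct, color = cluster)) +
-  geom_point(alpha = 0.5) +
-  facet_wrap(~condition)
-```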
-
-
-
----
-title: "PLS Training"
-output: html_notebook
----
-
-PLS:
-
-
-```{r}
-#detach("package:MASS") # MASS (via plsdof) masks dplyr::select
-library(tidyverse)
-library(pls)
-
-## 1. load sample data
-#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
-
-rm(NB)
-load("./data/CL2015.RData")
-data <- NB
-str(data)
-
-## 2. clean data (drop design columns that should not enter the model)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-#   rename(
-#     ev.participant=participant,
-#     ev.n=n,
-#     ev.block=block,
-#     ev.stimulus_type=stimulus_type,
-#     rv.choice=choice,
-#     rv.rt=rt,
-#     rv.correct=correct
-#   )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with the lowest CV error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. sort and visualize the top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot)  # corrplot is used below but was never attached
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-# (cv.modpls must be fit for the cvtable() call below to work)
-cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
-
diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
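-
-To make the uniformity constraint concrete, here is a toy check of the deviation-from-uniform logic used by `skewness_cost` in the next chunk (illustration only):
-
-```{r, eval=F}
-# 8 trials over 4 choices; a perfectly uniform sequence would have 2 of each
-x <- c("A","A","A","B","B","C","C","D")
-table(x)                        # A=3, B=2, C=2, D=1
-# skewness_cost(x, c("A","B","C","D")) returns max(|count - 2|) = 1
-```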
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-lures_cost   <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to a gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # flag "Ralph-skewed" sequences: the most frequent half of the choices
-  # accounts for more than 2/3 of all trials (translated from the Python
-  # sketch that was left here as comments)
-  freqs <- sapply(choices, function(c) sum(x == c))
-  top_half <- sort(freqs, decreasing = TRUE)[1:max(1, length(choices) %/% 2)]
-  sum(top_half) > (length(x) * 2 / 3)
-}
-
-#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]  # stri_length: stringr is not loaded here
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # TODO find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c(as.character(stims[1]))  # seed with the first stimulus so it is not dropped
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''), from=-size, length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
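-
-# (illustration only) 2-back lure tagging on a toy sequence: positions 2 and 5
-# repeat the 1-back / 3-back stimulus, so both are relabeled as lures
-#   h <- with_history(c("A","A","B","C","A"), size=4)
-#   with_lures(rep("2-back", 5), c("A","A","B","C","A"), rep("foil", 5), h)
-#   => "foil" "lure" "foil" "foil" "lure"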
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a
-## single subject in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct, na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-
-# NOTE: the t/s/u/l/d aliases were stale and d' is never computed in this
-# notebook, so fit on the columns that actually exist in NB_modified
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(paste0("Clusters (window = ", window_size, " trials)"), "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets, skewness, color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt, correct, color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, pc2, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, rt, color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, correct, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-    constraint3=fitness3(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a=2
-```
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
---- 
-
-# Problems
-
-Statistical properties of n-back sequences bias behavior. Under a specified structure, these biases allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- It is unclear how to parameterize interesting variations for sequence generation.
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
-  - Local vs. global properties: which matters most?
-  - Local: lumpiness and short sequence patterns -> could be exploited by "reactive"/automatic responding
-  - Global: no lures and a large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
-x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= \frac{trials}{|V|} \\
-x_{sl} &= \max(1, \frac{w}{|V|}) \\
-x_{g} &= \frac{trials}{w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-#library(plsRglm)
-#library(plsdof)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-
-```
-
-```{r preprocessing}
-
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures, stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back', 2, 3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T, .size=window_size),
-         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
-         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
-         sl = ifelse(is.na(sl), 0, sl),
-         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
-         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
-         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus, collapse = ''))) %>%
-  # use map_int/map_dbl so the block-level stats come out as plain numeric
-  # columns rather than list columns (they are used as predictors below)
-  mutate(a = map_int(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_int(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_int(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_int(local_stats, ~length(unique(.x$stimulus)))) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition)
-
-inspectdf::inspect_cor(seqs)
-#inspect_cor(NB, show_plot = T)
-```
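-
-As a quick illustration of the rolling-window statistics above (this assumes the pre-`slider` tsibble API, whose `slide_dbl` takes `.size`/`.partial` as used in this notebook):
-
-```{r, eval=F}
-# rolling 2-trial target counts; .partial=TRUE also evaluates the short
-# leading window, so the first value is computed from a single trial
-slide_dbl(c("target","foil","target","target"),
-          ~length(which(. == "target")), .size = 2, .partial = TRUE)
-#> 1 1 1 2
-```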
-
-```{r}
-# NOTE: the modeling chunks below originally referred to NB2 from the rev0
-# notebook; rebuild an equivalent trial-level frame here (assumption: these
-# models were meant to run on per-trial rows) by unnesting the local stats
-NB2 <- seqs %>% unnest(local_stats)
-
-model1 <- NB2 %>%
-  glm(rt ~ t + n + a, data = ., family = "gaussian")
-aug1 <- augment(model1)
-aug1 %>%
-  ggplot(aes(a, rt)) +
-  geom_point() +
-  geom_smooth(aes(y = .fitted), color = 'red')
-```
-
-```{r}
-model2 <- NB2 %>%
-  glm(rt ~ t + n + a + al + s + sl + ll + l, data = ., family = "gaussian")
-aug2 <- augment(model2)
-aug2 %>%
-  ggplot(aes(jitter(al), rt)) +
-  geom_point(alpha=0.2, shape=18) +
-  xlab("local accuracy") +
-  geom_smooth(aes(y = .fitted), color = 'blue') +
-  geom_smooth(aes(y = aug1$.fitted), color = 'red')
-
-```
-
-```{r models}
-
-nb_split <- initial_split(NB2, prop = 0.75)
-training_data <- training(nb_split)
-testing_data <- testing(nb_split)
-cv_split <- vfold_cv(training_data, v = 5)
-cv_data <- cv_split %>%
-  mutate(
-    train = map(splits, ~training(.x)),
-    validate = map(splits, ~testing(.x))
-  )
-
-cv_models_lm_a <- cv_data %>%
-  mutate(model = map(train, ~lm(formula = a~., data = .x)),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_glm_a <- cv_data %>%
-  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_pls_a <- cv_data %>%
-  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
-         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate = "adjCV", , ]) - 1)) %>%
-  # map2 pairs each fold's training set with that fold's best dimensionality
-  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)))
-
-head(cv_models_pls_a)
-
-
-cv_models_pls_a1 <- cv_data[3][[1]]
-
-
-# per-participant stimulus frequencies
-NBx <- NB %>%
-  group_by(participant, stimulus) %>%
-  summarise(freq = n())
-
-ggplot(NBx, aes(stimulus, freq, group=participant)) +
-  geom_point()
-
-
-#%>%
-#  mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
-#mutate(model.pca = map(design_matrix, ~prcomp(~rt, .x, center=T, scale.=T, na.action=na.exclude))) %>%
-#mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-#any(is.na(NB2))
-#NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], 3)
-#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], 3)
-
-```
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: an extra step for non-PLS methods to remove highly correlated predictors.
-# cor() needs numeric input, so factor columns are excluded first.
-cor_matrix <- cor(seqs %>% select(where(is.numeric)), use = "pairwise.complete.obs")
-cor_high <- findCorrelation(cor_matrix, cutoff = 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
-```
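Downstream of this step, the decorrelated predictor set can feed non-PLS baselines directly. A minimal sketch (assuming `seqs.uncorr` still carries the `correct` factor; the outcome-like columns are dropped first, and `baseline_fit` is a hypothetical name):

```{r, eval=F}
# logistic-regression baseline on the decorrelated predictors (illustrative only)
baseline_df <- seqs.uncorr %>% select(-any_of(c("rt", "a", "al", "dp", "cr")))
baseline_fit <- glm(correct ~ ., data = baseline_df, family = binomial)
summary(baseline_fit)
```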
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R
deleted file mode 100644
index 5bcefba..0000000
--- a/ccn2019-criterion.R
+++ /dev/null
@@ -1,119 +0,0 @@
-#==================================================#
-# model the criterion column (cr)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  cr ~ .-a-al-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-
-ggplot(varImp(pls.new_model)) +
-  labs(title="Criterion - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
-# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-
ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - 
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  rt ~ .-a-al-cr-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
-
-pls.common_model <- train(
-  rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
-# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$rt - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$rt - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$rt - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
-  ggplot(aes(predicted, observed)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=glm) +
-  ggtitle("Reaction Time: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
-                     number=10,
-#                     repeats = 1, #repeatedcv
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-# note: the SVM cost parameter must be positive, so the grid starts at 0.25
-tune <- expand.grid(C = seq(0.25, 5, by=0.25))
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc
= c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- 
table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
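-
-To see that shape, a quick sketch (using the same `dnorm`-based cost the chunk below defines) plots the inverted-Gaussian penalty: it bottoms out at zero deviation, behaves like a quadratic bowl near the minimum, and saturates at 1.0 for large deviations:
-
-```{r, eval=F}
-# cost is lowest at zero deviation from the constraint and approaches 1.0 far away
-curve(1.0 - 10 * dnorm(x, mean = 0, sd = 4), from = -15, to = 15,
-      xlab = "deviation from constraint target", ylab = "cost")
-```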
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (drop design columns that are not model features)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-#  rename(
-#    ev.participant=participant,
-#    ev.n=n,
-#    ev.block=block,
-#    ev.stimulus_type=stimulus_type,
-#    rv.choice=choice,
-#    rv.rt=rt,
-#    rv.correct=correct
-#  )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with lowest cv error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. Sort, and visualize top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot)
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8)
-
-modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
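The fit-then-`RMSEP`-then-refit pattern above recurs across these notebooks; a small helper makes it reusable (a sketch assuming a single numeric response and `pls::plsr`'s formula interface; `plsr_best` is a hypothetical name):

```{r, eval=F}
library(pls)

# choose ncomp by adjusted-CV RMSEP, then refit once with that size (sketch)
plsr_best <- function(formula, data) {
  fit0 <- plsr(formula, data = data, validation = "CV")
  best <- which.min(RMSEP(fit0)$val["adjCV", , ]) - 1
  plsr(formula, data = data, ncomp = max(1, best))
}

# usage: plsr_best(rt ~ ., data = data)
```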
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ 
-#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- 
createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - 
mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- 
createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - 
-plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - 
print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with 
broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
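The cost functions in the next chunk implement this with `dnorm`: the penalty is near zero when a sequence property matches its target value and saturates toward 1 as the deviation grows. A minimal sketch of that shape (the deviation grid here is illustrative, not part of the original analysis):

```{r cost_shape, eval=FALSE}
# Sketch: a dnorm-based constraint cost is minimal at zero deviation
# from the target count and approaches 1 for large deviations.
deviation <- seq(-10, 10, by = 0.5)
cost <- 1.0 - 10 * dnorm(deviation, mean = 0, sd = 4)
plot(deviation, cost, type = "l",
     xlab = "deviation from target count", ylab = "cost")
```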
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
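The commented-out `merged_cost` in the chunk below hints at how the per-constraint costs combine into a single objective. A hypothetical sketch of that idea, assuming dnorm-based costs as above (the goal counts and the `sequence_cost` helper are illustrative assumptions, not the original implementation):

```{r merged_cost_sketch, eval=FALSE}
# Hypothetical sketch: sum dnorm-based costs over two constraints and
# compare candidate sequences; a lower total cost means a better sequence.
# The goal counts (8 targets, 4 lures) are illustrative assumptions.
gauss_cost <- function(deviation) 1.0 - 10 * dnorm(deviation, mean = 0, sd = 4)

sequence_cost <- function(n_targets, n_lures, target_goal = 8, lure_goal = 4) {
  gauss_cost(n_targets - target_goal) + gauss_cost(n_lures - lure_goal)
}

sequence_cost(8, 4)   # counts match the goals -> near-minimal cost
sequence_cost(12, 1)  # deviant counts -> substantially higher cost
```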
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
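As a concrete illustration of such properties, a toy 3-back sequence and two of the statistics used throughout these notebooks (the sequence is made up for illustration, not drawn from the data):

```{r sequence_stats_sketch, eval=FALSE}
# Illustrative toy example: target ratio and lumpiness of a 3-back sequence.
toy_seq <- c("A", "B", "C", "A", "B", "C", "C", "B")
n <- 3
is_target <- sapply(seq_along(toy_seq), function(i)
  i > n && toy_seq[i] == toy_seq[i - n])
mean(is_target)          # proportion of targets (here 0.5)
max(table(toy_seq)) - 1  # lumpiness: repeats of the most frequent stimulus
```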
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+    )
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100,100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
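The two `roc()` calls above only overlay the curves and print their AUCs; pROC can also test whether the AUCs differ. The sketch below reuses the same objects from the script; the paired DeLong test is an addition for illustration, not something the committed file does.

```r
# Rebuild the two ROC objects and compare their AUCs formally.
library(pROC)
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT)
auc(roc.new)
auc(roc.common)
roc.test(roc.new, roc.common, method = "delong")  # paired: same test responses
```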
geom_roc() diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - 
-inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) 
- -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + 
vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = 
c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - 
print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - 
}) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(pls.models) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" column (response criterion) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - 
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" columns (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; use the training mean here
+R_square.new <- 1 - SSE/SST
+RMSE.new <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square.old <- 1 - SSE/SST
+RMSE.old <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=lm, se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
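The manual SSE/RMSE block in ccn2019-accuracy.R can be cross-checked with caret's built-in `postResample()`, which returns RMSE, R², and MAE in one call. Note that caret's R² is the squared correlation between predictions and observations, so it can differ from the 1 − SSE/SST definition used above. A sketch against the same objects:

```r
# Cross-check of the hand-rolled metrics above (sketch).
postResample(pred = pls.new_predicted, obs = test_data$a)
postResample(pred = pls.old_predicted, obs = test_data$a)
```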
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+)
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100, 100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3), lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
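The two ROC curves in ccn2019-correct.R are compared only visually. pROC can also test the AUC difference directly; a sketch, assuming the two `roc` objects are stored instead of drawn inline (`roc.new` and `roc.common` are names introduced here, not in the original script):

```r
# Formal AUC comparison for the two correctness models (sketch).
# roc.test() defaults to DeLong's test for paired curves.
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT, percent = TRUE)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT, percent = TRUE)

roc.test(roc.new, roc.common, paired = TRUE)  # H0: the two AUCs are equal
```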
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" (criterion) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize the class balance of the split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Train/Test Class Balance")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; use the training mean here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
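The commented-out SSE/RMSE block in ccn2019-criterion.R repeats the same arithmetic for each model. If it is ever revived, a small helper keeps both definitions in one place (a sketch; `test_metrics` is not in the original script):

```r
# Test-set R^2 (relative to the training mean) and RMSE in one helper.
test_metrics <- function(predicted, observed, train_mean) {
  sse <- sum((observed - predicted)^2)
  sst <- sum((observed - train_mean)^2)
  c(R_square = 1 - sse / sst,
    RMSE = sqrt(sse / length(predicted)))
}

test_metrics(pls.new_predicted,    test_data$cr, mean(train_data$cr))
test_metrics(pls.common_predicted, test_data$cr, mean(train_data$cr))
```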
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - 
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") 
-pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = 
pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- 
seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, 
-3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
deleted file mode 100644
index ba1b312..0000000
--- a/ccn2019.rev3.Rmd
+++ /dev/null
@@ -1,100 +0,0 @@
-
-$P=\langle V,D,C,W \rangle$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-#! ===============================================
-#! load required packages
-
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-library(inspectdf)
-library(caTools)
-library(pROC)
-
-#! ===============================================
-#! load data set and set running window size
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-```
-
-
-```{r preprocessing}
-
-#! ===============================================
-#! A function to mark lures in a sequence
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures, stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-#! ===============================================
-#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al
-#! a and al are respectively accuracy and recent accuracy
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
-         ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size),
-         vl = slide_dbl(lag(stimulus), ~n_distinct(.x, na.rm=T), .partial=T, .size=window_size),
-         al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         sl = ifelse(is.na(sl), NA, sl),
-         tl = ifelse(is.na(tl), NA, tl),
-         ll = ifelse(is.na(ll), NA, ll),
-         al = ifelse(is.na(al), NA, al),
-         ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul)
-  ) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
-  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
-  # mutate(dp = map_dbl(local_stats,
-  #                     ~length(which(.x$stimulus_type=="target" & .x$correct)) -
-  #                       length(which(.x$stimulus_type!="target" & !.x$correct)))) %>%
-  # mutate(cr = map_dbl(local_stats,
-  #                     ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) +
-  #                          length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>%
-  #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition) %>%
-  unnest(local_stats) %>%
-  mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>%
-  mutate(stimulus = factor(stimulus)) %>%
-  mutate(stimulus_type = factor(stimulus_type))
-
-save(seqs, file=here("notebooks/data/nback_seqs.Rd"))
-
-#! ===============================================
-#! visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
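The sliding features above (`tl`, `ll`, `sl`, `ul`, `vl`, `al`) all follow one pattern: a trailing window of `window_size` trials summarised into a single number per trial (here over lagged vectors, so the current trial is excluded). A toy sketch of that pattern on a made-up vector, assuming the tsibble version used in this notebook, where `slide_dbl` takes `.size`/`.partial` (newer releases moved these helpers to the slider package):

```r
library(tsibble)  # slide_dbl(..., .size, .partial) as used above

x <- c("A", "B", "A", "A", "C", "B", "A")
# Trailing window of 4 trials; windows shorter than 4 are still evaluated.
# Per-trial "lumpiness": repeats of the modal item, as in the ul feature.
slide_dbl(x, ~max(table(.x)) - 1, .size = 4, .partial = TRUE)
# expected under those semantics: 0 0 1 2 1 1 1
```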
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-PLS methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE, R-squared, and RMSE on the held-out test set
+
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; use the training mean here
+
+SSE.new <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+R_square.new <- 1 - SSE.new/SST
+RMSE.new <- sqrt(SSE.new/length(pls.new_predicted))
+
+SSE.old <- sum((test_data$a - pls.old_predicted)^2)
+R_square.old <- 1 - SSE.old/SST
+RMSE.old <- sqrt(SSE.old/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=lm, se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
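The manual test-set metrics above can be cross-checked in one call with caret's built-in helper (already loaded in this script). Note that `postResample` reports R² as the squared correlation between predictions and observations, not the SST-based version computed above, so small differences are expected:

```r
# Quick cross-check of the manual metrics: returns RMSE, Rsquared, and MAE.
postResample(pred = pls.new_predicted, obs = test_data$a)
postResample(pred = pls.old_predicted, obs = test_data$a)
```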
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+)
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100, 100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3), lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
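Beyond overlaying the two ROC curves, pROC can test whether the two AUCs differ; a brief sketch, assuming the `roc` objects are stored rather than plotted anonymously (the variable names here are illustrative):

```r
# DeLong test for a difference between two correlated AUCs on the same test set
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT)
roc.test(roc.new, roc.common, method = "delong")
```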
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" (decision criterion) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE (kept for reference; see ccn2019-accuracy.R for the same pattern)
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; use the training mean here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R
new file mode 100644
index 0000000..f757596
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-auc.R
@@ -0,0 +1,83 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+library(ppls)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+# earlier candidate formulas, kept for reference
+#f <- correct ~ n + t + stimulus_type
+#f <- correct ~ n + tl + vl + sl + s + stimulus_type
+
+# predictors selected with stepAIC
+f <- correct ~ n + tl + t + l + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced # ROSE balancing below is disabled; down-sampling happens in trainControl
+#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+set.seed(10)
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10,
+  classProbs = T,
+  summaryFunction = twoClassSummary,
+  sampling = "down"
+)
+
+grid <- expand.grid(
+  alpha = seq(0, 1, length=10),
+  lambda = seq(0.0001, 0.1, length=10)
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "glmnet",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               tuneGrid = grid,
+               metric = "ROC",
+               trControl = ctrl)
+
+seqs.test.y_prob <- predict(model, seqs.test.x, type="prob")
+
+model
+plot(varImp(model, useModel = F))
+plot(model)
+max(model$results$ROC)
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 2000,
+    ci = T)
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, 
type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, 
type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = 
"Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, 
- method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - 
ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -library(corrplot) # needed for corrplot(); not loaded elsewhere in this notebook -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components; -## cv.modpls feeds cvtable(summary(cv.modpls)) below. -cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls <- cvtable(summary(cv.modpls)) - -res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (non-repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a., foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized.
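A quick numeric check (mine, not from the source) of the inverted cost shape the chunk below implements via `dnorm`: the cost bottoms out at zero deviation and climbs toward 1 as the deviation from the constraint grows.

```r
# Illustration only: same form as targets_cost()/lures_cost() in the chunk below.
cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(cost(c(-8, -4, 0, 4, 8)), 3)
#> 0.865 0.395 0.003 0.395 0.865   -- minimized at zero deviation
```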
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  #trials = len(seq)
-  #freqs = [float(seq.count(c)) for c in choices]
-  #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #return ralph_skewed
-  NA
-}
-
-#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) %>%
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents one observation
-## for a single subject in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-
-# full-factorial fit on the four windowed sequence properties
-fit <- lm(correct ~ targets * lures * skewness * lumpiness, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("DBSCAN clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  data %>% mutate(constraint1=fitness1(history),
-                  constraint2=fitness2(history),
-                  constraint3=fitness3(history))
-  kmeans(NB)
-  ggplot(kmeans_clusters$accuracy)
-  ggplot(kmeans_clusters$rt)
-
-```{python}
-a=2
-```
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behavior. These biases, under specified structure, allow multiple cognitive strategies and produce heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
-  - Local vs. global properties: which one matters the most?
-  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automatic strategies
-  - Global: no lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
-x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \tfrac{T}{trials} \times w, sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= \frac{trials}{|V|} \\
-x_{sl} &= \max\left(1, \frac{w}{|V|}\right) \\
-x_{g} &= \frac{trials}{w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-#library(plsRglm)
-#library(plsdof)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-
-```
-
-```{r preprocessing}
-
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
-         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
-         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
-         sl = ifelse(is.na(sl), 0, sl),
-         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
-         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
-         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
-  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition)
-
-#DEBUG View(seqs)
-inspectdf::inspect_cor(seqs)
-#inspect_cor(NB,show_plot = T)
-```
-
-```{r}
-NB2 <- seqs %>% unnest(local_stats) # flat per-window table used by the model chunks
-
-model1 <- NB2 %>%
-  glm(rt~t+n+a,data=., family = "gaussian")
-aug1 <- augment(model1)
-aug1 %>%
-  ggplot(aes(a,rt)) +
-  geom_point() +
-  geom_smooth(aes(y=.fitted), color='red')
-```
-
-```{r}
-model2 <- NB2 %>%
-  glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian")
-aug2 <- augment(model2)
-aug2 %>%
-  ggplot(aes(jitter(al),rt)) +
-  geom_point(alpha=0.2,shape=18) +
-  xlab("local accuracy (al)") +
-  geom_smooth(aes(y=.fitted), color='blue') +
-  geom_smooth(aes(y=aug1$.fitted), color='red')
-
-```
-
-```{r models}
-
-nb_split <- initial_split(NB2, prop = 0.75)
-training_data <- training(nb_split)
-testing_data <- testing(nb_split)
-cv_split <- vfold_cv(training_data, v = 5)
-cv_data <- cv_split %>%
-  mutate(
-    train = map(splits, ~training(.x)),
-    validate = map(splits, ~testing(.x))
-  )
-
-cv_models_lm_a <- cv_data %>%
-  mutate(model = map(train, ~lm(formula = a~., data = .x)),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_glm_a <- cv_data %>%
-  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_pls_a <- cv_data %>%
-  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
-         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val["adjCV", , ]) - 1)) %>%
-  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)))
-
-head(cv_models_pls_a)
-
-
-cv_models_pls_a1 <- cv_data[3][[1]]
-
-
-NBx <- NB %>%
-  group_by(participant) %>%
-  summarise(freq = list(as.data.frame(table(stimulus)))) %>%
-  unnest(freq)
-
-ggplot(NBx, aes(stimulus, Freq, group=participant)) +
-  geom_point()
-
-
-#%>%
-# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
-  #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
-  #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-#any(is.na(NB2))
-#NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
-#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
-
-```
diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
deleted file mode 100644
index ba1b312..0000000
--- a/ccn2019.rev3.Rmd
+++ /dev/null
@@ -1,100 +0,0 @@
-
-$P=\langle V,D,C,W \rangle$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-#! ===============================================
-#! load required packages
-
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-library(inspectdf)
-library(caTools)
-library(pROC)
-
-#! ===============================================
-#! load data set and set running window size
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-```
-
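-
-The preprocessing chunk below builds every local statistic with
-`tsibble::slide_dbl()`/`slide2_dbl()`. As a toy illustration of the windowing
-assumed here (trailing windows of `.size` trials, with `.partial = T`
-emitting shorter windows at the start of each block; toy vector, not the
-CL2015 data):
-
-```{r slide-demo, eval=F}
-slide_dbl(c(1, 0, 1, 1, 0), ~sum(.x), .size = 3, .partial = T)
-```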
-
-```{r preprocessing}
-
-#! ===============================================
-#! A function to mark lures in a sequence
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-#! ===============================================
-#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al
-#! a and al are respectively global and local (recent) accuracy
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size),
-         ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
-         ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size),
-         vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size),
-         al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul)
-) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
-  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
-  # mutate(dp = map_dbl(local_stats,
-  #                     ~length(which(.x$stimulus_type=="target" & .x$correct)) -
-  #                       length(which(.x$stimulus_type!="target" & !.x$correct)))) %>%
-  # mutate(cr = map_dbl(local_stats,
-  #                     ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) +
-  #                          length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>%
-  #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition) %>%
-  unnest(local_stats) %>%
-  mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>%
-  mutate(stimulus = factor(stimulus)) %>%
-  mutate(stimulus_type = factor(stimulus_type))
-
-save(seqs,file=here("notebooks/data/nback_seqs.Rd"))
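-
-# Quick sanity check of the lure rule on a toy vector (hypothetical stimuli,
-# not the CL2015 data): with n = 2, a trial is marked "lure" when it matches
-# the item at lag n-1 or n+1, so the fifth trial below ("B", matching trial 2
-# at lag 3) should come back as "lure".
-#DEBUG with_lures(c("A","B","A","C","B","C"), rep("non-target", 6), n = 2)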
-
-#! ===============================================
-#! visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: an extra step for non-PLS methods to drop highly correlated predictors
-cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs")
-cor_high <- findCorrelation(cor_matrix, cutoff = 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove))
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; use the training mean here
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R
deleted file mode 100644
index 5bcefba..0000000
--- a/ccn2019-criterion.R
+++ /dev/null
@@ -1,119 +0,0 @@
-#==================================================#
-# model the "criterion" column (cr)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  cr ~ .-a-al-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-
-ggplot(varImp(pls.new_model)) +
-  labs(title="Criterion - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = 
"cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - 
ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find the last occurrence of the stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
-## 5. Sort and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -library(corrplot) # needed for corrplot() below -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a. foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized. 
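A minimal sketch of that cost shape (assuming the same dnorm-based form and sd = 4 used in the next chunk): the cost is near zero when the sequence meets the constraint exactly and rises smoothly with the size of the deviation.

```{r, eval=F}
# Gaussian-shaped cost: ~0 at zero deviation, approaching 1 as |x| grows
cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(c(cost(0), cost(2), cost(5)), 3)  # 0.003 0.120 0.543
```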
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These biases, under the specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
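As a toy illustration of such a property (a made-up sequence, not taken from the data), the global target density of a 2-back letter stream already caps what a purely reactive responder could score:

```{r, eval=F}
# Toy 2-back sequence: trial i is a target when it matches the letter 2 back
s <- c("A","B","A","C","A","C","C")
is_target <- c(FALSE, FALSE, s[3:7] == s[1:5])
mean(is_target)  # 3 of 7 trials are targets (~0.43)
```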
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -# drop the flagged predictors by column name -seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove)) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuracy" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" (criterion) column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize the train/test split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) + # seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 
index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- 
predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), 
!is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- 
createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - 
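# A minimal evaluation sketch (not in the original script): assumes the
# `pls.new_model` and `test_data` objects defined above. caret::postResample
# reports held-out RMSE, R^2, and MAE for the predicted reaction times.
pls.new_test_predicted <- predict(pls.new_model, test_data)
postResample(pred = pls.new_test_predicted, obs = test_data$rt)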
-plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - 
print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with 
broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
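For intuition, these costs are inverted Gaussians rather than literal quadratics: each is roughly 0 when the observed count matches its target and saturates toward 1 as the deviation grows. A minimal sketch, assuming the `sd = 4` used in the chunk below:

```r
targets_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(targets_cost(c(0, 2, 5, 10)), 3)  # ~ 0.003 0.120 0.543 0.956
```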
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-#FIXME remove by column name
-seqs.uncorr <- seqs %>% select(-high_cor_remove)
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE.new <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2)     # total sum of squares; the baseline is the training-set mean
+R_square.new <- 1 - SSE.new/SST
+RMSE.new <- sqrt(SSE.new/length(pls.new_predicted))
+
+
+SSE.old <- sum((test_data$a - pls.old_predicted)^2)
+R_square.old <- 1 - SSE.old/SST
+RMSE.old <- sqrt(SSE.old/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method="lm",se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
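# A hedged addition (not part of the original file): residuals vs. fitted for
# the new model, to spot systematic bias that a single RMSE number can hide.
# Assumes `pls.new_predicted` and `test_data` from the script above.
data.frame(predicted = pls.new_predicted,
           residual  = test_data$a - pls.new_predicted) %>%
  ggplot(aes(predicted, residual)) +
  geom_point(alpha = 0.1, shape = 16) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ggtitle("Accuracy: Residuals vs Predicted")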
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+    )
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100,100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
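# A hedged addition (not in the original script): numeric AUCs with bootstrap
# CIs, plus DeLong's paired test, instead of reading values off the overlaid
# plot. Assumes the prediction objects defined above.
roc_new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT)
roc_common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT)
ci.auc(roc_new, method = "bootstrap", boot.n = 100)
ci.auc(roc_common, method = "bootstrap", boot.n = 100)
roc.test(roc_new, roc_common)  # both curves come from the same test set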
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" (criterion) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; baseline is the training-set mean
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method="glm") +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
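# A hedged addition (not in the original script): held-out error for both
# criterion models, mirroring the commented-out SSE/RMSE block above.
# caret::postResample returns RMSE, R^2, and MAE.
postResample(pred = pls.new_predicted, obs = test_data$cr)
postResample(pred = pls.common_predicted, obs = test_data$cr)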
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+#f <- rt ~ n + t + v
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
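# A hedged addition (not in the original script): an intercept-only baseline
# that predicts the training-mean RT for every test trial, to put the held-out
# PLS RMSE above in context. Assumes the train/test objects defined above.
sqrt(mean((seqs.test.observed_y - mean(seqs.train.y, na.rm = TRUE))^2, na.rm = TRUE))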
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method="cv",
+                     number=10,
+                     classProbs = T,  # needed for predict(..., type = "prob") below
+#                    repeats = 1, #repeatedcv
+                     sampling = "up",
+                     savePredictions = T,
+                     verbose = T)
+
+tune <- expand.grid(C = seq(0,5,by=0.25))
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
+
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd
new file mode 100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties, which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: No lures, large vocabulary -> pattern repeats implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\\
+
+x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+
+\\\\
+
+x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+
+\\\\
+
+x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+
+\\\\
+
+x_{l} = L \times trials
+\\\\
+
+x_{ll} = L \times w
+\\\\
+
+x_{v} = |V|
+\\
+
+x_{ul} = w
+\\\\
+
+x_{s} = {trials \over |V|}
+\\\\
+
+x_{sl} = max(1, {w \over |V|})
+\\\\
+
+x_{g} = {trials \over w}
+
+\\\\
+
+x_{vl} = min(|V|, w)
+$$
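+
+To make the weighting concrete, here is a small numeric illustration of the
+target-count weight $W_t$ (an editor's sketch; the target ratio `T = 1/3` and
+`trials = 24` are made-up values, not parameters taken from the experiment):
+
+```{r wt-example, eval=FALSE}
+T_ratio <- 1/3; trials <- 24          # goal: T_ratio * trials = 8 targets
+W_t <- function(x_t) 1 - abs(10 * dnorm(T_ratio * trials - x_t, sd = 4))
+W_t(8)  # ~0.003: the cost is near its minimum when the count hits the goal
+W_t(2)  # ~0.68:  and it grows toward 1 as the count deviates from the goal
+```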
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
+# duh! we are using a list nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
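+# An editor's sketch of the list-column workflow the TODO above describes
+# (hypothetical column names `fit`/`fit_stats`; requires the broom package;
+# kept commented out so the chunk's behavior is unchanged):
+# NB %>%
+#   group_by(participant, condition, block) %>%
+#   nest() %>%
+#   mutate(fit       = map(data, ~ lm(rt ~ stimulus_type, data = .x)),
+#          fit_stats = map(fit, broom::glance)) %>%
+#   unnest(fit_stats)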
+NB2 <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>% unnest(data) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(history = with_history(stimulus, window_size)) %>%
+  mutate(x_sl = with_skewness(history)) %>%
+  mutate(x_ul = with_lumpiness(history)) %>%
+  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
+  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
+  ungroup()
+
+pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude)
+NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# caret
+library(caret)
+# Compile cross-validation settings
+
+
+any(is.na(NB2))
+NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
+plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
+
+
+plsResult
+```
+
+
+
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ Skewness score
+- $U$ Uniformity (!repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a., foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down quadratic function to be minimized.
+
+```{r, eval=F}
+targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
+lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
+skewness_cost <- function(x, choices) {
+  uniform_ratio <- length(x) / length(choices)
+  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
+  for (c in choices) {
+    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
+  }
+  #TODO convert to gaussian loss
+  max(deviation_from_uniform)
+}
+
+lumpiness_cost <- function(x, choices) {
+  #trials = len(seq)
+  #freqs = [float(seq.count(c)) for c in choices]
+  #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
+  #return ralph_skewed
+  NA
+}
+
+#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
+#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
+#plot(GA)
+```
+
+
+```{r}
+
+with_lures <- function(condition, stim, stim_type, history = NA) {
+  sapply(1:length(stim),
+         function(i) {
+           switch(as.character(condition[i]),
+                  "2-back" = {
+                    ifelse(
+                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
+                      'lure',
+                      as.character(stim_type[i])
+                    )},
+                  "3-back" = {
+                    ifelse(
+                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
+                      'lure',
+                      as.character(stim_type[i])
+                    )}
+           )
+         })
+}
+
+with_targets_ratio <- function(stimulus_type, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus_type[(i-stri_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    length(trials[trials=="target"])
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus_type[(i-str_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    length(trials[trials=="lure"])
+  })
+}
+
+with_lumpiness_score <- function(stimulus, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus[(i-str_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    max(table(trials)) - 1
+  })
+}
+
+with_lag <- function(stimulus, history) {
+  # find the last occurrence of the stimulus
+}
+
+with_skewness_score <- function(stimulus, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus[(i-str_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    sum(sort(table(trials), decreasing = T)[1:2]) - 1
+  })
+}
+
+with_history <- function(stims, size=16) {
+  res <- c('')
+  for (i in 2:length(stims)) {
+    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
+  }
+  #res <- ifelse(stri_length(res)==size, res, NA)
+  res
+}
+
+normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
+  #TODO
+  sapply(1:length(targets_ratio), function(i) 0)
+}
+
+window_size <- 8
+
+NB_modified <- NB %>%
+  group_by(participant, condition, block) %>%
+  mutate(history = with_history(stimulus, size=window_size)) %>%
+  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
+  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
+  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
+  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
+  mutate(skewness = with_skewness_score(stimulus, history)) %>%
+  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
+  filter(stri_length(history)==window_size) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
+  ungroup()
+
+pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
+
+NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+
+## participant-level averaged NB; a single row represents an observation for a single subject
+## in a single condition
+NB_avg <- NB_modified %>%
+  group_by(participant, condition) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  summarise(
+    targets=sum(targets),
+    lures=sum(lures),
+    skewness=sum(skewness),
+    lumpiness=sum(lumpiness),
+    rt = mean(rt, na.rm=T),
+    correct=sum(correct,na.rm=T)/90) %>%
+  ungroup()
+
+# print
+# NB_modified %>%
+#   filter(participant=='P1') %>%
+#   View()
+#
+
+
+fit <- lm(correct ~ t * s * u * l * d, NB_modified)
+
+```
+
+
+```{r}
+
+# DBSCAN Clustering (RT+ACCURACY against skewness)
+NB_avg <- NB_avg %>%
+  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
+
+NB_avg %>%
+  ggplot(aes(skewness, correct, color=factor(cluster))) +
+  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+```
+
+
+```{r}
+## single-subject figures
+NB_modified %>%
+  ggplot(aes(t,s,color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  inspect_cor(show_plot = T)
+
+NB_avg %>%
+  inspect_cor(show_plot = T)
+
+NB_modified %>%
+  ggplot(aes(rt,correct,color=u)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(t),color=correct)) +
+  geom_jitter() +
+  geom_point(alpha=0.1)
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(u),color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  facet_wrap(~condition)
+
+# rt/accuracy and lures
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
+  geom_jitter() +
+  geom_point(shape=16) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,pc2,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,rt,color=correct)) +
+  geom_point(alpha=0.3) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,correct,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+```
+
+## TODO
+  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
+    constraint3=fitness(history))
+  - kmeans(NB)
+  - ggplot(kmeans_clusters$accuracy)
+  - ggplot(kmeans_clusters$rt)
+  -
+
+
+
+
+```{python}
+a=2
+p
+```
+
+
+
+---
+title: "PLS Training"
+output: html_notebook
+---
+
+PLS:
+
+
+```{r}
+#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
+library(tidyverse)
+library(pls)
+
+## 1. load sample data
+#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
+
+rm(NB)
+load("./data/CL2015.RData")
+data <- NB
+str(data)
+
+## 2. clean data (drop non-predictor columns)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with lowest cv error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # corrplot is used below but not loaded elsewhere in this notebook
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components
+#cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8)
+
+modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
+#res.cv.modpls <- cvtable(summary(cv.modpls))  # depends on the commented-out cv.plsRglm() call above
+
+res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), 
main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = 
seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - 
pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-#FIXME remove by column name
-seqs.uncorr <- seqs %>% select(-high_cor_remove)
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.new_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties, which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: No lures, large vocabulary -> pattern repeats implies a target
+
+
+## Formulating Generating the N-Back Sequences as a CSP instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+x_n = N,\quad W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+\\
+x_t = T \times trials,\quad W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+\\
+x_{tl} = {T \times w \over trials},\quad W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+\\
+x_{l} = L \times trials
+\\
+x_{ll} = L \times w
+\\
+x_{v} = |V|
+\\
+x_{ul} = w
+\\
+x_{s} = {trials \over |V|}
+\\
+x_{sl} = \max(1, {w \over |V|})
+\\
+x_{g} = {trials \over w}
+\\
+x_{vl} = \min(|V|, w)
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to list column workflow with broom for model fitting and evaluating the fits
+# duh! we are using list nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
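+# For instance, a minimal list-column sketch of that workflow (illustrative
+# only; assumes a per-block lm of rt on correct, not used elsewhere):
+# NB %>%
+#   group_by(participant, condition, block) %>%
+#   nest() %>%
+#   mutate(fit    = purrr::map(data, ~lm(rt ~ correct, data = .x)),
+#          tidied = purrr::map(fit, broom::tidy)) %>%
+#   tidyr::unnest(tidied)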
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
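+
+For a concrete sense of the shape (an inverted Gaussian bump, roughly quadratic near its minimum), the sketch below evaluates the target-count cost from the next chunk over a range of assumed deviations; `deviations` is illustrative input, not experiment data:
+
+```{r, eval=F}
+# Illustration only: the cost is smallest at zero deviation and approaches 1
+# as the sequence drifts away from the desired number of targets.
+targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
+deviations <- seq(-10, 10, by=1)
+plot(deviations, targets_cost(deviations), type="b",
+     xlab="deviation from desired count", ylab="cost")
+```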
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
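+
+The GA hookup in the next chunk is left commented out; below is a rough sketch of how it could be wired up (assumptions: two decision variables standing for target and lure deviations, and the `GA` package loaded in the setup chunk above; note that `ga()` maximizes fitness, so the summed cost is negated):
+
+```{r, eval=F}
+# Sketch only: minimize the merged deviation cost by maximizing its negative.
+cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
+merged_cost <- function(x) cost(x[1]) + cost(x[2])
+GA <- ga(type = "real-valued", fitness = function(x) -merged_cost(x),
+         lower = c(-10, -10), upper = c(10, 10))
+summary(GA)
+plot(GA)
+```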
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
+  ungroup()
+
+pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
+
+NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+
+## participant-level averaged NB, a single row represent an observation for a single subject
+## in a single condition
+NB_avg <- NB_modified %>%
+  group_by(participant, condition) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  summarise(
+    targets=sum(targets),
+    lures=sum(lures),
+    skewness=sum(skewness),
+    lumpiness=sum(lumpiness),
+    rt = mean(rt, na.rm=T),
+    correct=sum(correct,na.rm=T)/90) %>%
+  ungroup()
+
+# print
+# NB_modified %>%
+#   filter(participant=='P1') %>%
+#   View()
+#
+
+
+fit <- lm(correct ~ t * s * u * l * d, NB_modified)
+
+```
+
+
+```{r}
+
+# DBSCAN Clustering (RT+ACCURACY against skewness)
+NB_avg <- NB_avg %>%
+  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
+
+NB_avg %>%
+  ggplot(aes(skewness, correct, color=factor(cluster))) +
+  ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+```
+
+
+```{r}
+## single-subject figures
+NB_modified %>%
+  ggplot(aes(t,s,color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  inspect_cor(show_plot = T)
+
+NB_avg %>%
+  inspect_cor(show_plot = T)
+
+NB_modified %>%
+  ggplot(aes(rt,correct,color=u)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(t),color=correct)) +
+  geom_jitter() +
+  geom_point(alpha=0.1)
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(u),color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  facet_wrap(~condition)
+
+# rt/accuracy and lures
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
+  geom_jitter() +
+  geom_point(shape=16) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,pc2,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,rt,color=correct)) +
+  geom_point(alpha=0.3) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,correct,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+```
+
+## TODO
+  - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history),
+  - constraint3=fitness(history))
+  - kmeans(NB)
+  - ggplot(kmeans_clusters$accuracy)
+  - ggplot(kmeans_clusters$rt)
+  -
+
+
+
+
+```{python}
+a=2
+p
+```
summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = 
"|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical 
properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)
-  - local vs. global properties, which one matters the most?
-  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
-  - Global: No lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n-N,sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials-x_t,sd=4)| \\
-x_{tl} &= {T \times w \over trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= {trials \over |V|} \\
-x_{sl} &= max(1, {w \over |V|}) \\
-x_{g} &= {trials \over w} \\
-x_{vl} &= min(|V|, w)
-\end{aligned}
-$$
-
-```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(plsRglm)
-library(plsdof)
-library(caret)
-```
-
-```{r params}
-load('./data/CL2015.RData')
-
-window_size <- 8
-```
-
-
-```{r history}
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-with_history <- function(stimuli, length=16, fixed=F) {
-  seq <- paste(stimuli, collapse = '')
-
-  sapply(1:length(stimuli), function(i) {
-    stri_reverse(str_sub(seq, max(1,i-length+1), i))
-  })
-  #ifelse(fixed, h[str_length(h)==size], h)
-}
-
-# $x_{s,local}$
-with_skewness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    sum(sort(freqs, decreasing = T)[1:2]) - 1
-  })
-}
-
-# $x_{u,local}$
-with_lumpiness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    max(freqs) - 1
-  })
-}
-
-
-with_targets_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="target"]) / length(trials)
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="lure"]) / length(trials)
-  })
-}
-
-#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
-# duh! we are using lists nested inside a tibble, so put all new columns in a new list column
-# instead of adding a new column for each.
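-
-# A rough sketch of that list-column workflow (kept commented out as a note;
-# the grouping and the lm formula are illustrative placeholders, not the
-# models fit below):
-#
-# NB %>%
-#   group_by(participant, condition, block) %>%
-#   nest() %>%
-#   mutate(fit     = map(data, ~lm(rt ~ trial, data = .x)),
-#          glanced = map(fit, broom::glance)) %>%
-#   unnest(glanced)
-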
-NB2 <- NB %>%
-  group_by(participant, condition, block) %>%
-  nest() %>% unnest(data) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(history = with_history(stimulus, window_size)) %>%
-  mutate(x_sl = with_skewness(history)) %>%
-  mutate(x_ul = with_lumpiness(history)) %>%
-  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
-  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
-  ungroup()
-
-pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
-NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# caret
-library(caret)
-# Compile cross-validation settings
-
-
-any(is.na(NB2))
-NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-# fit separate PLS models for reaction time and for accuracy
-plsResult.rt <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], 3)
-plsResult.correct <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], 3)
-
-plsResult.rt
-plsResult.correct
-```
-
-
-
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (i.e., non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a., foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
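-
-In the chunk below, each cost has the inverted-Gaussian shape already used for the weights
-$W$ in the CSP formulation above. Written out once as a sketch, with $x^*$ standing for the
-assumed target value of a statistic $x$:
-
-$$
-W(x) = 1 - |10 \times dnorm(x - x^*, sd=4)|
-$$
-
-$W$ is smallest when $x = x^*$ and approaches $1$ as $x$ deviates from the target.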
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (drop columns that are not modeled)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-# rename(
-#   ev.participant=participant,
-#   ev.n=n,
-#   ev.block=block,
-#   ev.stimulus_type=stimulus_type,
-#   rv.choice=choice,
-#   rv.rt=rt,
-#   rv.correct=correct
-# )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with lowest CV error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. sort and visualize the top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot) # needed for corrplot() below
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (i.e., non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a., foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
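-
-The chunk below keeps `merged_cost` and the `ga()` call commented out. A minimal sketch of
-how the pieces could fit together (illustrative target values only; assumes the `GA` package
-loaded in the setup chunk):
-
-```{r ga-sketch, eval=F}
-# Inverted-Gaussian cost centered on an assumed target value x_star.
-constraint_cost <- function(x, x_star, sd = 4) 1.0 - 10 * dnorm(x - x_star, sd = sd)
-
-# Hypothetical targets: 4 targets and 2 lures per sequence.
-merged_cost <- function(x) constraint_cost(x[1], 4) + constraint_cost(x[2], 2)
-
-# ga() maximizes fitness, so negate the cost to minimize it.
-GA <- ga(type = "real-valued",
-         fitness = function(x) -merged_cost(x),
-         lower = c(0, 0), upper = c(16, 16))
-summary(GA)
-```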
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(paste0("Clusters (window = ", window_size, " trials)"), "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
- - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-     constraint3=fitness3(history))
- - kmeans(NB)
- - ggplot(kmeans_clusters$accuracy)
- - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a=2
-```
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red'))
-```
-
-```{r}
-model2 <- NB2 %>%
-  select(-participant, -stimulus) %>%
-  glm(rt~t+n+a+al+s+sl+ll+l, data=., family = "gaussian")
-aug2 <- augment(model2)
-aug2 %>%
-  ggplot(aes(jitter(al),rt)) +
-  geom_point(alpha=0.2,shape=18) +
-  xlab("accuracy") +
-  geom_smooth(aes(y=.fitted), color='blue') +
-  geom_smooth(aes(y=aug1$.fitted), color='red')
-
-```
-
-```{r models}
-
-nb_split <- initial_split(NB2, prop = 0.75)
-training_data <- training(nb_split)
-testing_data <- testing(nb_split)
-cv_split <- vfold_cv(training_data, v = 5)
-cv_data <- cv_split %>%
-  mutate(
-    train = map(splits, ~training(.x)),
-    validate = map(splits, ~testing(.x))
-  )
-
-cv_models_lm_a <- cv_data %>%
-  mutate(model = map(train, ~lm(formula = a~., data = .x)),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_glm_a <- cv_data %>%
-  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_pls_a <- cv_data %>%
-  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
-         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>%
-  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)))
-
-head(cv_models_pls_a)
-
-
-cv_models_pls_a1 <- cv_data[3][[1]]
-
-
-# per-participant stimulus frequencies
-NBx <- NB %>%
-  group_by(participant, stimulus) %>%
-  summarise(freq = n())
-
-ggplot(NBx, aes(stimulus, freq, group=participant)) +
-  geom_point()
-
-
-#%>%
-# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
-  #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
-  #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-#any(is.na(NB2))
-#NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
-#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
-
-```
diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
deleted file mode 100644
index ba1b312..0000000
--- a/ccn2019.rev3.Rmd
+++ /dev/null
@@ -1,100 +0,0 @@
-
-$P=\langle V,D,C,W \rangle$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-#! ===============================================
-#! load required packages
-
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-library(inspectdf)
-library(caTools)
-library(pROC)
-
-#! ===============================================
-#! load data set and set running window size
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-```
-
-
-```{r preprocessing}
-
-#! ===============================================
-#!
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R
new file mode 100644
index 0000000..f757596
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-auc.R
@@ -0,0 +1,83 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+library(ppls)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+# candidate formulas; the last assignment wins
+f <- correct ~ n + t + stimulus_type
+f <- correct ~ n + tl + vl + sl + s + stimulus_type
+
+# predictors selected with stepAIC
+f <- correct ~ n + tl + t + l + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+set.seed(10)
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10,
+  classProbs = T,
+  summaryFunction = twoClassSummary,
+  sampling = "down"
+)
+
+grid <- expand.grid(
+  alpha = seq(0,1,length=10),
+  lambda = seq(0.0001, 0.1, length=10)
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "glmnet",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               tuneGrid = grid,
+               metric = "ROC",
+               trControl = ctrl)
+
+seqs.test.y_prob <- predict(model, seqs.test.x, type="prob")
+
+model
+plot(varImp(model, useModel = F))
+plot(model)
+max(model$results$ROC)  # best cross-validated AUC across the tune grid
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 2000,
+    ci = T)
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
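+
+# Held-out AUC as a plain number (sketch; assumes "YES" is the event level
+# of `correct`, matching the roc() call above):
+auc(seqs.test.observed_y, seqs.test.y_prob$YES)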
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+# candidate formulas; the last assignment wins
+f <- rt ~ n + t + v
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)  # best (lowest) cross-validated RMSE across the tune grid
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
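+
+# Baseline check (sketch): RMSE of always predicting the training-set mean;
+# the tuned PLS model should comfortably beat this.
+RMSE(rep(mean(seqs.train.y), length(seqs.test.observed_y)), seqs.test.observed_y)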
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "rt" column (reaction time)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize the split: the train set is rebalanced by ROSE, the test set keeps the raw class ratio
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Class Balance of the Train/Test Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2)   # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares; remember to use the training mean here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method="cv",
+                     number=10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T,   # required for predict(type="prob") below
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by=0.25))  # C must be strictly positive for svmLinear
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
+
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd
new file mode 100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified sequence structure, allow multiple cognitive strategies and produce heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
+x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= \frac{trials}{|V|} \\
+x_{sl} &= \max\!\left(1, \frac{w}{|V|}\right) \\
+x_{g} &= \frac{trials}{w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
+# note: we are using lists nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
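+# Toy sanity check for the helpers above (hypothetical letter stream, not the CL2015 data):
+#   demo <- c("A","B","A","A","C","B","A","B")
+#   with_skewness(with_history(demo, 4))   # sum of the two most frequent symbols per window, minus 1
+#   with_lumpiness(with_history(demo, 4))  # frequency of the modal symbol per window, minus 1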
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
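+
+For intuition, here is a toy sketch (not part of the analysis pipeline) of the shared cost shape used below, $1 - 10 \times dnorm(x, 0, 4)$: a bell curve flipped so that satisfying the constraint sits at the minimum.
+
+```{r cost-shape, eval=F}
+curve(1 - 10 * dnorm(x, mean = 0, sd = 4), from = -10, to = 10,
+      xlab = "deviation from the constrained value", ylab = "cost")
+```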
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
+  ungroup()
+
+pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
+
+NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+
+## participant-level averaged NB; a single row represents an observation for a single subject
+## in a single condition
+NB_avg <- NB_modified %>%
+  group_by(participant, condition) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  summarise(
+    targets=sum(targets),
+    lures=sum(lures),
+    skewness=sum(skewness),
+    lumpiness=sum(lumpiness),
+    rt = mean(rt, na.rm=T),
+    correct=sum(correct,na.rm=T)/90) %>%
+  ungroup()
+
+# print
+# NB_modified %>%
+#   filter(participant=='P1') %>%
+#   View()
+#
+
+#TODO dprime (d) is not computed per window yet; add it before extending this model
+fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
+
+```
+
+
+```{r}
+
+# DBSCAN Clustering (RT+ACCURACY against skewness)
+NB_avg <- NB_avg %>%
+  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
+
+NB_avg %>%
+  ggplot(aes(skewness, correct, color=factor(cluster))) +
+  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+```
+
+
+```{r}
+## single-subject figures
+NB_modified %>%
+  ggplot(aes(targets, skewness, color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun="mean")
+
+NB_modified %>%
+  inspect_cor(show_plot = T)
+
+NB_avg %>%
+  inspect_cor(show_plot = T)
+
+NB_modified %>%
+  ggplot(aes(rt, correct, color=lumpiness)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun="mean")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
+  geom_jitter() +
+  geom_point(alpha=0.1)
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  facet_wrap(~condition)
+
+# rt/accuracy and lures
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
+  geom_jitter() +
+  geom_point(shape=16) +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1, pc2, color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1, rt, color=correct)) +
+  geom_point(alpha=0.3) +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1, correct, color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+```
+
+## TODO
+  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
+    constraint3=fitness(history))
+  - kmeans(NB)
+  - ggplot(kmeans_clusters$accuracy)
+  - ggplot(kmeans_clusters$rt)
+
+
+
+
+---
+title: "PLS Training"
+output: html_notebook
+---
+
+PLS:
+
+
+```{r}
+#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
+library(tidyverse)
+library(pls)
+
+## 1. load sample data
+#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
+
+rm(NB)
+load("./data/CL2015.RData")
+data <- NB
+str(data)
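+
+# NB holds one row per trial; columns include participant, condition, block,
+# trial, stimulus, stimulus_type, choice, rt, and correct (see the select()
+# and commented rename() calls below).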
+
+## 2. clean data (drop design columns that are not used as predictors)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with the lowest CV error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # provides corrplot(), used below
+
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (i.e., lack of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a. foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down Gaussian cost function to be minimized.
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
+x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= \frac{trials}{|V|} \\
+x_{sl} &= \max\!\left(1, \frac{w}{|V|}\right) \\
+x_{g} &= \frac{trials}{w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+#library(plsRglm)
+#library(plsdof)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+
+```
+
+```{r preprocessing}
+
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+seqs <- NB %>%
+  group_by(participant, block, condition) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
+         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
+         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
+         sl = ifelse(is.na(sl), 0, sl),
+         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
+         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
+         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
+  nest(.key='local_stats') %>%
+  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
+  mutate(a = map(local_stats, ~length(which(.x$correct)))) %>%
+  mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
+  mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
+  mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
+  mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>%
+  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
+  ungroup() %>%
+  select(-participant,-block,-condition)
+
+#DEBUG View(seqs)
+inspectdf::inspect_cor(seqs)
+#inspect_cor(NB,show_plot = T)
+```
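+
+A toy illustration (hypothetical numbers, not task data) of how `tsibble::slide_dbl` computes the trailing-window statistics above; with `.partial = TRUE` the early, shorter windows are kept rather than dropped:
+
+```{r slide-demo}
+slide_dbl(c(1, 0, 1, 1, 0), ~sum(.x), .size = 3, .partial = TRUE)
+# 1 1 2 2 2 -- running count of ones within a trailing window of 3 trials
+```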
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data 
= train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable 
Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method 
= "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ 
-library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# 
-# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuracy" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" (criterion) column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + +
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), +
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$rt - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$rt - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$rt - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>% + ggplot(aes(predicted, observed)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("RT: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>% drop_na(rt, correct, tl, sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These biases, under specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating the Generation of N-Back Sequences as a CSP Instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested inside a tibble, so put all new columns in a new list column +# instead of adding a new column for each.
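+# A possible shape for that refactor (sketch only, untested; assumes broom is
+# installed and that the x_* columns from the NB2 pipeline below exist):
+# NB2 %>%
+#   group_by(participant, condition, block) %>%
+#   nest() %>%
+#   mutate(fit     = purrr::map(data, ~lm(rt ~ x_t + x_l + x_sl + x_ul, data = .x)),
+#          glanced = purrr::map(fit, broom::glance)) %>%
+#   tidyr::unnest(glanced)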
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a. foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an upside-down quadratic function to be minimized.
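+As a quick visual check (a sketch only, assuming the same `1 - 10*dnorm(x, sd = 4)` form that the cost definitions in the following chunk use), the cost is lowest at zero deviation and rises toward 1 on either side:
+
+```{r cost-shape-sketch, eval=F}
+# Sketch: shape of the inverted-Gaussian cost; it is minimized when the
+# deviation from the constrained quantity (e.g., target count) is zero.
+x <- seq(-10, 10, by = 0.1)
+plot(x, 1.0 - 10 * dnorm(x, mean = 0, sd = 4), type = "l",
+     xlab = "deviation", ylab = "cost")
+```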
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + # R translation of the Python sketch above (untested assumption: a sequence + # counts as "lumpy" when the top half of the choices covers > 2/3 of trials) + trials <- length(x) + freqs <- sapply(choices, function(c) sum(x == c)) + sum(sort(freqs, decreasing = TRUE)[1:(length(choices) %/% 2)]) > (trials * 2 / 3) +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find the last occurrence of the stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio,
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (the "brand and URLID" note is leftover from the colas tutorial data above; here, drop the non-predictor columns)
+data <- data %>%
+  mutate(n = ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model <- plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with the lowest CV error
+cv <- RMSEP(pls.model)
+# subtract 1 because the first RMSEP entry is the zero-component (intercept-only) model
+best_dims <- which.min(cv$val["adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val["adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # corrplot() is not loaded anywhere else in this notebook
+
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components;
+## cv.modpls is needed below by cvtable(), so the call is left enabled.
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ Skewness score
+- $U$ Uniformity (non-repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a., foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down quadratic cost function to be minimized (implemented in the chunk below as inverted Gaussians, which are locally quadratic around the optimum).
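+
+The commented-out `ga()` lines in the next chunk can be completed along these lines. This is only a sketch under toy assumptions: it optimizes a two-number deviation vector rather than an actual trial sequence, and since `GA::ga` maximizes its fitness, the merged cost is negated:
+
+```{r, eval=F}
+library(GA)
+
+# Same inverted-Gaussian costs as in the next chunk (repeated to keep this self-contained).
+targets_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+lures_cost   <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+
+# Toy merged cost over a deviation vector: x[1] = target-count deviation, x[2] = lure-count deviation.
+merged_cost <- function(x) targets_cost(x[1]) + lures_cost(x[2])
+
+GA <- ga(type = "real-valued",
+         fitness = function(x) -merged_cost(x),  # negated: ga() maximizes
+         lower = c(-10, -10), upper = c(10, 10),
+         maxiter = 100)
+summary(GA)
+plot(GA)
+```
+
+Encoding a real n-back sequence would need a discrete representation (e.g. `type = "permutation"` or a custom decoder), which the TODO notes below leave open.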
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations
+#DEBUG inspect_cor(seqs, show_plot = T)
+
+
+```
+
+```{r remove_highly_correlated_predictors}
+# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+# (restrict cor() to numeric columns; seqs also carries factors such as correct and stimulus)
+cor_matrix <- cor(select(seqs, where(is.numeric)), use = "pairwise.complete.obs")
+cor_high <- findCorrelation(cor_matrix, 0.8)
+high_cor_remove <- row.names(cor_matrix)[cor_high]
+# drop the flagged predictors by column name
+seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
+```
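+
+As a quick sanity check on the step above (a sketch, assuming `seqs.uncorr` keeps at least two numeric columns), the largest remaining pairwise correlation should now fall below the 0.8 cutoff:
+
+```{r verify_uncorrelated, eval=F}
+# Recompute correlations among the surviving numeric predictors.
+cor_rest <- cor(select(seqs.uncorr, where(is.numeric)), use = "pairwise.complete.obs")
+max(abs(cor_rest[upper.tri(cor_rest)]))  # expected < 0.8
+```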
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - 
ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - 
-rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc 
= c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- 
table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - Local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\\
+
+x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+
+\\\\
+
+x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+
+\\\\
+
+x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+
+\\\\
+
+x_{l} = L \times trials
+\\\\
+
+x_{ll} = L \times w
+\\\\
+
+x_{v} = |V|
+\\
+
+x_{ul} = w
+\\\\
+
+x_{s} = {trials \over |V|}
+\\\\
+
+x_{sl} = max(1, {w \over |V|})
+\\\\
+
+x_{g} = {trials \over w}
+
+\\\\
+
+x_{vl} = min(|V|, w)
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to a list-column workflow with broom for model fitting and evaluating the fits;
+# we are already nesting a list inside a tibble, so put all new columns in a single list
+# column instead of adding a separate column for each (see the sketch just below).
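+
+# A minimal editorial sketch of that list-column workflow (illustrative only:
+# `NB_nested_demo` and its `local` column are hypothetical names, and nothing
+# below depends on them). Per-window features live in one list column instead
+# of one top-level column each.
+NB_nested_demo <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>%
+  mutate(local = map(data, ~tibble(
+    history = with_history(.x$stimulus, window_size),
+    x_sl    = with_skewness(history),
+    x_ul    = with_lumpiness(history)
+  ))) %>%
+  ungroup()
+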
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
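+
+To make that shape concrete, a quick editorial sketch (assuming the `dnorm`-based
+cost definitions in the next chunk): the cost bottoms out at the constraint's
+target value ($x = 0$) and saturates at $1.0$ far from it.
+
+```{r cost-shape-demo, eval=F}
+# Visualize the Gaussian-shaped constraint cost used below.
+x <- seq(-20, 20, by = 0.1)
+plot(x, 1.0 - 10 * dnorm(x, mean = 0, sd = 4), type = "l",
+     xlab = "deviation from constraint target", ylab = "cost")
+```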
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (drop design columns that should not enter the model)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+# rename(
+#   ev.participant=participant,
+#   ev.n=n,
+#   ev.block=block,
+#   ev.stimulus_type=stimulus_type,
+#   rv.choice=choice,
+#   rv.rt=rt,
+#   rv.correct=correct
+# )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with the lowest CV error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # corrplot() is not loaded anywhere else in this notebook
+
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components;
+## cv.modpls must be computed here because cvtable() below summarizes it
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (absence of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a., foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down quadratic function to be minimized.
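+
+Since the costs are to be minimized while `GA::ga()` maximizes its fitness
+function, a merged cost has to be negated before optimization. A minimal
+editorial sketch, assuming the `targets_cost` and `lures_cost` definitions in
+the next chunk (the two-element encoding and the bounds are illustrative, not
+part of the original analysis):
+
+```{r ga-sketch, eval=F}
+merged_cost <- function(x) targets_cost(x[1]) + lures_cost(x[2])
+GA <- ga(type = "real-valued",
+         fitness = function(x) -merged_cost(x),  # negate: ga() maximizes
+         lower = c(-10, -10), upper = c(10, 10))
+summary(GA)
+```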
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
+x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \tfrac{T}{trials} \times w, sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= \frac{trials}{|V|} \\
+x_{sl} &= \max(1, \tfrac{w}{|V|}) \\
+x_{g} &= \frac{trials}{w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+#library(plsRglm)
+#library(plsdof)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+
+```
+
+```{r preprocessing}
+
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+seqs <- NB %>%
+  group_by(participant, block, condition) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
+         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
+         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
+         sl = ifelse(is.na(sl), 0, sl),
+         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
+         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
+         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
+  nest(.key='local_stats') %>%
+  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
+  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
+  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
+  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
+  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
+  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
+  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
+  ungroup() %>%
+  select(-participant,-block,-condition)
+
+#DEBUG View(seqs)
+inspectdf::inspect_cor(seqs)
+#inspect_cor(NB,show_plot = T)
+```
+
+```{r}
+# NOTE: NB2 is the trial-level data frame built in an earlier revision of this
+# notebook and is assumed to be in the workspace; it is not created here.
+model1 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a,data=., family = "gaussian")
+aug1 <- augment(model1)
+aug1 %>%
+  ggplot(aes(a,rt)) +
+  geom_point() +
+  geom_smooth(aes(y = .fitted), color = 'red')
+```
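+
+A quick fit-quality readout for `model1` (a sketch; `glance()` comes from
+broom, which the setup chunk already loads):
+
+```{r model1-diagnostics}
+broom::glance(model1)  # deviance, AIC/BIC, residual df
+```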
+
+```{r}
+model2 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian")
+aug2 <- augment(model2)
+aug2 %>%
+  ggplot(aes(jitter(al),rt)) +
+  geom_point(alpha=0.2,shape=18) +
+  xlab("accuracy") +
+  geom_smooth(aes(y=.fitted), color='blue') +
+  geom_smooth(aes(y=aug1$.fitted), color='red')
+
+```
+
+```{r models}
+
+nb_split <- initial_split(NB2, prop = 0.75)
+training_data <- training(nb_split)
+testing_data <- testing(nb_split)
+cv_split <- vfold_cv(training_data, v = 5)
+cv_data <- cv_split %>%
+  mutate(
+    train = map(splits, ~training(.x)),
+    validate = map(splits, ~testing(.x))
+  )
+
+cv_models_lm_a <- cv_data %>%
+  mutate(model = map(train, ~lm(formula = a~., data = .x)),
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_glm_a <- cv_data %>%
+  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_pls_a <- cv_data %>%
+  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
+         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>%
+  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y))
+  )
+
+head(cv_models_pls_a)
+
+
+cv_models_pls_a1 <- cv_data[3][[1]]
+
+
+NBx <- NB %>%
+  group_by(participant) %>%
+  count(stimulus, name = "freq")
+
+ggplot(NBx, aes(stimulus, freq, group = participant)) +
+  geom_point()
+
+
+#%>%
+#  mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
+  #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
+  #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# Compile cross-validation settings
+
+#any(is.na(NB2))
+#NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
+#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
+
+```
diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd
new file mode 100644
index 0000000..ba1b312
--- /dev/null
+++ b/ccn2019/ccn2019.rev3.Rmd
@@ -0,0 +1,100 @@
+
+$P=\langle V,D,C,W \rangle$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+#! ===============================================
+#! load required packages
+
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+library(inspectdf)
+library(caTools)
+library(pROC)
+
+#! ===============================================
+#! load data set and set running window size
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+```
+
+
+```{r preprocessing}
+
+#! ===============================================
+#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations
+#DEBUG inspect_cor(seqs, show_plot = T)
+
+
+```
+
+```{r remove_highly_correlated_predictors}
+# WIP: an extra step for non-PLS methods, dropping highly correlated predictors
+cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs")
+cor_high <- findCorrelation(cor_matrix, 0.8)
+high_cor_remove <- row.names(cor_matrix)[cor_high]
+seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove))
+```
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
new file mode 100644
index 0000000..7da6569
--- /dev/null
+++ b/ccn2019/ccn2019_diagrams.R
@@ -0,0 +1,51 @@
+NB_avg %>%
+  mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>%
+  ggplot(aes(targets, accuracy, color=factor(cluster))) +
+  ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+
+NB_avg %>%
+  ggplot(aes(lures, accuracy, color=condition)) +
+  ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(skewness, accuracy, color=condition)) +
+  ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, accuracy, color=condition)) +
+  ggtitle("lumpiness", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, rts, color=condition)) +
+  ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") +
+  xlab("lumpiness") +
+  ylab("Average RT") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+nback <- NB_modified
+
+nback %>%
+  mutate(block=as.factor(block)) %>%
+  mutate(trial=as.factor(trial)) %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  #filter(condition=='3-back') %>%
+  #mutate(correct=as.numeric(correct)) %>%
+  inspect_cor(show_plot = T)
+
+averaged_nback <- NB_avg
+
+averaged_nback %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  inspect_cor(show_plot = T)
+
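+
+# Sketch: eps = 0.5 above is hand-picked. A common heuristic is to plot the
+# sorted k-nearest-neighbour distances (k close to minPts) and place eps at
+# the "elbow"; assumes NB_avg carries the accuracy and rts columns used above.
+dbscan::kNNdistplot(cbind(NB_avg$accuracy, NB_avg$rts), k = 3)
+abline(h = 0.5, lty = 2)  # candidate eps, read off the elbow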
diff --git a/ccn2019-correct.R b/ccn2019-correct.R
deleted file mode 100644
index 9b9d9fb..0000000
--- a/ccn2019-correct.R
+++ /dev/null
@@ -1,148 +0,0 @@
-#==================================================#
-# model the "correct" column
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(pls)
-
-#devtools::install_github("sachsmc/plotROC")
-library(plotROC)
-
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-inspect_cat(seqs.imputed)
-inspect_num(seqs.imputed)
-
-seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  classProbs = T,
-  verboseIter = T,
-  savePredictions = T,
-  sampling = "down",
-  selectionFunction = "oneSE"
-)
-
-pls.new_model <- train(
-  correct ~ .-a-al-dp-cr-rt,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)")
-
-pls.common_model <- train(
-  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.common_model)
-plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
-
-
-#trellis.par.set(caretTheme())
-#densityplot(pls.new_model, pch = "|")
-#densityplot(pls.common_model, pch = "|")
-
-# Compile models and compare performance
-pls.models 
<- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - 
-pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- 
correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - 
SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB, a single row represent an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-fit <- lm(correct ~ t * s * u * l * d, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(t,s,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=u)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(t),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(u),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history),
-  - constraint3=fitness(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-  -
-
-
-
-
-```{python}
-a=2
-p
-```
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
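+# (Editorial sketch, not from the original analysis.) The list-column workflow
+# the TODO above points at would nest per-block trials and let broom collect
+# the fits, e.g. (model formula here is illustrative):
+#   NB %>%
+#     group_by(participant, condition, block) %>%
+#     nest() %>%
+#     mutate(fit     = map(data, ~ lm(rt ~ stimulus_type, data = .x)),
+#            glanced = map(fit, broom::glance)) %>%
+#     unnest(glanced)
+# That keeps every derived statistic in one list column instead of one
+# mutate() per column, as done below.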
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
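+
+The penalties in the next chunk implement this as inverted Gaussians, $1 - 10 \times dnorm(x, 0, 4)$: flat-bottomed bowls that are approximately quadratic near the optimum. A quick sketch of the curve (illustrative only, not part of the original pipeline):
+
+```{r cost-shape, eval=FALSE}
+# Plot the penalty as a function of deviation from the target value.
+# The cost bottoms out near 0.003 at x = 0 (10 * dnorm(0, 0, 4) ~ 0.997)
+# and saturates at 1 for large deviations.
+x <- seq(-20, 20, by = 0.1)
+cost <- 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+plot(x, cost, type = "l",
+     xlab = "deviation from target statistic",
+     ylab = "cost to minimize")
+```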
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
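+
+The commented `merged_cost` and `ga` lines in the chunk below hint at how these costs would drive a search. A minimal wiring of those pieces (an assumption on my part, not the original run: `GA::ga()` maximizes, so the summed cost is negated, and the two-element candidate vector is only a placeholder for a real sequence encoding):
+
+```{r ga-sketch, eval=FALSE}
+library(GA)
+
+# Combine the per-constraint costs defined in the next chunk; the candidate
+# x is a placeholder encoding, one deviation per constraint.
+merged_cost <- function(x) targets_cost(x[1]) + lures_cost(x[2])
+
+GA <- ga(type = "real-valued",
+         fitness = function(x) -merged_cost(x), # negate: ga() maximizes
+         lower = c(-10, -10), upper = c(10, 10),
+         maxiter = 50)
+summary(GA)
+plot(GA)
+```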
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
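+
+To make the problem concrete (an editorial sketch with assumed parameters, 24 trials, an 8-letter vocabulary, 2-back, not from the original analysis): target and lure counts drift widely across naively generated random sequences, which is exactly the variation the constraints below are meant to pin down.
+
+```{r random-sequence-drift, eval=FALSE}
+set.seed(1)
+
+# Count n-back matches (targets) and n±1-back matches (lures, roughly the
+# definition used by with_lures() elsewhere) in one random sequence.
+sequence_stats <- function(trials = 24, vocab = LETTERS[1:8], n = 2) {
+  s <- sample(vocab, trials, replace = TRUE)
+  c(targets = sum(s[(n + 1):trials] == s[1:(trials - n)]),
+    lures   = sum(s[(n + 2):trials] == s[1:(trials - n - 1)]) +
+              sum(s[n:trials]       == s[1:(trials - n + 1)]))
+}
+
+# Across 1000 unconstrained sequences the counts vary widely; this is the
+# heterogeneity the CSP formulation below is meant to control.
+summary(t(replicate(1000, sequence_stats())))
+```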
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties, which one matters the most?
+    - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+    - Global: No lures, large vocabulary -> pattern repeats implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints (each statistic $x$ is pinned to a target value, with weights $W$ that fall off as the statistic deviates from that target):
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n-N,sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10\times dnorm(T\times trials-x_t,sd=4)| \\
+x_{tl} &= {T \times w \over trials}, & W_{tl} &= 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= {trials \over |V|} \\
+x_{sl} &= \max(1, {w \over |V|}) \\
+x_{g} &= {trials \over w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+#library(plsRglm)
+#library(plsdof)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+
+```
+
+```{r preprocessing}
+
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+seqs <- NB %>%
+  group_by(participant, block, condition) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  # windowed (local) statistics over the trailing window_size trials:
+  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
+         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
+         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
+         sl = ifelse(is.na(sl), 0, sl),
+         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
+         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
+         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
+  nest(.key='local_stats') %>%
+  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
+  # whole-block (global) statistics:
+  mutate(a = map(local_stats, ~length(which(.x$correct)))) %>%
+  mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
+  mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
+  mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
+  mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>%
+  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
+  ungroup() %>%
+  select(-participant,-block,-condition)
+
+#DEBUG View(seqs)
+inspectdf::inspect_cor(seqs)
+#inspect_cor(NB,show_plot = T)
+```
+
+```{r}
+# baseline GLM: rt as a function of targets (t), n level, and accuracy (a)
+model1 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a,data=., family = "gaussian")
+aug1 <- augment(model1)
+aug1 %>%
+  ggplot(aes(a,rt)) +
+  geom_point() +
+  geom_smooth(aes(y=.fitted,
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = 
pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, 
type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - 
-trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, 
seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# 
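
Two details worth flagging in the commented-out metrics above: they compare the RT model's predictions against `test_data$cr` rather than `test_data$rt` (apparently carried over from the criterion script), and caret already bundles these computations. A sketch using caret's helper, assuming the objects defined above (only the new model in this script is actually fit to `rt`):

```r
# postResample() returns RMSE, Rsquared, and MAE for regression predictions.
caret::postResample(pred = pls.new_predicted, obs = test_data$rt)
```
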
- -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
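
As a quick numeric sanity check of that shape, using the same dnorm-based form defined in the chunk below: the cost sits near zero when the sequence statistic is on target and climbs toward one as it deviates.

```r
targets_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(targets_cost(c(-8, -4, 0, 4, 8)), 3)
#> [1] 0.865 0.395 0.003 0.395 0.865   # minimized at the target, x = 0
```
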
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc()
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" column (response criterion)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
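+# Sketch (not part of the original script): caret::postResample() returns
+# RMSE, Rsquared, and MAE in a single call, which is a compact stand-in for
+# the commented-out SSE/RMSE block above:
+postResample(pred = pls.common_predicted, obs = test_data$cr)
+postResample(pred = pls.new_predicted, obs = test_data$cr)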
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+#f <- rt ~ n + t + v
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "RT" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-cr-dp-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method = "cv",
+                     number = 10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T,
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by=0.25))
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
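+# Sketch (not part of the original script): the AUC alone, without the full
+# plot, can be extracted via pROC::auc:
+#   auc(seqs.test.observed_y, seqs.test.y_prob$YES)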
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd
new file mode 100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by "reactive"/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+x_n = N, \quad W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+\\
+x_t = T \times trials, \quad W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+\\
+x_{tl} = {T \times w \over trials}, \quad W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+\\
+x_{l} = L \times trials
+\\
+x_{ll} = L \times w
+\\
+x_{v} = |V|
+\\
+x_{ul} = w
+\\
+x_{s} = {trials \over |V|}
+\\
+x_{sl} = max(1, {w \over |V|})
+\\
+x_{g} = {trials \over w}
+\\
+x_{vl} = min(|V|, w)
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+# NOTE: the window parameter is named `size` (not `length`) so it does not
+# shadow base::length() inside the sapply below.
+with_history <- function(stimuli, size=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-size+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, size=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-size):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, size=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-size):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to list column workflow with broom for model fitting and evaluating the fits
+# duh! we are using a list nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
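+# A quick illustrative check of the helpers above (toy letters, window of 3;
+# values worked out by hand, not from the experiment data):
+#   with_history(c("A","B","A","C"), 3)  # "A"  "BA"  "ABA"  "CAB"
+#   with_skewness(c("ABA"))              # 2 = top-two letter counts (2+1) minus 1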
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
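+
+For intuition: with the Gaussian-based form used in the next chunk, the cost is lowest when the observed statistic matches the constraint (zero deviation) and levels off near 1 for large deviations. A quick sketch of that shape (illustrative only):
+
+```{r cost-shape, eval=F}
+# Shape of the dnorm-based constraint cost; minimum at zero deviation.
+curve(1.0 - 10 * dnorm(x, mean = 0, sd = 4), from = -20, to = 20,
+      xlab = "deviation from constraint", ylab = "cost")
+```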
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
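+
+The `lumpiness_cost` stub in the next chunk still carries its Python pseudocode in comments; a minimal R translation of that check (a sketch; `x` is assumed to be the vector of trials and `choices` the stimulus vocabulary) would be:
+
+```{r lumpiness-sketch, eval=F}
+lumpiness_cost <- function(x, choices) {
+  trials <- length(x)
+  freqs <- sapply(choices, function(ch) sum(x == ch))
+  # "ralph_skewed": TRUE when the most frequent half of the vocabulary
+  # accounts for more than two thirds of all trials
+  top_half <- sort(freqs, decreasing = TRUE)[seq_len(length(choices) %/% 2)]
+  sum(top_half) > (trials * 2 / 3)
+}
+```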
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd new file mode 100644 index 0000000..c566656 --- /dev/null +++ b/ccn2019/pls_playground.Rmd @@ -0,0 +1,104 @@ +--- +title: "PLS Training" +output: html_notebook +editor_options: + chunk_output_type: console +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error + +best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. 
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - 
mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - 
boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "cr" (response criterion) column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# Visualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_bar(position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data,
type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" 
diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# Visualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_bar(position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$rt - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$rt - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$rt - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>% - ggplot(aes(predicted, observed)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Reaction Time: Predicted vs Actual") + - xlab("Predicted") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - classProbs = T, - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0.25,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter)? - - local vs. global properties, which one matters the most? (see the sketch below) - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target
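As a concrete illustration of the local/global distinction — an editorial sketch, not part of the original notebook; the toy vector `stim` and window size `w` are made up — the same skewness-style score can be computed once over the whole sequence (global) or over a trailing window ending at each trial (local):

```{r, eval=F}
# Editorial sketch: global vs. local (windowed) sequence statistics.
stim <- c("A","B","A","C","B","B","A","D","B","A")  # toy stimulus sequence
w <- 4                                              # toy window size

# global: combined share of the two most frequent stimuli, whole sequence
global_skewness <- (sum(sort(table(stim), decreasing = T)[1:2], na.rm = T) - 1) / length(stim)

# local: the same score inside a trailing window ending at each trial
local_skewness <- sapply(seq_along(stim), function(i) {
  win <- stim[max(1, i - w + 1):i]
  (sum(sort(table(win), decreasing = T)[1:2], na.rm = T) - 1) / length(win)
})
```

A reactive strategy only ever "sees" the local scores, so two sequences with identical global statistics can still afford very different local exploits.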
- - -## Formulating N-Back Sequence Generation as a CSP Instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using a list nested inside a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale.
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (i.e., absence of repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a. foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -}
-with_lag <- function(stimulus, history) { - # lag (in trials) since the last occurrence of the current stimulus; - # left empty in the original, so this body is an editorial sketch - sapply(1:length(history), function(i) { - h <- unlist(str_split(history[i], "")) - prev <- which(head(h, -1) == as.character(stimulus[i])) - ifelse(length(prev) == 0, NA, length(h) - max(prev)) - }) -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(targets,skewness,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=lumpiness)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(targets),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + -
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a = 2 -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (drop id/bookkeeping columns) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross-validation to find the optimal number of dimensions -pls.model <- plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) -
-## 5. sort and visualize the top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -library(corrplot) - -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components -cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls <- cvtable(summary(cv.modpls)) - -res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (i.e., absence of repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a. foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized.
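To see that shape concretely — an editorial sketch, not from the original notebook; it just restates the `targets_cost` defined in the next chunk and plots it:

```{r, eval=F}
# Editorial sketch: the constraint penalty is an inverted Gaussian --
# near zero at the target value and approaching 1 as the deviation
# grows, so minimizing it pulls the sequence statistic toward target.
targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
curve(targets_cost, from = -15, to = 15,
      xlab = "deviation from target count", ylab = "cost")
```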
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # lag (in trials) since the last occurrence of the current stimulus; - # left empty in the original, so this body is an editorial sketch - sapply(1:length(history), function(i) { - h <- unlist(str_split(history[i], "")) - prev <- which(head(h, -1) == as.character(stimulus[i])) - ifelse(length(prev) == 0, NA, length(h) - max(prev)) - }) -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio,
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(targets,skewness,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=lumpiness)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(targets),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a = 2 -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter)? - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating N-Back Sequence Generation as a CSP Instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -#DEBUG View(seqs) -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted,
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(stimulus, Freq)) + - geom_point() - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} -
-#! =============================================== -#! A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are global and local (recent) accuracy, respectively -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd"))
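 -#! =============================================== -#! Editorial note (an assumption about the pre-slider tsibble API used -#! above): slide_dbl(x, f, .size = w, .partial = TRUE) applies f to a -#! trailing window of w elements ending at each position, e.g. -#! slide_dbl(c(1, 0, 1, 1), ~sum(.x), .size = 2, .partial = TRUE) -#! returns c(1, 1, 1, 2); wrapping the inputs in lag(), as above, keeps -#! each window strictly causal, so trial i only sees trials 1..i-1.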
 -#! =============================================== -#! visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs") -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove)) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuracy" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(pls.models) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" (response criterion) column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_bar(position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed")
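The commented-out SSE/RMSE block above can also be condensed into one call; an editorial sketch using caret's `postResample` helper, not part of the original script:

```r
# Editorial sketch: held-out RMSE / R^2 / MAE for the criterion model.
library(caret)
postResample(pred = pls.common_predicted, obs = test_data$cr)
```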
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
+
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- rt ~ n + t + v
+
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced  # plain split; ROSE oversampling below is optional
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "rt" (reaction time) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$rt - pls.new_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method="glm") +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method = "cv",
+                     number = 10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T,   # needed for type="prob" predictions below
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by=0.25)) # SVM cost must be positive
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
+
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
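+# A minimal sketch of that list-column + broom workflow (the names `NB_fits`,
+# `fit`, and `fit_stats` are illustrative, and the lm() formula is only a
+# placeholder for whatever model is fit per group):
+library(broom)
+NB_fits <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>%
+  mutate(fit       = map(data, ~lm(rt ~ correct, data = .x)),
+         fit_stats = map(fit, glance)) %>%
+  select(-data, -fit) %>%
+  unnest(fit_stats)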
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
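+A quick numeric sanity check of that cost shape (a sketch; `targets_cost` mirrors the definition in the next chunk, and the printed values are approximate). Strictly, the `dnorm`-based penalty is an inverted Gaussian rather than a quadratic: it is close to 0 at zero deviation and approaches 1 as the deviation grows.
+
+```{r, eval=F}
+targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
+round(sapply(c(0, 2, 4, 8), targets_cost), 2)  # ~ 0.00 0.12 0.40 0.87
+```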
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
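+The `lumpiness_cost` stub in the next chunk still carries its original Python pseudocode and returns `NA`; a speculative R translation (assuming `x` is a character vector of trials and `choices` is the stimulus vocabulary) could be:
+
+```{r, eval=F}
+lumpiness_cost <- function(x, choices) {
+  trials <- length(x)
+  freqs <- sapply(choices, function(c) sum(x == c))
+  # "ralph_skewed": TRUE when the top half of the vocabulary covers > 2/3 of trials
+  sum(sort(freqs, decreasing = TRUE)[1:(length(choices) %/% 2)]) > (trials * 2 / 3)
+}
+```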
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd new file mode 100644 index 0000000..c566656 --- /dev/null +++ b/ccn2019/pls_playground.Rmd @@ -0,0 +1,104 @@ +--- +title: "PLS Training" +output: html_notebook +editor_options: + chunk_output_type: console +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error + +best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. 
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019_diagrams.R b/ccn2019_diagrams.R deleted file mode 100644 index 7da6569..0000000 --- a/ccn2019_diagrams.R +++ /dev/null @@ -1,51 +0,0 @@ -NB_avg %>% - mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% - ggplot(aes(targets, accuracy, color=factor(cluster))) + - ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - - -NB_avg %>% - ggplot(aes(lures, accuracy, color=condition)) + - ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(skewness, accuracy, color=condition)) + - ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, accuracy, color=condition)) + - ggtitle("lumpiness", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, rts, color=condition)) + - ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + - xlab("lumpiness") + - ylab("Average RT") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -nback <- NB_modified - -nback %>% - mutate(block=as.factor(block)) %>% - mutate(trial=as.factor(trial)) %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - #filter(condition=='3-back') %>% - #mutate(correct=as.numeric(correct)) %>% - inspect_cor(show_plot = T) - -averaged_nback <- NB_avg - -averaged_nback %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - inspect_cor(show_plot = T) - diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs 
%>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = 
"down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- 
seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] 
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion 
- Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back 
Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. 
-NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
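To make the weighting concrete: each $W$ above is a cost that is smallest when the realized statistic matches its intended value and approaches 1 as the deviation grows. A minimal sketch with hypothetical numbers (a 24-trial sequence with target ratio $T = 1/3$, so the ideal target count is 8):

```r
trials  <- 24
T_ratio <- 1/3                      # "T" spelled out to avoid masking TRUE
x_t     <- c(8, 12, 16)             # candidate target counts
W_t     <- 1 - abs(10 * dnorm(T_ratio * trials - x_t, sd = 4))
round(W_t, 2)                       # 0.00 0.40 0.87 -- lowest at the ideal count
```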
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
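One detail worth noting before the cost functions below: `GA::ga()` maximizes its fitness function, so a cost meant to be minimized has to be negated when handed to the GA (the commented-out `ga()` call in the next chunk passes the summed cost directly). A self-contained sketch with a toy one-dimensional cost:

```r
library(GA)
toy_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)  # minimum at x = 0
GA <- ga(type = "real-valued", fitness = function(x) -toy_cost(x),
         lower = -10, upper = 10, maxiter = 50, seed = 1)
GA@solution  # close to 0, where the cost is smallest
```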
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-#FIXME remove by column name
-seqs.uncorr <- seqs %>% select(-high_cor_remove)
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" columns (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
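The SSE/RMSE block in the accuracy script repeats the same arithmetic for each model; a small helper (hypothetical name, same formulas as the block above) keeps the two evaluations consistent:

```r
eval_fit <- function(predicted, observed, baseline) {
  SSE <- sum((observed - predicted)^2)
  SST <- sum((observed - baseline)^2)  # baseline: mean of the training response
  c(R_square = 1 - SSE / SST, RMSE = sqrt(SSE / length(predicted)))
}
eval_fit(pls.new_predicted, test_data$a, mean(train_data$a))
eval_fit(pls.old_predicted, test_data$a, mean(train_data$a))
```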
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+    )
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100,100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
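The overlaid ROC plot above compares the two correctness models visually; pROC can also test the AUC difference directly. A minimal sketch reusing the predicted probabilities from the script (DeLong's test is pROC's default for paired curves):

```r
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT, percent = TRUE)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT, percent = TRUE)
roc.test(roc.new, roc.common)  # paired test of the AUC difference
```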
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" column (the signal-detection criterion)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R
new file mode 100644
index 0000000..421a43f
--- /dev/null
+++ b/ccn2019/ccn2019-feature-selection.R
@@ -0,0 +1,55 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type
+#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+#if (toString(f[[2]]) == "correct")
+#  seqs.train <- ROSE(f, data = seqs.train.balanced)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]])
+
+
+# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"])
+# seqs.train.x <- upSamples %>% dplyr::select(-correct)
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+ctrl <- trainControl(method = "cv",
+                     number = 3,
+                     verboseIter = T)
+
+model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl)
+#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl)
+
+ctrl <- rfeControl(functions = rfFuncs,
+                   method = "cv",
+                   number = 3,
+                   verbose = T)
+
+rmProfile <- rfe(seqs.train.x, seqs.train.y,
+                 rfeControl = ctrl)
+
+summary(model)
+rmProfile
diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R
new file mode 100644
index 0000000..f757596
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-auc.R
@@ -0,0 +1,83 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+library(ppls)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- correct ~ n + t + stimulus_type
+f <- correct ~ n + tl + vl + sl + s + stimulus_type
+
+# predictors selected with stepAIC
+f <- correct ~ n + tl + t + l + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+set.seed(10)
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10,
+  classProbs = T,
+  summaryFunction = twoClassSummary,
+  sampling = "down"
+)
+
+grid <- expand.grid(
+  alpha = seq(0,1,length=10),
+  lambda = seq(0.0001, 0.1, length=10)
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "glmnet",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               tuneGrid = grid,
+               metric = "ROC",
+               trControl = ctrl)
+
+seqs.test.y_prob <- predict(model, seqs.test.x, type="prob")
+
+model
+plot(varImp(model, useModel = F))
+plot(model)
+max(model$results$ROC)
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 2000,
+    ci = T)
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- rt ~ n + t + v
+
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "RT" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method = "cv",
+                     number = 10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T, # needed for predict(..., type="prob") below
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by = 0.25)) # C must be positive for ksvm
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
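As a compact summary of the SVM's test performance, a threshold-based and a threshold-free metric can sit side by side; a minimal sketch reusing the predictions above:

```r
postResample(seqs.test.y, seqs.test.observed_y)       # Accuracy and Kappa
auc(roc(seqs.test.observed_y, seqs.test.y_prob$YES))  # AUC from the same fit
```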
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd new file mode 100644 index 0000000..c566656 --- /dev/null +++ b/ccn2019/pls_playground.Rmd @@ -0,0 +1,104 @@ +--- +title: "PLS Training" +output: html_notebook +editor_options: + chunk_output_type: console +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error + +best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. 
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019_diagrams.R b/ccn2019_diagrams.R deleted file mode 100644 index 7da6569..0000000 --- a/ccn2019_diagrams.R +++ /dev/null @@ -1,51 +0,0 @@ -NB_avg %>% - mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% - ggplot(aes(targets, accuracy, color=factor(cluster))) + - ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - - -NB_avg %>% - ggplot(aes(lures, accuracy, color=condition)) + - ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(skewness, accuracy, color=condition)) + - ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, accuracy, color=condition)) + - ggtitle("lumpiness", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, rts, color=condition)) + - ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + - xlab("lumpiness") + - ylab("Average RT") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -nback <- NB_modified - -nback %>% - mutate(block=as.factor(block)) %>% - mutate(trial=as.factor(trial)) %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - #filter(condition=='3-back') %>% - #mutate(correct=as.numeric(correct)) %>% - inspect_cor(show_plot = T) - -averaged_nback <- NB_avg - -averaged_nback %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - inspect_cor(show_plot = T) - diff --git a/dummy-vars-playground.R b/dummy-vars-playground.R deleted file mode 100644 index d79b629..0000000 --- a/dummy-vars-playground.R +++ /dev/null @@ -1,114 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -# seqs %>% -# ggplot(aes(x=v,y=a,col=correct)) + -# geom_jitter() + -# geom_point(alpha=0.1) + -# geom_smooth() - -f <- 
correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -# 1. dummy vars -# INPUTS : seqs -# OUTPUTS: seqs.dmy - -seqs <- seqs %>% - drop_na(rt, correct, tl,sl) - - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced -# seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -# ROC for each var -filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) - -# model <- cv.glmnet(seqs.train.x, -# seqs.train.y, -# alpha = 1, -# nfolds = 5, -# family = "binomial", -# type.measure = "auc") -# -# model$lambda.min - -ctrl <- trainControl(method="cv", - number=5, - classProbs=T, - verbose = T, -# sampling = "up", - savePredictions = T, - summaryFunction=twoClassSummary) - -# glmnet tune -tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) - -max_components <- n_distinct(attr(terms(f),"term.labels")) -# pls tune -tune <- expand.grid(ncomp=1:max_components) - -model <- train(seqs.train.x, - seqs.train.y, - method = "pls", - family = "binomial", - metric = "ROC", - preProc = c("center", "scale"), - verboseIter = TRUE, - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - 
-pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG 
dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = 
"pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- 
createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- 
sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
-                     number=10,
-#                    repeats = 1, #repeatedcv
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-tune <- expand.grid(C = seq(0.25,5,by=0.25)) # the SVM cost must be positive, so the grid starts at 0.25 rather than 0
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc = c("center", "scale"),
-               tuneLength = 10,
-               tuneGrid = tune,
-               trControl = ctrl)
-
-model$bestTune
-plot(model)
-
-seqs.test.y <- model %>% predict(seqs.test.x)
-seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
-
-confusionMatrix(seqs.test.y, seqs.test.observed_y)
-
-plot(varImp(model, scale = F, useModel = F))
-
-library(pROC)
-
-roc(seqs.test.observed_y,
-    seqs.test.y_prob$YES,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    of = "se",
-    boot.n = 200,
-    ci = T)
-
-
-# RT
-# data.frame(
-#   RMSE = RMSE(y.test, seqs.test$correct),
-#   Rsquare = R2(y.test, seqs.test$correct)
-# )
-
diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd
deleted file mode 100644
index 9220ff4..0000000
--- a/ccn2019.rev0.Rmd
+++ /dev/null
@@ -1,581 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: inline
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)
-  - local vs. global properties: which one matters the most?
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) {
-  # find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: the shorthand predictors (t, s, u, l, d) are not columns of NB_modified; the computed
-# columns are used instead, and d (d-prime) is omitted because it is never computed here
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot) # needed for corrplot() below
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
-
diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (i.e., non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a., foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
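
As a sketch of the cost shape (the notation here is illustrative; the constants come from the `dnorm` call in the chunk below): for a sequence statistic $x$ with target value $\mu$, each soft constraint is scored as

$$
C(x) = 1 - 10\,\phi(x - \mu;\ \sigma = 4),
\qquad
\phi(z; \sigma) = \frac{1}{\sigma\sqrt{2\pi}}\, e^{-z^2 / 2\sigma^2},
$$

an inverted Gaussian rather than a literal quadratic: it attains its minimum ($\approx 0.003$) at $x = \mu$ and approaches $1$ as $x$ moves away, so minimizing it pulls the statistic toward its target, which is what the `targets_cost` and `lures_cost` functions below implement.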
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: the shorthand predictors (t, s, u, l, d) are not columns of NB_modified; the computed
-# columns are used instead, and d (d-prime) is omitted because it is never computed here
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-    constraint3=fitness(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-# correlations are only defined for numeric columns, and seqs may still contain NAs
-cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs")
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove)) # drop the flagged predictors by column name
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.new_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
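# A sketch of that list-column workflow (illustrative only: the formula, the use of the
# NB2 columns defined below, and the fit_stats name are assumptions; broom supplies glance()):
#
#   library(broom)
#   NB2 %>%
#     group_by(participant, condition, block) %>%
#     nest() %>%
#     mutate(fit       = map(data, ~ lm(rt ~ x_t + x_l, data = .x)),
#            fit_stats = map(fit, glance)) %>%
#     unnest(fit_stats)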
+NB2 <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>% unnest(data) %>%   # nest/unnest round-trip leaves the grouped data unchanged
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(history = with_history(stimulus, window_size)) %>%
+  mutate(x_sl = with_skewness(history)) %>%
+  mutate(x_ul = with_lumpiness(history)) %>%
+  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
+  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
+  ungroup()
+
+pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
+NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# caret
+library(caret)
+# Compile cross-validation settings
+
+
+any(is.na(NB2))
+NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], nt=3)
+plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], nt=3)  # overwrites the rt fit above
+
+
+plsResult
+```
+
+
+
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+## Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (lack of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a. foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an inverted-Gaussian cost function to be minimized.
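+A quick numeric check of that shape (a minimal, standalone sketch; `cost` mirrors the `targets_cost`/`lures_cost` definition in the next chunk):
+
+```{r cost-shape, eval=FALSE}
+# inverted-Gaussian cost: cheapest at zero deviation, saturating toward 1
+cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+cost(0)   # ~0.003 (minimum)
+cost(4)   # ~0.40  (one sd of deviation)
+cost(10)  # ~0.96  (large deviations cost nearly the full unit)
+```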
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (drop non-predictor columns)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with lowest cv error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. Sort and visualize top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+library(corrplot)  # needed for corrplot() below
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+## Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (lack of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a. foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an inverted-Gaussian cost function to be minimized.
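+The next chunk leaves `lumpiness_cost()` as a stub around commented-out Python. A possible R translation of that Python (an assumption about the intended rule, not a confirmed implementation) flags a window as lumpy when the most frequent half of the choices covers more than two-thirds of the trials:
+
+```{r lumpiness-cost-sketch, eval=FALSE}
+# hypothetical completion of the lumpiness_cost() stub below
+lumpiness_cost <- function(x, choices) {
+  trials <- length(x)
+  freqs <- sapply(choices, function(c) sum(x == c))
+  top_half <- sort(freqs, decreasing = TRUE)[seq_len(length(choices) %/% 2)]
+  as.numeric(sum(top_half) > trials * 2 / 3)  # 1 = too lumpy, 0 = acceptable
+}
+```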
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red'))
+```
+
+```{r}
+model2 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a+al+s+sl+ll+l, data=., family = "gaussian")
+aug2 <- augment(model2)
+aug2 %>%
+  ggplot(aes(jitter(al),rt)) +
+  geom_point(alpha=0.2, shape=18) +
+  xlab("accuracy") +
+  geom_smooth(aes(y=.fitted), color='blue') +
+  geom_smooth(aes(y=aug1$.fitted), color='red')
+
+```
+
+```{r models}
+
+nb_split <- initial_split(NB2, prop = 0.75)
+training_data <- training(nb_split)
+testing_data <- testing(nb_split)
+cv_split <- vfold_cv(training_data, v = 5)
+cv_data <- cv_split %>%
+  mutate(
+    train = map(splits, ~training(.x)),
+    validate = map(splits, ~testing(.x))
+  )
+
+cv_models_lm_a <- cv_data %>%
+  mutate(model = map(train, ~lm(formula = a~., data = .x)),
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_glm_a <- cv_data %>%
+  mutate(model = map(train, ~glm(formula = a~., data = .x)),  # gaussian glm counterpart of the lm fit above
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_pls_a <- cv_data %>%
+  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
+         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>%
+  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y))  # refit with each fold's own best ncomp
+  )
+
+head(cv_models_pls_a)
+
+
+cv_models_pls_a1 <- cv_data[3][[1]]  # first training fold
+
+
+NBx <- NB %>%
+  group_by(participant) %>%
+  summarise(freq = as.data.frame(table(stimulus)))
+
+ggplot(bind_cols(NBx["participant"], NBx$freq), aes(stimulus, Freq, group=participant)) +  # table() yields stimulus/Freq columns
+  geom_point()
+
+
+#%>%
+#  mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
+#  mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
+#  mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# Compile cross-validation settings
+
+#any(is.na(NB2))
+#NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
+#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
+
+```
diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd
new file mode 100644
index 0000000..ba1b312
--- /dev/null
+++ b/ccn2019/ccn2019.rev3.Rmd
@@ -0,0 +1,100 @@
+
+$P=\langle V,D,C,W \rangle$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+#! ===============================================
+#! load required packages
+
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+library(inspectdf)
+library(caTools)
+library(pROC)
+
+#! ===============================================
+#! load data set and set running window size
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+```
+
+
+```{r preprocessing}
+
+#! ===============================================
+#!
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations
+#DEBUG inspect_cor(seqs, show_plot = T)
+
+
+```
+
+```{r remove_highly_correlated_predictors}
+# WIP: an extra step for non-PLS methods that removes highly correlated predictors
+cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "complete.obs")  # cor() needs numeric, complete columns
+cor_high <- findCorrelation(cor_matrix, 0.8)
+high_cor_remove <- row.names(cor_matrix)[cor_high]
+seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))  # drop the flagged columns by name
+```
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
new file mode 100644
index 0000000..7da6569
--- /dev/null
+++ b/ccn2019/ccn2019_diagrams.R
@@ -0,0 +1,51 @@
+NB_avg %>%
+  mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>%
+  ggplot(aes(targets, accuracy, color=factor(cluster))) +
+  ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+
+NB_avg %>%
+  ggplot(aes(lures, accuracy, color=condition)) +
+  ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(skewness, accuracy, color=condition)) +
+  ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, accuracy, color=condition)) +
+  ggtitle("lumpiness", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, rts, color=condition)) +
+  ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") +
+  xlab("lumpiness") +
+  ylab("Average RT") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+nback <- NB_modified
+
+nback %>%
+  mutate(block=as.factor(block)) %>%
+  mutate(trial=as.factor(trial)) %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  #filter(condition=='3-back') %>%
+  #mutate(correct=as.numeric(correct)) %>%
+  inspect_cor(show_plot = T)
+
+averaged_nback <- NB_avg
+
+averaged_nback %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  inspect_cor(show_plot = T)
+
diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R
new file mode 100644
index 0000000..9bbe23a
--- /dev/null
+++ b/ccn2019/dummy-vars-playground.R
@@ -0,0 +1,111 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+# seqs %>%
+#   ggplot(aes(x=v,y=a,col=correct)) +
+#   geom_jitter() +
+#   geom_point(alpha=0.1) +
+#   geom_smooth()
+
+f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus
+f <- correct ~ n + t + v + stimulus  # the short formula overrides the full one above
+
+set.seed(654321)
+
+# 1.
dummy vars
+# INPUTS : seqs
+# OUTPUTS: seqs.dmy
+
+seqs <- seqs %>%
+  drop_na(rt, correct, tl, sl)
+
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+# seqs.train <- ROSE(f, data = seqs.train.balanced)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+# ROC for each var
+filterVarImp(as.data.frame(seqs.train.x), seqs.train.y)
+
+# model <- cv.glmnet(seqs.train.x,
+#                    seqs.train.y,
+#                    alpha = 1,
+#                    nfolds = 5,
+#                    family = "binomial",
+#                    type.measure = "auc")
+#
+# model$lambda.min
+
+ctrl <- trainControl(method = "cv",
+                     number = 5,
+                     classProbs = T,
+                     verboseIter = T,
+#                    sampling = "up",
+                     savePredictions = T,
+                     summaryFunction = twoClassSummary)
+
+# glmnet tune
+tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100))
+
+max_components <- n_distinct(attr(terms(f),"term.labels"))
+# pls tune (overwrites the glmnet grid above; both are kept for switching methods)
+tune <- expand.grid(ncomp=1:max_components)
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "glmnet",
+               #family = "binomial",
+               metric = "ROC",  # twoClassSummary reports ROC/Sens/Spec
+               preProc = c("nzv","center", "scale"),
+               #verboseIter = TRUE,
+               tuneLength = 2,
+               #tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes = T,
+    plot = T,
+    lwd = 2,
+    col = "black",
+    print.auc = T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "auc",
+    boot.n = 100,
+    ci = T)
+
+
+
diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd
new file mode 100644
index 0000000..c566656
--- /dev/null
+++ b/ccn2019/pls_playground.Rmd
@@ -0,0 +1,104 @@
+---
+title: "PLS Training"
+output: html_notebook
+editor_options:
+  chunk_output_type: console
+---
+
+PLS:
+
+
+```{r}
+#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
+library(tidyverse)
+library(pls)
+
+## 1. load sample data
+#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
+
+rm(NB)
+load("./data/CL2015.RData")
+data <- NB
+str(data)
+
+## 2. clean data (drop non-predictor columns)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with lowest cv error
+
+best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5.
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019_diagrams.R b/ccn2019_diagrams.R deleted file mode 100644 index 7da6569..0000000 --- a/ccn2019_diagrams.R +++ /dev/null @@ -1,51 +0,0 @@ -NB_avg %>% - mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% - ggplot(aes(targets, accuracy, color=factor(cluster))) + - ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - - -NB_avg %>% - ggplot(aes(lures, accuracy, color=condition)) + - ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(skewness, accuracy, color=condition)) + - ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, accuracy, color=condition)) + - ggtitle("lumpiness", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, rts, color=condition)) + - ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + - xlab("lumpiness") + - ylab("Average RT") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -nback <- NB_modified - -nback %>% - mutate(block=as.factor(block)) %>% - mutate(trial=as.factor(trial)) %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - #filter(condition=='3-back') %>% - #mutate(correct=as.numeric(correct)) %>% - inspect_cor(show_plot = T) - -averaged_nback <- NB_avg - -averaged_nback %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - inspect_cor(show_plot = T) - diff --git a/dummy-vars-playground.R b/dummy-vars-playground.R deleted file mode 100644 index d79b629..0000000 --- a/dummy-vars-playground.R +++ /dev/null @@ -1,114 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -# seqs %>% -# ggplot(aes(x=v,y=a,col=correct)) + -# geom_jitter() + -# geom_point(alpha=0.1) + -# geom_smooth() - -f <- 
correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -# 1. dummy vars -# INPUTS : seqs -# OUTPUTS: seqs.dmy - -seqs <- seqs %>% - drop_na(rt, correct, tl,sl) - - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced -# seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -# ROC for each var -filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) - -# model <- cv.glmnet(seqs.train.x, -# seqs.train.y, -# alpha = 1, -# nfolds = 5, -# family = "binomial", -# type.measure = "auc") -# -# model$lambda.min - -ctrl <- trainControl(method="cv", - number=5, - classProbs=T, - verbose = T, -# sampling = "up", - savePredictions = T, - summaryFunction=twoClassSummary) - -# glmnet tune -tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) - -max_components <- n_distinct(attr(terms(f),"term.labels")) -# pls tune -tune <- expand.grid(ncomp=1:max_components) - -model <- train(seqs.train.x, - seqs.train.y, - method = "pls", - family = "binomial", - metric = "ROC", - preProc = c("center", "scale"), - verboseIter = TRUE, - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/pls_playground.Rmd b/pls_playground.Rmd deleted file mode 100644 index c566656..0000000 --- a/pls_playground.Rmd +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: "PLS Training" -output: html_notebook -editor_options: - chunk_output_type: console ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error - -best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -``` -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` \ No newline at end of file