diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R
deleted file mode 100644
index adfb442..0000000
--- a/ccn2019-accuracy.R
+++ /dev/null
@@ -1,108 +0,0 @@
-#==================================================#
-# model the "accuracy" column (a for global, and al for local accuracy)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-inspect_num(seqs.imputed)
-
-seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T
-)
-
-pls.new_model <- train(
-  a ~ .-al-dp-cr-rt-correct,
-  data = train_data.imbalanced,
-  method = "pls",
-  preProcess = c("center","scale"),
-  trControl = control
-)
-
-plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
-
-
-pls.old_model <- train(
-  a ~ t + n + v,
-  data = train_data,
-  method = "pls",
-  preProcess = c("center","scale"),
-  trControl = control
-)
-
-
-pls.old_model
-pls.new_model
-plot(varImp(pls.old_model))
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.old_model, pch = "|")
-
-resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
-summary(resamps)
-dotplot(resamps, metric = "Rsquared")
-difValues <- diff(resamps)
-bwplot(difValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
-
-
-summary(pls.new_model)
-
-
-# SSE and RMSE
-
-SSE <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
-SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; note the training-set mean
-R_square <- 1 - SSE/SST
-SSE <- sum((test_data$a - pls.new_predicted)^2)
-RMSE <- sqrt(SSE/length(pls.new_predicted))
-
-
-SSE <- sum((test_data$a - pls.old_predicted)^2)
-R_square <- 1 - SSE/SST
-SSE <- sum((test_data$a - pls.old_predicted)^2)
-RMSE <- sqrt(SSE/length(pls.old_predicted))
-
-
-as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
-  geom_point(alpha = 0.1, shape = 16) +
-  geom_smooth(method = lm, se = F) +
-  ggtitle("Accuracy: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
diff --git a/ccn2019-correct.R b/ccn2019-correct.R
deleted file mode 100644
index 9b9d9fb..0000000
--- a/ccn2019-correct.R
+++ /dev/null
@@ -1,148 +0,0 @@
-#==================================================#
-# model the "correct" column
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(pls)
-
-#devtools::install_github("sachsmc/plotROC")
-library(plotROC)
-
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-inspect_cat(seqs.imputed)
-inspect_num(seqs.imputed)
-
-seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  classProbs = T,
-  verboseIter = T,
-  savePredictions = T,
-  sampling = "down",
-  selectionFunction = "oneSE"
-)
-
-pls.new_model <- train(
-  correct ~ .-a-al-dp-cr-rt,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
-
-pls.common_model <- train(
-  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.common_model)
-plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
-
-
-#trellis.par.set(caretTheme())
-#densityplot(pls.new_model, pch = "|")
-#densityplot(pls.common_model, pch = "|")
-
-# Compile models and compare performance
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-#DEBUG summary(pls.models)
-#DEBUG dotplot(pls.models)
-#DEBUG diffValues <- diff(pls.models)
-bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
-
-
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
-
-confusionMatrix(pls.new_predicted, test_data$correct)
-confusionMatrix(pls.common_predicted, test_data$correct)
-
-library(pROC)
-par(pty="s")
-
-roc(test_data$correct,
-    pls.common_predicted_prob$CORRECT,
-    plot = T,
-    legacy.axes=T,
-    lwd=2,
-    col="darkgrey",
-    lty = 3,
-    print.auc = T,
-    print.auc.y = 45,
-    print.auc.x = 55,
-    percent = T,
-    ci = T,
-    boot.n = 100
-    )
-
-
-
-# roc_test_indices <- createDataPartition(test_data$correct,
-#                                         times = 10,
-#                                         p = 0.9,
-#                                         list = F)
-
-#for (i in 1:ncol(roc_test_indices)) {
-#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
-#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
-
-#  plot.roc(test_sample_correct,
-#           predprob_sample_correct,
-roc(test_data$correct,
-    pls.new_predicted_prob$CORRECT,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    add=T,
-    of = "se",
-    boot.n = 100,
-    ci = T)
-
-#}
-
-legend(100,100, legend=c("New Model", "Common Model"),
-       col=c("black", "darkgray"), lty=c(1,1), lwd=2, cex=0.9)
-
-# requires plotROC package
-#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
-#DEBUG   geom_roc()
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R
deleted file mode 100644
index 5bcefba..0000000
--- a/ccn2019-criterion.R
+++ /dev/null
@@ -1,119 +0,0 @@
-#==================================================#
-# model the "cr" column (decision criterion)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  cr ~ .-a-al-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-
-ggplot(varImp(pls.new_model)) +
-  labs(title="Criterion - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
-# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; note the training-set mean
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1, shape = 16) +
-  geom_smooth(method = glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) 
diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R
deleted file mode 100644
index ae5dc98..0000000
--- a/ccn2019-featsel.R
+++ /dev/null
@@ -1,38 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- ROSE(f, data = seqs.train.balanced)$data
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-
-ctrl <- rfeControl(functions = lrFuncs,
-                   method = "cv",
-                   number = 10,
-                   verbose = T)
-
-rmProfile <- rfe(seqs.train.x, seqs.train.y,
-                 preProcess = c("nzv"),
-                 rfeControl = ctrl)
-
-rmProfile
type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - 
-library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, 
train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = 
diff --git a/ccn2019-rt.R b/ccn2019-rt.R
deleted file mode 100644
index 4f6defb..0000000
--- a/ccn2019-rt.R
+++ /dev/null
@@ -1,117 +0,0 @@
-#==================================================#
-# model the "RT" column
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  rt ~ .-a-al-cr-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
-# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; note the training-set mean
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1, shape = 16) +
-  geom_smooth(method = glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
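Editor's note: ccn2019-rt.R above rebalances the training set with ROSE on `correct` and then fits regressions for `rt` and `cr` on the synthetic rows; whether synthetic balancing is desirable for a continuous outcome is debatable. A small sketch of what the ROSE step actually does, assuming the `train_data` frame built in the script:

```r
library(ROSE)

table(train_data$correct)  # imbalanced: many more CORRECT than INCORRECT rows

# ROSE draws a synthetic, roughly class-balanced sample conditioned on `correct`
train_data.balanced <- ROSE(correct ~ ., data = train_data, seed = 1)$data

table(train_data.balanced$correct)  # now approximately 50/50
```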
diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
                     number=10,
-#                    repeats = 1, #repeatedcv
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-tune <- expand.grid(C = seq(0,5,by=0.25))
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc = c("center", "scale"),
-               tuneLength = 10,
-               tuneGrid = tune,
-               trControl = ctrl)
-
-model$bestTune
-plot(model)
-
-seqs.test.y <- model %>% predict(seqs.test.x)
-seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
-
-confusionMatrix(seqs.test.y, seqs.test.observed_y)
-
-plot(varImp(model, scale = F, useModel = F))
-
-library(pROC)
-
-roc(seqs.test.observed_y,
-    seqs.test.y_prob$YES,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    of = "se",
-    boot.n = 200,
-    ci = T)
-
-
-# RT
-# data.frame(
-#   RMSE = RMSE(y.test, seqs.test$correct),
-#   Rsquare = R2(y.test, seqs.test$correct)
-# )
-
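Editor's note: in the deleted ccn2019-svm.R, `ctrl` is built without `classProbs = TRUE`, yet the script later calls `predict(..., type = "prob")` and feeds `$YES` into `roc()`; with caret, that predict call fails unless the model was trained with class probabilities enabled (and the outcome levels are valid R names such as `YES`/`NO`, as the ROC call implies). A sketch of a control object that would support the ROC step:

```r
library(caret)

ctrl <- trainControl(method = "cv",
                     number = 10,
                     sampling = "up",
                     classProbs = TRUE,   # required before predict(..., type = "prob")
                     savePredictions = TRUE,
                     verbose = TRUE)
```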
diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd
deleted file mode 100644
index 9220ff4..0000000
--- a/ccn2019.rev0.Rmd
+++ /dev/null
@@ -1,581 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: inline
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)
-  - local vs. global properties: which one matters the most?
-  - Local: lumpiness, short sequence patterns -> could be exploited by "reactive" automaticity
-  - Global: no lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n-N,\,sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t,\,sd=4)| \\
-x_{tl} &= {T \times w \over trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - {T \over trials} \times w,\,sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= {trials \over |V|} \\
-x_{sl} &= \max(1, {w \over |V|}) \\
-x_{g} &= {trials \over w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
-
-```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(plsRglm)
-library(plsdof)
-library(caret)
-```
-
-```{r params}
-load('./data/CL2015.RData')
-
-window_size <- 8
-```
-
-
-
-```{r history}
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-with_history <- function(stimuli, length=16, fixed=F) {
-  seq <- paste(stimuli, collapse = '')
-
-  sapply(1:length(stimuli), function(i) {
-    stri_reverse(str_sub(seq, max(1,i-length+1), i))
-  })
-  #ifelse(fixed, h[str_length(h)==size], h)
-}
-
-# $x_{s,local}$
-with_skewness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    sum(sort(freqs, decreasing = T)[1:2]) - 1
-  })
-}
-
-# $x_{u,local}$
-with_lumpiness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    max(freqs) - 1
-  })
-}
-
-
-with_targets_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="target"]) / length(trials)
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="lure"]) / length(trials)
-  })
-}
-
-#TODO change to a list-column workflow with broom for model fitting and evaluating the fits;
-# we are using a list nested inside a tibble, so put all new columns in a new list column
-# instead of adding a new column for each.
-NB2 <- NB %>%
-  group_by(participant, condition, block) %>%
-  nest() %>% unnest(data) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(history = with_history(stimulus, window_size)) %>%
-  mutate(x_sl = with_skewness(history)) %>%
-  mutate(x_ul = with_lumpiness(history)) %>%
-  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
-  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
-  ungroup()
-
-pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
-NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# caret
-library(caret)
-# Compile cross-validation settings
-
-
-any(is.na(NB2))
-NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], 3)
-plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], 3)
-
-
-plsResult
-```
-
-
-
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ skewness score
-- $U$ uniformity (!repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
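Editor's note: a quick numeric check of the `dnorm`-based weights defined in the CSP section above. The weight is smallest when a constraint is exactly satisfied and approaches 1 as the deviation grows, which matches the "to be minimized" framing; a sketch (the helper `w` is hypothetical):

```r
# W = 1 - |10 * dnorm(delta, sd = 4)|, where delta is the deviation from
# the constrained value (e.g. x_t - T*trials for the target-count weight).
w <- function(delta) 1 - abs(10 * dnorm(delta, sd = 4))

w(0)   # ~0.003 : constraint exactly satisfied, minimal cost
w(4)   # ~0.395 : deviation of one sd
w(12)  # ~0.989 : large deviations saturate near 1
```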
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # Python sketch still to be ported:
-  #   trials = len(seq)
-  #   freqs = [float(seq.count(c)) for c in choices]
-  #   ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #   return ralph_skewed
-  NA
-}
-
-#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''), from=-size, length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct, na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-
-fit <- lm(correct ~ t * s * u * l * d, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN clustering (RT + accuracy against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 16 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(t,s,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=u)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(t),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(u),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-        constraint3=fitness(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
-
-
-
-
-```{python}
-a=2
-p
-```
-
-
-
----
-title: "PLS Training"
-output: html_notebook
----
-
-PLS:
-
-
-```{r}
-#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
-library(tidyverse)
-library(pls)
-
-## 1. load sample data
-#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
-
-rm(NB)
-load("./data/CL2015.RData")
-data <- NB
-str(data)
-
-## 2. clean data (drop unused columns)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-#   rename(
-#     ev.participant=participant,
-#     ev.n=n,
-#     ev.block=block,
-#     ev.stimulus_type=stimulus_type,
-#     rv.choice=choice,
-#     rv.rt=rt,
-#     rv.correct=correct
-#   )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with the lowest CV error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. sort and visualize top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct,accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV; nt is the maximum number of components
-#cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
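Editor's note: the "PLS Training" notebook above picks `ncomp` by hand from the adjusted-CV RMSEP curve. For single-response models, the pls package also provides `selectNcomp()`, which formalizes the same choice with a one-sigma (or randomization) rule; a minimal sketch on simulated data, mirroring the notebook's `simulate` chunk but with a single response:

```r
library(pls)

set.seed(1)
X <- matrix(rnorm(1100), 100, 11)
y <- rnorm(100)

fit <- plsr(y ~ X, validation = "CV")

# smallest ncomp whose CV error is within one SE of the minimum
best_dims <- selectNcomp(fit, method = "onesigma")
fit <- plsr(y ~ X, ncomp = max(best_dims, 1))  # selectNcomp() can return 0
```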
-densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model 
Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), 
main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = 
diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R
deleted file mode 100644
index ae5dc98..0000000
--- a/ccn2019-featsel.R
+++ /dev/null
@@ -1,38 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-seqs <- seqs %>% drop_na(rt, correct, tl,sl)
-
-f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- ROSE(f, data = seqs.train.balanced)$data
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-
-ctrl <- rfeControl(functions = lrFuncs,
-                   method = "cv",
-                   number = 10,
-                   verbose = T)
-
-rmProfile <- rfe(seqs.train.x, seqs.train.y,
-                 preProcess = c("nzv"),
-                 rfeControl = ctrl)
-
-rmProfile
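`rfe` returns more than the default printout: the winning variable subset and the performance-vs-subset-size profile are worth pulling out explicitly. A sketch against the `rmProfile` object above (`predictors()` and the `rfe` plot method are standard caret accessors):

```r
# Sketch: examine the RFE profile.
predictors(rmProfile)            # variables in the selected subset
rmProfile$optsize                # chosen subset size

# Resampled performance across candidate subset sizes
plot(rmProfile, type = c("g", "o"))
```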
diff --git a/ccn2019-rt.R b/ccn2019-rt.R
deleted file mode 100644
index 4f6defb..0000000
--- a/ccn2019-rt.R
+++ /dev/null
@@ -1,117 +0,0 @@
-#==================================================#
-# model the "rt" column (reaction time)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize the split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_bar(position='dodge') +
-  labs(title="Class balance: ROSE-balanced train vs. test")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  rt ~ .-a-al-cr-dp-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
-
-# NOTE: the original script trained the common model on `cr` (criterion),
-# apparently a copy-paste slip in an RT script; corrected to `rt` here.
-pls.common_model <- train(
-  rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE, R^2, and RMSE
-#
-# SSE <- sum((test_data$rt - pls.new_predicted)^2)    # sum of squared errors
-# SST <- sum((test_data$rt - mean(train_data$rt))^2)  # total sum of squares, computed with the training mean
-# R_square <- 1 - SSE/SST
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-# SSE <- sum((test_data$rt - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-# axis limits from the criterion script dropped; set them to the RT scale if needed
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
-  ggplot(aes(predicted, observed)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=lm, se=F) +
-  ggtitle("Reaction Time: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")

diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl,sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
-                     number=10,
-#                    repeats = 1, #repeatedcv
-                     classProbs = T, # required for the type="prob" predictions below
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-tune <- expand.grid(C = seq(0.25, 5, by=0.25)) # the SVM cost C must be positive, so start above zero
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc = c("center", "scale"),
-               tuneGrid = tune,
-               trControl = ctrl)
-
-model$bestTune
-plot(model)
-
-seqs.test.y <- model %>% predict(seqs.test.x)
-seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
-
-confusionMatrix(seqs.test.y, seqs.test.observed_y)
-
-plot(varImp(model, scale = F, useModel = F))
-
-library(pROC)
-
-roc(seqs.test.observed_y,
-    seqs.test.y_prob$YES,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    of = "se",
-    boot.n = 200,
-    ci = T)
-
-
-# RT
-# data.frame(
-#   RMSE = RMSE(y.test, seqs.test$correct),
-#   Rsquare = R2(y.test, seqs.test$correct)
-# )
-
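A linear kernel is a reasonable first pass; if the decision boundary is nonlinear, caret's `svmRadial` adds a kernel-width parameter tuned jointly with the cost. A hedged sketch reusing the `ctrl` object above (the `sigma`/`C` grid is illustrative, not tuned):

```r
# Sketch: radial-basis SVM over a joint sigma/C grid.
tune.rbf <- expand.grid(sigma = c(0.001, 0.01, 0.1),
                        C     = c(0.25, 0.5, 1, 2, 4))

model.rbf <- train(seqs.train.x,
                   seqs.train.y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneGrid = tune.rbf,
                   trControl = ctrl)

model.rbf$bestTune
```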
diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd
deleted file mode 100644
index 9220ff4..0000000
--- a/ccn2019.rev0.Rmd
+++ /dev/null
@@ -1,581 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: inline
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behavior. Under a specified structure, these biases allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
-  - Local vs. global properties: which matters most?
-    - Local: lumpiness, short sequence patterns -> could be exploited by "reactive"/automatic strategies
-    - Global: no lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_n, x_t, x_{tl}, x_l, x_{ll}, x_v, x_{ul}, x_s, x_{sl}, x_g, x_{vl}\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times \mathrm{dnorm}(x_n - N,\, sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times \mathrm{dnorm}(T \times trials - x_t,\, sd=4)| \\
-x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times \mathrm{dnorm}(x_{tl} - \tfrac{T}{trials} \times w,\, sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= \frac{trials}{|V|} \\
-x_{sl} &= \max\!\left(1, \frac{w}{|V|}\right) \\
-x_{g} &= \frac{trials}{w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
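To make the weighting concrete: each soft constraint scores a candidate sequence by how far a statistic falls from its target, with a Gaussian cost that is near zero at the optimum and approaches one far from it (these are costs to be minimized). A small sketch of the target-count cost $W_t$; the `dnorm` form and the `sd = 4` width come from the formulas above, the argument values are purely illustrative.

```r
# Sketch: Gaussian cost for the target count (near 0 at the target, toward 1 away from it).
# x_t: observed number of targets; T: desired target ratio; trials: sequence length.
target_cost <- function(x_t, T, trials) {
  1 - abs(10 * dnorm(T * trials - x_t, sd = 4))
}

target_cost(x_t = 8, T = 1/3, trials = 24)  # at the optimum: cost near 0
target_cost(x_t = 2, T = 1/3, trials = 24)  # six targets short: cost rises toward 1
```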
-
-```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(plsRglm)
-library(plsdof)
-library(caret)
-```
-
-```{r params}
-load('./data/CL2015.RData')
-
-window_size <- 8
-```
-
-
-
-```{r history}
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-with_history <- function(stimuli, length=16, fixed=F) {
-  seq <- paste(stimuli, collapse = '')
-
-  sapply(1:length(stimuli), function(i) {
-    stri_reverse(str_sub(seq, max(1,i-length+1), i))
-  })
-  #ifelse(fixed, h[str_length(h)==size], h)
-}
-
-# $x_{s,local}$
-with_skewness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    sum(sort(freqs, decreasing = T)[1:2]) - 1
-  })
-}
-
-# $x_{u,local}$
-with_lumpiness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    max(freqs) - 1
-  })
-}
-
-
-with_targets_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="target"]) / length(trials)
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="lure"]) / length(trials)
-  })
-}
-
-#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
-# (we are already using lists nested inside a tibble, so put all new columns in a single
-# list column instead of adding a new column for each; see the sketch after this chunk)
-NB2 <- NB %>%
-  group_by(participant, condition, block) %>%
-  nest() %>% unnest(data) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(history = with_history(stimulus, window_size)) %>%
-  mutate(x_sl = with_skewness(history)) %>%
-  mutate(x_ul = with_lumpiness(history)) %>%
-  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
-  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
-  ungroup()
-
-pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
-NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-any(is.na(NB2))
-NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-plsResult.rt <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], nt=3)
-plsResult.correct <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], nt=3)
-
-plsResult.rt
-plsResult.correct
-```
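The TODO above asks for a list-column workflow. A minimal sketch of what that could look like with tidyr/purrr/broom: fit one model per participant and condition, keeping the tidied fits alongside the nested data. Column names follow `NB2`; the `lm` formula is illustrative only.

```r
# Sketch: per-group model fits via list columns (tidyr + purrr + broom).
library(broom)

NB2.fits <- NB2 %>%
  group_by(participant, condition) %>%
  nest() %>%                                                      # one row per group, data in a list column
  mutate(fit    = map(data, ~ lm(rt ~ x_sl + x_ul + x_t + x_l, data = .x)),
         tidied = map(fit, tidy)) %>%
  unnest(tidied)                                                  # coefficient table per group
```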
-
-
-
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ skewness score
-- $U$ uniformity (the inverse of repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # Python reference implementation, not yet ported:
-  #   trials = len(seq)
-  #   freqs = [float(seq.count(c)) for c in choices]
-  #   ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #   return ralph_skewed
-  NA
-}
-
-# NOTE: ga() maximizes its fitness, so pass the negated cost
-#merged_cost <- function(x) targets_cost(x) + lures_cost(x) + skewness_cost(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # TODO find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: t, s, u, l, and d are not columns of NB_modified; using the computed scores instead
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN clustering (RT + accuracy against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
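`eps = 0.3` is a guess; the dbscan package's recommended heuristic is to read `eps` off the knee of a k-nearest-neighbor distance plot. A short sketch, assuming the dbscan package as loaded above (`minPts = 3` implies `k = 3`):

```r
# Sketch: pick eps from the knee of the k-NN distance curve.
library(dbscan)

kNNdistplot(cbind(NB_avg$correct, NB_avg$rt), k = 3)
abline(h = 0.3, lty = 2)  # the eps used above, for comparison
```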
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot)
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-## (cv.modpls is needed below by cvtable, so it is no longer commented out)
-cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8)
-
-modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-

diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ skewness score
-- $U$ uniformity (the inverse of repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # Python reference implementation, not yet ported:
-  #   trials = len(seq)
-  #   freqs = [float(seq.count(c)) for c in choices]
-  #   ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #   return ralph_skewed
-  NA
-}
-
-# NOTE: ga() maximizes its fitness, so pass the negated cost
-#merged_cost <- function(x) targets_cost(x) + lures_cost(x) + skewness_cost(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # TODO find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: t, s, u, l, and d are not columns of NB_modified; using the computed scores instead
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN clustering (RT + accuracy against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-## NOTE: the original used t/s/u/l shorthand, which are not columns of NB_modified;
-## replaced with the computed score columns (targets, skewness, lumpiness, lures)
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-   - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-     constraint3=fitness3(history))
-   - kmeans(NB) (see the sketch below)
-   - ggplot(kmeans_clusters$accuracy)
-   - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a = 2
-print(a)
-```
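The TODO above asks for a k-means pass analogous to the DBSCAN clustering. A hedged sketch on the participant-level averages; `centers = 3` is arbitrary, and the inputs are scaled first because `rt` and `correct` are on different units.

```r
# Sketch: k-means on the participant-level averages (k chosen arbitrarily).
set.seed(42)

km <- kmeans(scale(NB_avg[, c("correct", "rt")]), centers = 3)

NB_avg %>%
  mutate(kcluster = factor(km$cluster)) %>%
  ggplot(aes(skewness, correct, color = kcluster)) +
  geom_point(alpha = 0.3) +
  facet_wrap(~condition)
```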
-inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, 
- method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - 
seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - 
summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = 
"|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical 
properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. 
-NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a
-## single subject in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct, na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-
-# NOTE: the t/s/u/l/d aliases were stale and d' is never computed in this
-# notebook, so fit on the columns that actually exist in NB_modified
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(paste0("Clusters (window = ", window_size, " trials)"), "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets, skewness, color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt, correct, color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, pc2, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, rt, color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, correct, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-    constraint3=fitness3(history))
-  - kmeans(NB) (see the sketch below)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a=2
-```
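-
-A minimal sketch of the k-means idea from the TODO list (illustration only; it assumes the participant-level `NB_avg` frame computed above, and `centers = 2` is an arbitrary choice):
-
-```{r, eval=F}
-set.seed(1)
-# cluster participants on standardized accuracy/RT, then color points by cluster
-km <- kmeans(scale(NB_avg[, c("rt", "correct")]), centers = 2)
-
-NB_avg %>%
-  mutate(cluster = factor(km$cluster)) %>%
-  ggplot(aes(rt, correct, color = cluster)) +
-  geom_point(alpha = 0.5) +
-  facet_wrap(~condition)
-```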
-
-
-
----
-title: "PLS Training"
-output: html_notebook
----
-
-PLS:
-
-
-```{r}
-#detach("package:MASS") # MASS (via plsdof) masks dplyr::select
-library(tidyverse)
-library(pls)
-
-## 1. load sample data
-#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
-
-rm(NB)
-load("./data/CL2015.RData")
-data <- NB
-str(data)
-
-## 2. clean data (drop design columns that should not enter the model)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-#   rename(
-#     ev.participant=participant,
-#     ev.n=n,
-#     ev.block=block,
-#     ev.stimulus_type=stimulus_type,
-#     rv.choice=choice,
-#     rv.rt=rt,
-#     rv.correct=correct
-#   )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with the lowest CV error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. sort and visualize the top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot)  # corrplot is used below but was never attached
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-# (cv.modpls must be fit for the cvtable() call below to work)
-cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
-
diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a. foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
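-
-To make the uniformity constraint concrete, here is a toy check of the deviation-from-uniform logic used by `skewness_cost` in the next chunk (illustration only):
-
-```{r, eval=F}
-# 8 trials over 4 choices; a perfectly uniform sequence would have 2 of each
-x <- c("A","A","A","B","B","C","C","D")
-table(x)                        # A=3, B=2, C=2, D=1
-# skewness_cost(x, c("A","B","C","D")) returns max(|count - 2|) = 1
-```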
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-lures_cost   <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to a gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  # flag "Ralph-skewed" sequences: the most frequent half of the choices
-  # accounts for more than 2/3 of all trials (translated from the Python
-  # sketch that was left here as comments)
-  freqs <- sapply(choices, function(c) sum(x == c))
-  top_half <- sort(freqs, decreasing = TRUE)[1:max(1, length(choices) %/% 2)]
-  sum(top_half) > (length(x) * 2 / 3)
-}
-
-#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]  # stri_length: stringr is not loaded here
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # TODO find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c(as.character(stims[1]))  # seed with the first stimulus so it is not dropped
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''), from=-size, length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
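-
-# (illustration only) 2-back lure tagging on a toy sequence: positions 2 and 5
-# repeat the 1-back / 3-back stimulus, so both are relabeled as lures
-#   h <- with_history(c("A","A","B","C","A"), size=4)
-#   with_lures(rep("2-back", 5), c("A","A","B","C","A"), rep("foil", 5), h)
-#   => "foil" "lure" "foil" "foil" "lure"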
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a
-## single subject in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct, na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-
-# NOTE: the t/s/u/l/d aliases were stale and d' is never computed in this
-# notebook, so fit on the columns that actually exist in NB_modified
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(paste0("Clusters (window = ", window_size, " trials)"), "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets, skewness, color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt, correct, color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, pc2, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, rt, color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1, correct, color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm", se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-    constraint3=fitness3(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a=2
-```
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
---- 
-
-# Problems
-
-Statistical properties of n-back sequences bias behavior. Under a specified structure, these biases allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- It is unclear how to parameterize interesting variations for sequence generation.
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
-  - Local vs. global properties: which matters most?
-  - Local: lumpiness and short sequence patterns -> could be exploited by "reactive"/automatic responding
-  - Global: no lures and a large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
-x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= \frac{trials}{|V|} \\
-x_{sl} &= \max(1, \frac{w}{|V|}) \\
-x_{g} &= \frac{trials}{w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-#library(plsRglm)
-#library(plsdof)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-
-```
-
-```{r preprocessing}
-
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures, stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back', 2, 3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T, .size=window_size),
-         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
-         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
-         sl = ifelse(is.na(sl), 0, sl),
-         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
-         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
-         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus, collapse = ''))) %>%
-  # use map_int/map_dbl so the block-level stats come out as plain numeric
-  # columns rather than list columns (they are used as predictors below)
-  mutate(a = map_int(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_int(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_int(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_int(local_stats, ~length(unique(.x$stimulus)))) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition)
-
-inspectdf::inspect_cor(seqs)
-#inspect_cor(NB, show_plot = T)
-```
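-
-As a quick illustration of the rolling-window statistics above (this assumes the pre-`slider` tsibble API, whose `slide_dbl` takes `.size`/`.partial` as used in this notebook):
-
-```{r, eval=F}
-# rolling 2-trial target counts; .partial=TRUE also evaluates the short
-# leading window, so the first value is computed from a single trial
-slide_dbl(c("target","foil","target","target"),
-          ~length(which(. == "target")), .size = 2, .partial = TRUE)
-#> 1 1 1 2
-```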
-
-```{r}
-# NOTE: the modeling chunks below originally referred to NB2 from the rev0
-# notebook; rebuild an equivalent trial-level frame here (assumption: these
-# models were meant to run on per-trial rows) by unnesting the local stats
-NB2 <- seqs %>% unnest(local_stats)
-
-model1 <- NB2 %>%
-  glm(rt ~ t + n + a, data = ., family = "gaussian")
-aug1 <- augment(model1)
-aug1 %>%
-  ggplot(aes(a, rt)) +
-  geom_point() +
-  geom_smooth(aes(y = .fitted), color = 'red')
-```
-
-```{r}
-model2 <- NB2 %>%
-  glm(rt ~ t + n + a + al + s + sl + ll + l, data = ., family = "gaussian")
-aug2 <- augment(model2)
-aug2 %>%
-  ggplot(aes(jitter(al), rt)) +
-  geom_point(alpha=0.2, shape=18) +
-  xlab("local accuracy") +
-  geom_smooth(aes(y = .fitted), color = 'blue') +
-  geom_smooth(aes(y = aug1$.fitted), color = 'red')
-
-```
-
-```{r models}
-
-nb_split <- initial_split(NB2, prop = 0.75)
-training_data <- training(nb_split)
-testing_data <- testing(nb_split)
-cv_split <- vfold_cv(training_data, v = 5)
-cv_data <- cv_split %>%
-  mutate(
-    train = map(splits, ~training(.x)),
-    validate = map(splits, ~testing(.x))
-  )
-
-cv_models_lm_a <- cv_data %>%
-  mutate(model = map(train, ~lm(formula = a~., data = .x)),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_glm_a <- cv_data %>%
-  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_pls_a <- cv_data %>%
-  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
-         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate = "adjCV", , ]) - 1)) %>%
-  # map2 pairs each fold's training set with that fold's best dimensionality
-  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)))
-
-head(cv_models_pls_a)
-
-
-cv_models_pls_a1 <- cv_data[3][[1]]
-
-
-# per-participant stimulus frequencies
-NBx <- NB %>%
-  group_by(participant, stimulus) %>%
-  summarise(freq = n())
-
-ggplot(NBx, aes(stimulus, freq, group=participant)) +
-  geom_point()
-
-
-#%>%
-#  mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
-#mutate(model.pca = map(design_matrix, ~prcomp(~rt, .x, center=T, scale.=T, na.action=na.exclude))) %>%
-#mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-#any(is.na(NB2))
-#NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], 3)
-#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], 3)
-
-```
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: an extra step for non-PLS methods to remove highly correlated predictors.
-# cor() needs numeric input, so factor columns are excluded first.
-cor_matrix <- cor(seqs %>% select(where(is.numeric)), use = "pairwise.complete.obs")
-cor_high <- findCorrelation(cor_matrix, cutoff = 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
-```
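Downstream of this step, the decorrelated predictor set can feed non-PLS baselines directly. A minimal sketch (assuming `seqs.uncorr` still carries the `correct` factor; the outcome-like columns are dropped first, and `baseline_fit` is a hypothetical name):

```{r, eval=F}
# logistic-regression baseline on the decorrelated predictors (illustrative only)
baseline_df <- seqs.uncorr %>% select(-any_of(c("rt", "a", "al", "dp", "cr")))
baseline_fit <- glm(correct ~ ., data = baseline_df, family = binomial)
summary(baseline_fit)
```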
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R
deleted file mode 100644
index 5bcefba..0000000
--- a/ccn2019-criterion.R
+++ /dev/null
@@ -1,119 +0,0 @@
-#==================================================#
-# model the criterion column (cr)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  cr ~ .-a-al-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-
-ggplot(varImp(pls.new_model)) +
-  labs(title="Criterion - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
-# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-
ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - 
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  rt ~ .-a-al-cr-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
-
-pls.common_model <- train(
-  rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models)
-bwplot(diffValues, layout=c(1,3))
-
-
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
-pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
-pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
-pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
-
-# SSE and RMSE
-#
-# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
-# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$rt - pls.new_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$rt - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$rt - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
-  ggplot(aes(predicted, observed)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=glm) +
-  ggtitle("Reaction Time: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
-                     number=10,
-#                     repeats = 1, #repeatedcv
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-# note: the SVM cost parameter must be positive, so the grid starts at 0.25
-tune <- expand.grid(C = seq(0.25, 5, by=0.25))
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc
= c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- 
table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
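-
-To see that shape, a quick sketch (using the same `dnorm`-based cost the chunk below defines) plots the inverted-Gaussian penalty: it bottoms out at zero deviation, behaves like a quadratic bowl near the minimum, and saturates at 1.0 for large deviations:
-
-```{r, eval=F}
-# cost is lowest at zero deviation from the constraint and approaches 1.0 far away
-curve(1.0 - 10 * dnorm(x, mean = 0, sd = 4), from = -15, to = 15,
-      xlab = "deviation from constraint target", ylab = "cost")
-```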
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (drop design columns that are not model features)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-#  rename(
-#    ev.participant=participant,
-#    ev.n=n,
-#    ev.block=block,
-#    ev.stimulus_type=stimulus_type,
-#    rv.choice=choice,
-#    rv.rt=rt,
-#    rv.correct=correct
-#  )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with lowest cv error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. Sort, and visualize top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot)
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8)
-
-modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
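The fit-then-`RMSEP`-then-refit pattern above recurs across these notebooks; a small helper makes it reusable (a sketch assuming a single numeric response and `pls::plsr`'s formula interface; `plsr_best` is a hypothetical name):

```{r, eval=F}
library(pls)

# choose ncomp by adjusted-CV RMSEP, then refit once with that size (sketch)
plsr_best <- function(formula, data) {
  fit0 <- plsr(formula, data = data, validation = "CV")
  best <- which.min(RMSEP(fit0)$val["adjCV", , ]) - 1
  plsr(formula, data = data, ncomp = max(1, best))
}

# usage: plsr_best(rt ~ ., data = data)
```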
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ 
-#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- 
createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - 
mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- 
createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - 
-plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - 
print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with 
broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
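The cost functions in the next chunk implement this with `dnorm`: the penalty is near zero when a sequence property matches its target value and saturates toward 1 as the deviation grows. A minimal sketch of that shape (the deviation grid here is illustrative, not part of the original analysis):

```{r cost_shape, eval=FALSE}
# Sketch: a dnorm-based constraint cost is minimal at zero deviation
# from the target count and approaches 1 for large deviations.
deviation <- seq(-10, 10, by = 0.5)
cost <- 1.0 - 10 * dnorm(deviation, mean = 0, sd = 4)
plot(deviation, cost, type = "l",
     xlab = "deviation from target count", ylab = "cost")
```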
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
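The commented-out `merged_cost` in the chunk below hints at how the per-constraint costs combine into a single objective. A hypothetical sketch of that idea, assuming dnorm-based costs as above (the goal counts and the `sequence_cost` helper are illustrative assumptions, not the original implementation):

```{r merged_cost_sketch, eval=FALSE}
# Hypothetical sketch: sum dnorm-based costs over two constraints and
# compare candidate sequences; a lower total cost means a better sequence.
# The goal counts (8 targets, 4 lures) are illustrative assumptions.
gauss_cost <- function(deviation) 1.0 - 10 * dnorm(deviation, mean = 0, sd = 4)

sequence_cost <- function(n_targets, n_lures, target_goal = 8, lure_goal = 4) {
  gauss_cost(n_targets - target_goal) + gauss_cost(n_lures - lure_goal)
}

sequence_cost(8, 4)   # counts match the goals -> near-minimal cost
sequence_cost(12, 1)  # deviant counts -> substantially higher cost
```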
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
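As a concrete illustration of such properties, a toy 3-back sequence and two of the statistics used throughout these notebooks (the sequence is made up for illustration, not drawn from the data):

```{r sequence_stats_sketch, eval=FALSE}
# Illustrative toy example: target ratio and lumpiness of a 3-back sequence.
toy_seq <- c("A", "B", "C", "A", "B", "C", "C", "B")
n <- 3
is_target <- sapply(seq_along(toy_seq), function(i)
  i > n && toy_seq[i] == toy_seq[i - n])
mean(is_target)          # proportion of targets (here 0.5)
max(table(toy_seq)) - 1  # lumpiness: repeats of the most frequent stimulus
```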
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+    )
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100,100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
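The two `roc()` calls above only overlay the curves and print their AUCs; pROC can also test whether the AUCs differ. The sketch below reuses the same objects from the script; the paired DeLong test is an addition for illustration, not something the committed file does.

```r
# Rebuild the two ROC objects and compare their AUCs formally.
library(pROC)
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT)
auc(roc.new)
auc(roc.common)
roc.test(roc.new, roc.common, method = "delong")  # paired: same test responses
```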
geom_roc() diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - 
-inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) 
- -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + 
vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = 
c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - 
print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - 
}) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(pls.models) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" column (response criterion) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - 
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" columns (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; use the training mean here
+R_square.new <- 1 - SSE/SST
+RMSE.new <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square.old <- 1 - SSE/SST
+RMSE.old <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=lm, se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
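The manual SSE/RMSE block in ccn2019-accuracy.R can be cross-checked with caret's built-in `postResample()`, which returns RMSE, R², and MAE in one call. Note that caret's R² is the squared correlation between predictions and observations, so it can differ from the 1 − SSE/SST definition used above. A sketch against the same objects:

```r
# Cross-check of the hand-rolled metrics above (sketch).
postResample(pred = pls.new_predicted, obs = test_data$a)
postResample(pred = pls.old_predicted, obs = test_data$a)
```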
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+)
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100, 100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3), lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
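The two ROC curves in ccn2019-correct.R are compared only visually. pROC can also test the AUC difference directly; a sketch, assuming the two `roc` objects are stored instead of drawn inline (`roc.new` and `roc.common` are names introduced here, not in the original script):

```r
# Formal AUC comparison for the two correctness models (sketch).
# roc.test() defaults to DeLong's test for paired curves.
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT, percent = TRUE)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT, percent = TRUE)

roc.test(roc.new, roc.common, paired = TRUE)  # H0: the two AUCs are equal
```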
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" (criterion) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize the class balance of the split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Train/Test Class Balance")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; use the training mean here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
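The commented-out SSE/RMSE block in ccn2019-criterion.R repeats the same arithmetic for each model. If it is ever revived, a small helper keeps both definitions in one place (a sketch; `test_metrics` is not in the original script):

```r
# Test-set R^2 (relative to the training mean) and RMSE in one helper.
test_metrics <- function(predicted, observed, train_mean) {
  sse <- sum((observed - predicted)^2)
  sst <- sum((observed - train_mean)^2)
  c(R_square = 1 - sse / sst,
    RMSE = sqrt(sse / length(predicted)))
}

test_metrics(pls.new_predicted,    test_data$cr, mean(train_data$cr))
test_metrics(pls.common_predicted, test_data$cr, mean(train_data$cr))
```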
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - 
-pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") 
-pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = 
pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- 
seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, 
-3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
deleted file mode 100644
index ba1b312..0000000
--- a/ccn2019.rev3.Rmd
+++ /dev/null
@@ -1,100 +0,0 @@
-
-$P=\langle V,D,C,W \rangle$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-#! ===============================================
-#! load required packages
-
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-library(inspectdf)
-library(caTools)
-library(pROC)
-
-#! ===============================================
-#! load data set and set running window size
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-```
-
-
-```{r preprocessing}
-
-#! ===============================================
-#! A function to mark lures in a sequence
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures, stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-#! ===============================================
-#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al
-#! a and al are respectively accuracy and recent accuracy
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
-         ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size),
-         vl = slide_dbl(lag(stimulus), ~n_distinct(.x, na.rm=T), .partial=T, .size=window_size),
-         al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         sl = ifelse(is.na(sl), NA, sl),
-         tl = ifelse(is.na(tl), NA, tl),
-         ll = ifelse(is.na(ll), NA, ll),
-         al = ifelse(is.na(al), NA, al),
-         ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul)
-  ) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
-  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
-  # mutate(dp = map_dbl(local_stats,
-  #                     ~length(which(.x$stimulus_type=="target" & .x$correct)) -
-  #                       length(which(.x$stimulus_type!="target" & !.x$correct)))) %>%
-  # mutate(cr = map_dbl(local_stats,
-  #                     ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) +
-  #                          length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>%
-  #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition) %>%
-  unnest(local_stats) %>%
-  mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>%
-  mutate(stimulus = factor(stimulus)) %>%
-  mutate(stimulus_type = factor(stimulus_type))
-
-save(seqs, file=here("notebooks/data/nback_seqs.Rd"))
-
-#! ===============================================
-#! visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
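The sliding features above (`tl`, `ll`, `sl`, `ul`, `vl`, `al`) all follow one pattern: a trailing window of `window_size` trials summarised into a single number per trial (here over lagged vectors, so the current trial is excluded). A toy sketch of that pattern on a made-up vector, assuming the tsibble version used in this notebook, where `slide_dbl` takes `.size`/`.partial` (newer releases moved these helpers to the slider package):

```r
library(tsibble)  # slide_dbl(..., .size, .partial) as used above

x <- c("A", "B", "A", "A", "C", "B", "A")
# Trailing window of 4 trials; windows shorter than 4 are still evaluated.
# Per-trial "lumpiness": repeats of the modal item, as in the ul feature.
slide_dbl(x, ~max(table(.x)) - 1, .size = 4, .partial = TRUE)
# expected under those semantics: 0 0 1 2 1 1 1
```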
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-PLS methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE, R-squared, and RMSE on the held-out test set
+
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; use the training mean here
+
+SSE.new <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+R_square.new <- 1 - SSE.new/SST
+RMSE.new <- sqrt(SSE.new/length(pls.new_predicted))
+
+SSE.old <- sum((test_data$a - pls.old_predicted)^2)
+R_square.old <- 1 - SSE.old/SST
+RMSE.old <- sqrt(SSE.old/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=lm, se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
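The manual test-set metrics above can be cross-checked in one call with caret's built-in helper (already loaded in this script). Note that `postResample` reports R² as the squared correlation between predictions and observations, not the SST-based version computed above, so small differences are expected:

```r
# Quick cross-check of the manual metrics: returns RMSE, Rsquared, and MAE.
postResample(pred = pls.new_predicted, obs = test_data$a)
postResample(pred = pls.old_predicted, obs = test_data$a)
```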
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~., data=seqs.imputed), seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+)
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100, 100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3), lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
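Beyond overlaying the two ROC curves, pROC can test whether the two AUCs differ; a brief sketch, assuming the `roc` objects are stored rather than plotted anonymously (the variable names here are illustrative):

```r
# DeLong test for a difference between two correlated AUCs on the same test set
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT)
roc.test(roc.new, roc.common, method = "delong")
```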
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" (decision criterion) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct, labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE (kept for reference; see ccn2019-accuracy.R for the same pattern)
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)  # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; use the training mean here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1, shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R
new file mode 100644
index 0000000..f757596
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-auc.R
@@ -0,0 +1,83 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+library(ppls)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+# earlier candidate formulas, kept for reference
+#f <- correct ~ n + t + stimulus_type
+#f <- correct ~ n + tl + vl + sl + s + stimulus_type
+
+# predictors selected with stepAIC
+f <- correct ~ n + tl + t + l + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced # ROSE balancing below is disabled; down-sampling happens in trainControl
+#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+set.seed(10)
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10,
+  classProbs = T,
+  summaryFunction = twoClassSummary,
+  sampling = "down"
+)
+
+grid <- expand.grid(
+  alpha = seq(0, 1, length=10),
+  lambda = seq(0.0001, 0.1, length=10)
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "glmnet",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               tuneGrid = grid,
+               metric = "ROC",
+               trControl = ctrl)
+
+seqs.test.y_prob <- predict(model, seqs.test.x, type="prob")
+
+model
+plot(varImp(model, useModel = F))
+plot(model)
+max(model$results$ROC)
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 2000,
+    ci = T)
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, 
type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, 
type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = 
"Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, 
- method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - 
ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -library(corrplot) # needed for corrplot(); not loaded elsewhere in this notebook -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components; -## cv.modpls feeds cvtable(summary(cv.modpls)) below. -cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls <- cvtable(summary(cv.modpls)) - -res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (non-repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a., foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized.
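A quick numeric check (mine, not from the source) of the inverted cost shape the chunk below implements via `dnorm`: the cost bottoms out at zero deviation and climbs toward 1 as the deviation from the constraint grows.

```r
# Illustration only: same form as targets_cost()/lures_cost() in the chunk below.
cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(cost(c(-8, -4, 0, 4, 8)), 3)
#> 0.865 0.395 0.003 0.395 0.865   -- minimized at zero deviation
```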
-
-```{r, eval=F}
-targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
-skewness_cost <- function(x, choices) {
-  uniform_ratio <- length(x) / length(choices)
-  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
-  for (c in choices) {
-    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
-  }
-  #TODO convert to gaussian loss
-  max(deviation_from_uniform)
-}
-
-lumpiness_cost <- function(x, choices) {
-  #trials = len(seq)
-  #freqs = [float(seq.count(c)) for c in choices]
-  #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
-  #return ralph_skewed
-  NA
-}
-
-#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
-#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
-#plot(GA)
-```
-
-
-```{r}
-
-with_lures <- function(condition, stim, stim_type, history = NA) {
-  sapply(1:length(stim),
-         function(i) {
-           switch(as.character(condition[i]),
-                  "2-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
-                      'lure',
-                      as.character(stim_type[i])
-                    )},
-                  "3-back" = {
-                    ifelse(
-                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
-                      'lure',
-                      as.character(stim_type[i])
-                    )}
-           )
-
-         })
-}
-
-with_targets_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="target"])
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus_type[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    length(trials[trials=="lure"])
-  })
-}
-
-with_lumpiness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    max(table(trials)) - 1
-  })
-}
-
-with_lag <- function(stimulus, history) {
-  # find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-stri_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) %>%
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents one observation
-## for a single subject in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-
-# full-factorial fit on the four windowed sequence properties
-fit <- lm(correct ~ targets * lures * skewness * lumpiness, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("DBSCAN clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  data %>% mutate(constraint1=fitness1(history),
-                  constraint2=fitness2(history),
-                  constraint3=fitness3(history))
-  kmeans(NB)
-  ggplot(kmeans_clusters$accuracy)
-  ggplot(kmeans_clusters$rt)
-
-```{python}
-a=2
-```
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behavior. These biases, under specified structure, allow multiple cognitive strategies and produce heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
-  - Local vs. global properties: which one matters the most?
-  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automatic strategies
-  - Global: no lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
-x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \tfrac{T}{trials} \times w, sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= \frac{trials}{|V|} \\
-x_{sl} &= \max\left(1, \frac{w}{|V|}\right) \\
-x_{g} &= \frac{trials}{w} \\
-x_{vl} &= \min(|V|, w)
-\end{aligned}
-$$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-#library(plsRglm)
-#library(plsdof)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-
-```
-
-```{r preprocessing}
-
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
-         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
-         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
-         sl = ifelse(is.na(sl), 0, sl),
-         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
-         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
-         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
-  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition)
-
-#DEBUG View(seqs)
-inspectdf::inspect_cor(seqs)
-#inspect_cor(NB,show_plot = T)
-```
-
-```{r}
-NB2 <- seqs %>% unnest(local_stats) # flat per-window table used by the model chunks
-
-model1 <- NB2 %>%
-  glm(rt~t+n+a,data=., family = "gaussian")
-aug1 <- augment(model1)
-aug1 %>%
-  ggplot(aes(a,rt)) +
-  geom_point() +
-  geom_smooth(aes(y=.fitted), color='red')
-```
-
-```{r}
-model2 <- NB2 %>%
-  glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian")
-aug2 <- augment(model2)
-aug2 %>%
-  ggplot(aes(jitter(al),rt)) +
-  geom_point(alpha=0.2,shape=18) +
-  xlab("local accuracy (al)") +
-  geom_smooth(aes(y=.fitted), color='blue') +
-  geom_smooth(aes(y=aug1$.fitted), color='red')
-
-```
-
-```{r models}
-
-nb_split <- initial_split(NB2, prop = 0.75)
-training_data <- training(nb_split)
-testing_data <- testing(nb_split)
-cv_split <- vfold_cv(training_data, v = 5)
-cv_data <- cv_split %>%
-  mutate(
-    train = map(splits, ~training(.x)),
-    validate = map(splits, ~testing(.x))
-  )
-
-cv_models_lm_a <- cv_data %>%
-  mutate(model = map(train, ~lm(formula = a~., data = .x)),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_glm_a <- cv_data %>%
-  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_pls_a <- cv_data %>%
-  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
-         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val["adjCV", , ]) - 1)) %>%
-  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)))
-
-head(cv_models_pls_a)
-
-
-cv_models_pls_a1 <- cv_data[3][[1]]
-
-
-NBx <- NB %>%
-  group_by(participant) %>%
-  summarise(freq = list(as.data.frame(table(stimulus)))) %>%
-  unnest(freq)
-
-ggplot(NBx, aes(stimulus, Freq, group=participant)) +
-  geom_point()
-
-
-#%>%
-# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
-  #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
-  #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-#any(is.na(NB2))
-#NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
-#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
-
-```
diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
deleted file mode 100644
index ba1b312..0000000
--- a/ccn2019.rev3.Rmd
+++ /dev/null
@@ -1,100 +0,0 @@
-
-$P=\langle V,D,C,W \rangle$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-#! ===============================================
-#! load required packages
-
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-library(inspectdf)
-library(caTools)
-library(pROC)
-
-#! ===============================================
-#! load data set and set running window size
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-```
-
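-
-The preprocessing chunk below builds every local statistic with
-`tsibble::slide_dbl()`/`slide2_dbl()`. As a toy illustration of the windowing
-assumed here (trailing windows of `.size` trials, with `.partial = T`
-emitting shorter windows at the start of each block; toy vector, not the
-CL2015 data):
-
-```{r slide-demo, eval=F}
-slide_dbl(c(1, 0, 1, 1, 0), ~sum(.x), .size = 3, .partial = T)
-```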
-
-```{r preprocessing}
-
-#! ===============================================
-#! A function to mark lures in a sequence
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-#! ===============================================
-#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al
-#! a and al are respectively global and local (recent) accuracy
-seqs <- NB %>%
-  group_by(participant, block, condition) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size),
-         ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
-         ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size),
-         vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size),
-         al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
-         ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul)
-) %>%
-  nest(.key='local_stats') %>%
-  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
-  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
-  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
-  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
-  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
-  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
-  # mutate(dp = map_dbl(local_stats,
-  #                     ~length(which(.x$stimulus_type=="target" & .x$correct)) -
-  #                       length(which(.x$stimulus_type!="target" & !.x$correct)))) %>%
-  # mutate(cr = map_dbl(local_stats,
-  #                     ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) +
-  #                          length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>%
-  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>%
-  #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>%
-  ungroup() %>%
-  select(-participant,-block,-condition) %>%
-  unnest(local_stats) %>%
-  mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>%
-  mutate(stimulus = factor(stimulus)) %>%
-  mutate(stimulus_type = factor(stimulus_type))
-
-save(seqs,file=here("notebooks/data/nback_seqs.Rd"))
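-
-# Quick sanity check of the lure rule on a toy vector (hypothetical stimuli,
-# not the CL2015 data): with n = 2, a trial is marked "lure" when it matches
-# the item at lag n-1 or n+1, so the fifth trial below ("B", matching trial 2
-# at lag 3) should come back as "lure".
-#DEBUG with_lures(c("A","B","A","C","B","C"), rep("non-target", 6), n = 2)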
-
-#! ===============================================
-#! visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: an extra step for non-PLS methods to drop highly correlated predictors
-cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs")
-cor_high <- findCorrelation(cor_matrix, cutoff = 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove))
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares; use the training mean here
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R
deleted file mode 100644
index 5bcefba..0000000
--- a/ccn2019-criterion.R
+++ /dev/null
@@ -1,119 +0,0 @@
-#==================================================#
-# model the "criterion" column (cr)
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(skimr)
-library(ROSE)
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-#DEBUG inspect_num(seqs.imputed)
-
-#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-train_data.imbalanced <- ROSE(correct ~ .,
-                              data = train_data,
-                              seed = 1)$data
-
-# Visualize split
-train_data.imbalanced$grp <- "train"
-test_data$grp <- "test"
-
-rbind(train_data.imbalanced, test_data) %>%
-  ggplot(aes(x=correct, fill=grp)) +
-  geom_histogram(stat="count", position='dodge') +
-  labs(title="Imbalanced Split")
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  verboseIter = T,
-  savePredictions = T
-)
-
-train_data <- train_data.imbalanced %>% select(-grp)
-
-pls.new_model <- train(
-  cr ~ .-a-al-dp-rt-correct,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-summary(pls.new_model)
-
-ggplot(varImp(pls.new_model)) +
-  labs(title="Criterion - Variable Importance")
-
-pls.common_model <- train(
-  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
-  data = train_data,
-  method = "pls",
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-summary(pls.common_model)
-plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
-
-trellis.par.set(caretTheme())
-densityplot(pls.new_model, pch = "|")
-densityplot(pls.common_model, pch = "|")
-
-pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
-summary(pls.models)
-dotplot(pls.models, metric = "Rsquared")
-diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = 
"cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - 
ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find the last occurrence of the stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
-## 5. Sort and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -library(corrplot) # needed for corrplot() below -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a. foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized. 
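A minimal sketch of that cost shape (assuming the same dnorm-based form and sd = 4 used in the next chunk): the cost is near zero when the sequence meets the constraint exactly and rises smoothly with the size of the deviation.

```{r, eval=F}
# Gaussian-shaped cost: ~0 at zero deviation, approaching 1 as |x| grows
cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(c(cost(0), cost(2), cost(5)), 3)  # 0.003 0.120 0.543
```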
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These biases, under the specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
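As a toy illustration of such a property (a made-up sequence, not taken from the data), the global target density of a 2-back letter stream already caps what a purely reactive responder could score:

```{r, eval=F}
# Toy 2-back sequence: trial i is a target when it matches the letter 2 back
s <- c("A","B","A","C","A","C","C")
is_target <- c(FALSE, FALSE, s[3:7] == s[1:5])
mean(is_target)  # 3 of 7 trials are targets (~0.43)
```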
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -# drop the flagged predictors by column name -seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove)) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuracy" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" (criterion) column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize the train/test split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) + # seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 
index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- 
predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), 
!is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- 
createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - 
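# A minimal evaluation sketch (not in the original script): assumes the
# `pls.new_model` and `test_data` objects defined above. caret::postResample
# reports held-out RMSE, R^2, and MAE for the predicted reaction times.
pls.new_test_predicted <- predict(pls.new_model, test_data)
postResample(pred = pls.new_test_predicted, obs = test_data$rt)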
-plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - 
print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with 
broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
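For intuition, these costs are inverted Gaussians rather than literal quadratics: each is roughly 0 when the observed count matches its target and saturates toward 1 as the deviation grows. A minimal sketch, assuming the `sd = 4` used in the chunk below:

```r
targets_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(targets_cost(c(0, 2, 5, 10)), 3)  # ~ 0.003 0.120 0.543 0.956
```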
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-#FIXME remove by column name
-seqs.uncorr <- seqs %>% select(-high_cor_remove)
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE.new <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2)     # total sum of squares; the baseline is the training-set mean
+R_square.new <- 1 - SSE.new/SST
+RMSE.new <- sqrt(SSE.new/length(pls.new_predicted))
+
+
+SSE.old <- sum((test_data$a - pls.old_predicted)^2)
+R_square.old <- 1 - SSE.old/SST
+RMSE.old <- sqrt(SSE.old/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method="lm",se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
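# A hedged addition (not part of the original file): residuals vs. fitted for
# the new model, to spot systematic bias that a single RMSE number can hide.
# Assumes `pls.new_predicted` and `test_data` from the script above.
data.frame(predicted = pls.new_predicted,
           residual  = test_data$a - pls.new_predicted) %>%
  ggplot(aes(predicted, residual)) +
  geom_point(alpha = 0.1, shape = 16) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ggtitle("Accuracy: Residuals vs Predicted")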
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+    )
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100,100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
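# A hedged addition (not in the original script): numeric AUCs with bootstrap
# CIs, plus DeLong's paired test, instead of reading values off the overlaid
# plot. Assumes the prediction objects defined above.
roc_new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT)
roc_common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT)
ci.auc(roc_new, method = "bootstrap", boot.n = 100)
ci.auc(roc_common, method = "bootstrap", boot.n = 100)
roc.test(roc_new, roc_common)  # both curves come from the same test set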
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" (criterion) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares; baseline is the training-set mean
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method="glm") +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
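# A hedged addition (not in the original script): held-out error for both
# criterion models, mirroring the commented-out SSE/RMSE block above.
# caret::postResample returns RMSE, R^2, and MAE.
postResample(pred = pls.new_predicted, obs = test_data$cr)
postResample(pred = pls.common_predicted, obs = test_data$cr)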
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+#f <- rt ~ n + t + v
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
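# A hedged addition (not in the original script): an intercept-only baseline
# that predicts the training-mean RT for every test trial, to put the held-out
# PLS RMSE above in context. Assumes the train/test objects defined above.
sqrt(mean((seqs.test.observed_y - mean(seqs.train.y, na.rm = TRUE))^2, na.rm = TRUE))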
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.new_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method="cv",
+                     number=10,
+                     classProbs = T,  # needed for predict(..., type = "prob") below
+#                    repeats = 1, #repeatedcv
+                     sampling = "up",
+                     savePredictions = T,
+                     verbose = T)
+
+tune <- expand.grid(C = seq(0,5,by=0.25))
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
+
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd
new file mode 100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties, which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: No lures, large vocabulary -> pattern repeats implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\\
+
+x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+
+\\\\
+
+x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+
+\\\\
+
+x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+
+\\\\
+
+x_{l} = L \times trials
+\\\\
+
+x_{ll} = L \times w
+\\\\
+
+x_{v} = |V|
+\\
+
+x_{ul} = w
+\\\\
+
+x_{s} = {trials \over |V|}
+\\\\
+
+x_{sl} = max(1, {w \over |V|})
+\\\\
+
+x_{g} = {trials \over w}
+
+\\\\
+
+x_{vl} = min(|V|, w)
+$$
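+
+To make the weighting concrete, here is a small numeric illustration of the
+target-count weight $W_t$ (an editor's sketch; the target ratio `T = 1/3` and
+`trials = 24` are made-up values, not parameters taken from the experiment):
+
+```{r wt-example, eval=FALSE}
+T_ratio <- 1/3; trials <- 24          # goal: T_ratio * trials = 8 targets
+W_t <- function(x_t) 1 - abs(10 * dnorm(T_ratio * trials - x_t, sd = 4))
+W_t(8)  # ~0.003: the cost is near its minimum when the count hits the goal
+W_t(2)  # ~0.68:  and it grows toward 1 as the count deviates from the goal
+```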
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
+# duh! we are using a list nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
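+# An editor's sketch of the list-column workflow the TODO above describes
+# (hypothetical column names `fit`/`fit_stats`; requires the broom package;
+# kept commented out so the chunk's behavior is unchanged):
+# NB %>%
+#   group_by(participant, condition, block) %>%
+#   nest() %>%
+#   mutate(fit       = map(data, ~ lm(rt ~ stimulus_type, data = .x)),
+#          fit_stats = map(fit, broom::glance)) %>%
+#   unnest(fit_stats)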
+NB2 <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>% unnest(data) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(history = with_history(stimulus, window_size)) %>%
+  mutate(x_sl = with_skewness(history)) %>%
+  mutate(x_ul = with_lumpiness(history)) %>%
+  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
+  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
+  ungroup()
+
+pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude)
+NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# caret
+library(caret)
+# Compile cross-validation settings
+
+
+any(is.na(NB2))
+NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
+plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
+
+
+plsResult
+```
+
+
+
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ Skewness score
+- $U$ Uniformity (!repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a., foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down quadratic function to be minimized.
+
+```{r, eval=F}
+targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
+lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
+skewness_cost <- function(x, choices) {
+  uniform_ratio <- length(x) / length(choices)
+  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
+  for (c in choices) {
+    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
+  }
+  #TODO convert to gaussian loss
+  max(deviation_from_uniform)
+}
+
+lumpiness_cost <- function(x, choices) {
+  #trials = len(seq)
+  #freqs = [float(seq.count(c)) for c in choices]
+  #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3)
+  #return ralph_skewed
+  NA
+}
+
+#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
+#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
+#plot(GA)
+```
+
+
+```{r}
+
+with_lures <- function(condition, stim, stim_type, history = NA) {
+  sapply(1:length(stim),
+         function(i) {
+           switch(as.character(condition[i]),
+                  "2-back" = {
+                    ifelse(
+                      stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
+                      'lure',
+                      as.character(stim_type[i])
+                    )},
+                  "3-back" = {
+                    ifelse(
+                      stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
+                      'lure',
+                      as.character(stim_type[i])
+                    )}
+           )
+         })
+}
+
+with_targets_ratio <- function(stimulus_type, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus_type[(i-stri_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    length(trials[trials=="target"])
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus_type[(i-str_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    length(trials[trials=="lure"])
+  })
+}
+
+with_lumpiness_score <- function(stimulus, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus[(i-str_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    max(table(trials)) - 1
+  })
+}
+
+with_lag <- function(stimulus, history) {
+  # find the last occurrence of the stimulus
+}
+
+with_skewness_score <- function(stimulus, history) {
+  sapply(1:length(history), function(i) {
+    trials <- stimulus[(i-str_length(history[i])):i]
+    trials <- unlist(trials, use.names=FALSE)
+    sum(sort(table(trials), decreasing = T)[1:2]) - 1
+  })
+}
+
+with_history <- function(stims, size=16) {
+  res <- c('')
+  for (i in 2:length(stims)) {
+    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
+  }
+  #res <- ifelse(stri_length(res)==size, res, NA)
+  res
+}
+
+normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
+  #TODO
+  sapply(1:length(targets_ratio), function(i) 0)
+}
+
+window_size <- 8
+
+NB_modified <- NB %>%
+  group_by(participant, condition, block) %>%
+  mutate(history = with_history(stimulus, size=window_size)) %>%
+  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
+  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
+  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
+  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
+  mutate(skewness = with_skewness_score(stimulus, history)) %>%
+  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
+  filter(stri_length(history)==window_size) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
+  ungroup()
+
+pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
+
+NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+
+## participant-level averaged NB; a single row represents an observation for a single subject
+## in a single condition
+NB_avg <- NB_modified %>%
+  group_by(participant, condition) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  summarise(
+    targets=sum(targets),
+    lures=sum(lures),
+    skewness=sum(skewness),
+    lumpiness=sum(lumpiness),
+    rt = mean(rt, na.rm=T),
+    correct=sum(correct,na.rm=T)/90) %>%
+  ungroup()
+
+# print
+# NB_modified %>%
+#   filter(participant=='P1') %>%
+#   View()
+#
+
+
+fit <- lm(correct ~ t * s * u * l * d, NB_modified)
+
+```
+
+
+```{r}
+
+# DBSCAN Clustering (RT+ACCURACY against skewness)
+NB_avg <- NB_avg %>%
+  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
+
+NB_avg %>%
+  ggplot(aes(skewness, correct, color=factor(cluster))) +
+  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+```
+
+
+```{r}
+## single-subject figures
+NB_modified %>%
+  ggplot(aes(t,s,color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  inspect_cor(show_plot = T)
+
+NB_avg %>%
+  inspect_cor(show_plot = T)
+
+NB_modified %>%
+  ggplot(aes(rt,correct,color=u)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(t),color=correct)) +
+  geom_jitter() +
+  geom_point(alpha=0.1)
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(u),color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  facet_wrap(~condition)
+
+# rt/accuracy and lures
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
+  geom_jitter() +
+  geom_point(shape=16) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,pc2,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,rt,color=correct)) +
+  geom_point(alpha=0.3) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,correct,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+```
+
+## TODO
+  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
+    constraint3=fitness(history))
+  - kmeans(NB)
+  - ggplot(kmeans_clusters$accuracy)
+  - ggplot(kmeans_clusters$rt)
+  -
+
+
+
+
+```{python}
+a=2
+p
+```
+
+
+
+---
+title: "PLS Training"
+output: html_notebook
+---
+
+PLS:
+
+
+```{r}
+#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
+library(tidyverse)
+library(pls)
+
+## 1. load sample data
+#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
+
+rm(NB)
+load("./data/CL2015.RData")
+data <- NB
+str(data)
+
+## 2. clean data (drop non-predictor columns)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with lowest cv error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # corrplot is used below but not loaded elsewhere in this notebook
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components
+#cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8)
+
+modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
+#res.cv.modpls <- cvtable(summary(cv.modpls))  # depends on the commented-out cv.plsRglm() call above
+
+res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), 
main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = 
seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - 
pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-#FIXME remove by column name
-seqs.uncorr <- seqs %>% select(-high_cor_remove)
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2)  # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.new_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties, which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: No lures, large vocabulary -> pattern repeats implies a target
+
+
+## Formulating Generating the N-Back Sequences as a CSP instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+x_n = N,\quad W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+\\
+x_t = T \times trials,\quad W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+\\
+x_{tl} = {T \times w \over trials},\quad W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+\\
+x_{l} = L \times trials
+\\
+x_{ll} = L \times w
+\\
+x_{v} = |V|
+\\
+x_{ul} = w
+\\
+x_{s} = {trials \over |V|}
+\\
+x_{sl} = \max(1, {w \over |V|})
+\\
+x_{g} = {trials \over w}
+\\
+x_{vl} = \min(|V|, w)
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to list column workflow with broom for model fitting and evaluating the fits
+# duh! we are using list nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
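+# For instance, a minimal list-column sketch of that workflow (illustrative
+# only; assumes a per-block lm of rt on correct, not used elsewhere):
+# NB %>%
+#   group_by(participant, condition, block) %>%
+#   nest() %>%
+#   mutate(fit    = purrr::map(data, ~lm(rt ~ correct, data = .x)),
+#          tidied = purrr::map(fit, broom::tidy)) %>%
+#   tidyr::unnest(tidied)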
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
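+
+For a concrete sense of the shape (an inverted Gaussian bump, roughly quadratic near its minimum), the sketch below evaluates the target-count cost from the next chunk over a range of assumed deviations; `deviations` is illustrative input, not experiment data:
+
+```{r, eval=F}
+# Illustration only: the cost is smallest at zero deviation and approaches 1
+# as the sequence drifts away from the desired number of targets.
+targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
+deviations <- seq(-10, 10, by=1)
+plot(deviations, targets_cost(deviations), type="b",
+     xlab="deviation from desired count", ylab="cost")
+```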
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
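+
+The GA hookup in the next chunk is left commented out; below is a rough sketch of how it could be wired up (assumptions: two decision variables standing for target and lure deviations, and the `GA` package loaded in the setup chunk above; note that `ga()` maximizes fitness, so the summed cost is negated):
+
+```{r, eval=F}
+# Sketch only: minimize the merged deviation cost by maximizing its negative.
+cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
+merged_cost <- function(x) cost(x[1]) + cost(x[2])
+GA <- ga(type = "real-valued", fitness = function(x) -merged_cost(x),
+         lower = c(-10, -10), upper = c(10, 10))
+summary(GA)
+plot(GA)
+```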
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
+  ungroup()
+
+pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
+
+NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+
+## participant-level averaged NB, a single row represent an observation for a single subject
+## in a single condition
+NB_avg <- NB_modified %>%
+  group_by(participant, condition) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  summarise(
+    targets=sum(targets),
+    lures=sum(lures),
+    skewness=sum(skewness),
+    lumpiness=sum(lumpiness),
+    rt = mean(rt, na.rm=T),
+    correct=sum(correct,na.rm=T)/90) %>%
+  ungroup()
+
+# print
+# NB_modified %>%
+#   filter(participant=='P1') %>%
+#   View()
+#
+
+
+fit <- lm(correct ~ t * s * u * l * d, NB_modified)
+
+```
+
+
+```{r}
+
+# DBSCAN Clustering (RT+ACCURACY against skewness)
+NB_avg <- NB_avg %>%
+  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
+
+NB_avg %>%
+  ggplot(aes(skewness, correct, color=factor(cluster))) +
+  ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+```
+
+
+```{r}
+## single-subject figures
+NB_modified %>%
+  ggplot(aes(t,s,color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  inspect_cor(show_plot = T)
+
+NB_avg %>%
+  inspect_cor(show_plot = T)
+
+NB_modified %>%
+  ggplot(aes(rt,correct,color=u)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun.y="mean")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(t),color=correct)) +
+  geom_jitter() +
+  geom_point(alpha=0.1)
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(s),jitter(u),color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  facet_wrap(~condition)
+
+# rt/accuracy and lures
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
+  geom_jitter() +
+  geom_point(shape=16) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,pc2,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,rt,color=correct)) +
+  geom_point(alpha=0.3) +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1,correct,color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm",se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+```
+
+## TODO
+  - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history),
+  - constraint3=fitness(history))
+  - kmeans(NB)
+  - ggplot(kmeans_clusters$accuracy)
+  - ggplot(kmeans_clusters$rt)
+  -
+
+
+
+
+```{python}
+a=2
+p
+```
summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = 
"|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical 
properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)
-  - local vs. global properties, which one matters the most?
-  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
-  - Global: No lures, large vocabulary -> a pattern repeat implies a target
-
-
-## Formulating N-Back Sequence Generation as a CSP Instance
-
-$P=\langle V,D,C,W\rangle$
-
-$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
-
-$D=\{\}$
-
-
-Constraints:
-
-$$
-\begin{aligned}
-x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n-N,sd=4)| \\
-x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials-x_t,sd=4)| \\
-x_{tl} &= {T \times w \over trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| \\
-x_{l} &= L \times trials \\
-x_{ll} &= L \times w \\
-x_{v} &= |V| \\
-x_{ul} &= w \\
-x_{s} &= {trials \over |V|} \\
-x_{sl} &= max(1, {w \over |V|}) \\
-x_{g} &= {trials \over w} \\
-x_{vl} &= min(|V|, w)
-\end{aligned}
-$$
-
-```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(plsRglm)
-library(plsdof)
-library(caret)
-```
-
-```{r params}
-load('./data/CL2015.RData')
-
-window_size <- 8
-```
-
-
-```{r history}
-
-with_lures <- function(stimulus, stimulus_type, n) {
-  sapply(1:length(stimulus), function(i) {
-    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
-    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
-    ifelse(are_valid_trials && stimulus[i] %in% lures,
-           "lure",
-           as.character(stimulus_type[i]))
-  })
-}
-
-with_history <- function(stimuli, length=16, fixed=F) {
-  seq <- paste(stimuli, collapse = '')
-
-  sapply(1:length(stimuli), function(i) {
-    stri_reverse(str_sub(seq, max(1,i-length+1), i))
-  })
-  #ifelse(fixed, h[str_length(h)==size], h)
-}
-
-# $x_{s,local}$
-with_skewness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    sum(sort(freqs, decreasing = T)[1:2]) - 1
-  })
-}
-
-# $x_{u,local}$
-with_lumpiness <- function(history) {
-  sapply(history, function(h) {
-    freqs <- table(unlist(str_split(h,"")))
-    max(freqs) - 1
-  })
-}
-
-
-with_targets_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="target"]) / length(trials)
-  })
-}
-
-with_lures_ratio <- function(stimulus_type, length=16) {
-  sapply(1:length(stimulus_type), function(i) {
-    trials <- stimulus_type[max(1,i-length):i]
-    length(trials[trials=="lure"]) / length(trials)
-  })
-}
-
-#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
-# duh! we are using lists nested inside a tibble, so put all new columns in a new list column
-# instead of adding a new column for each.
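-
-# A rough sketch of that list-column workflow (kept commented out as a note;
-# the grouping and the lm formula are illustrative placeholders, not the
-# models fit below):
-#
-# NB %>%
-#   group_by(participant, condition, block) %>%
-#   nest() %>%
-#   mutate(fit     = map(data, ~lm(rt ~ trial, data = .x)),
-#          glanced = map(fit, broom::glance)) %>%
-#   unnest(glanced)
-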
-NB2 <- NB %>%
-  group_by(participant, condition, block) %>%
-  nest() %>% unnest(data) %>%
-  mutate(n = ifelse(condition=='2-back',2,3)) %>%
-  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
-  mutate(history = with_history(stimulus, window_size)) %>%
-  mutate(x_sl = with_skewness(history)) %>%
-  mutate(x_ul = with_lumpiness(history)) %>%
-  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
-  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
-  ungroup()
-
-pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
-NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# caret
-library(caret)
-# Compile cross-validation settings
-
-
-any(is.na(NB2))
-NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-# fit separate PLS models for reaction time and for accuracy
-plsResult.rt <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], 3)
-plsResult.correct <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], 3)
-
-plsResult.rt
-plsResult.correct
-```
-
-
-
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (i.e., non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a., foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
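-
-In the chunk below, each cost has the inverted-Gaussian shape already used for the weights
-$W$ in the CSP formulation above. Written out once as a sketch, with $x^*$ standing for the
-assumed target value of a statistic $x$:
-
-$$
-W(x) = 1 - |10 \times dnorm(x - x^*, sd=4)|
-$$
-
-$W$ is smallest when $x = x^*$ and approaches $1$ as $x$ deviates from the target.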
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (drop columns that are not modeled)
-data <- data %>%
-  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
-  select(-condition,
-         -stimulus,
-         -block,
-         -trial)
-# %>%
-# rename(
-#   ev.participant=participant,
-#   ev.n=n,
-#   ev.block=block,
-#   ev.stimulus_type=stimulus_type,
-#   rv.choice=choice,
-#   rv.rt=rt,
-#   rv.correct=correct
-# )
-
-## 3. use cross-validation to find the optimal number of dimensions
-pls.model = plsr(rt ~ ., data = data, validation = "CV")
-
-## 3.1. find the model with lowest CV error
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-
-## 4. rebuild the model
-pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
-
-## 5. sort and visualize the top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot) # needed for corrplot() below
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (i.e., non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a., foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
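-
-The chunk below keeps `merged_cost` and the `ga()` call commented out. A minimal sketch of
-how the pieces could fit together (illustrative target values only; assumes the `GA` package
-loaded in the setup chunk):
-
-```{r ga-sketch, eval=F}
-# Inverted-Gaussian cost centered on an assumed target value x_star.
-constraint_cost <- function(x, x_star, sd = 4) 1.0 - 10 * dnorm(x - x_star, sd = sd)
-
-# Hypothetical targets: 4 targets and 2 lures per sequence.
-merged_cost <- function(x) constraint_cost(x[1], 4) + constraint_cost(x[2], 2)
-
-# ga() maximizes fitness, so negate the cost to minimize it.
-GA <- ga(type = "real-valued",
-         fitness = function(x) -merged_cost(x),
-         lower = c(0, 0), upper = c(16, 16))
-summary(GA)
-```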
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(paste0("Clusters (window = ", window_size, " trials)"), "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
- - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-     constraint3=fitness3(history))
- - kmeans(NB)
- - ggplot(kmeans_clusters$accuracy)
- - ggplot(kmeans_clusters$rt)
-
-
-```{python}
-a=2
-```
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red'))
-```
-
-```{r}
-model2 <- NB2 %>%
-  select(-participant, -stimulus) %>%
-  glm(rt~t+n+a+al+s+sl+ll+l, data=., family = "gaussian")
-aug2 <- augment(model2)
-aug2 %>%
-  ggplot(aes(jitter(al),rt)) +
-  geom_point(alpha=0.2,shape=18) +
-  xlab("accuracy") +
-  geom_smooth(aes(y=.fitted), color='blue') +
-  geom_smooth(aes(y=aug1$.fitted), color='red')
-
-```
-
-```{r models}
-
-nb_split <- initial_split(NB2, prop = 0.75)
-training_data <- training(nb_split)
-testing_data <- testing(nb_split)
-cv_split <- vfold_cv(training_data, v = 5)
-cv_data <- cv_split %>%
-  mutate(
-    train = map(splits, ~training(.x)),
-    validate = map(splits, ~testing(.x))
-  )
-
-cv_models_lm_a <- cv_data %>%
-  mutate(model = map(train, ~lm(formula = a~., data = .x)),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_glm_a <- cv_data %>%
-  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
-         tidied = map(model, tidy),
-         glanced = map(model, glance),
-         augment = map(model, augment))
-
-cv_models_pls_a <- cv_data %>%
-  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
-         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>%
-  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)))
-
-head(cv_models_pls_a)
-
-
-cv_models_pls_a1 <- cv_data[3][[1]]
-
-
-# per-participant stimulus frequencies
-NBx <- NB %>%
-  group_by(participant, stimulus) %>%
-  summarise(freq = n())
-
-ggplot(NBx, aes(stimulus, freq, group=participant)) +
-  geom_point()
-
-
-#%>%
-# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
-  #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
-  #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-# Compile cross-validation settings
-
-#any(is.na(NB2))
-#NB2 <- na.omit(NB2)
-
-# set.seed(100)
-# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
-#
-# # PLS
-# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
-#               method = "pls",
-#               metric = "Accuracy",
-#               tuneLength = 20,
-#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
-#               preProc = c("zv","center","scale"))
-#
-# # Check CV
-# plot(mod1)
-
-
-#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
-#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
-
-```
diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
deleted file mode 100644
index ba1b312..0000000
--- a/ccn2019.rev3.Rmd
+++ /dev/null
@@ -1,100 +0,0 @@
-
-$P=\langle V,D,C,W \rangle$
-
-```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
-#! ===============================================
-#! load required packages
-
-library(ggplot2)
-library(tidyverse)
-library(stringi)
-library(pls)
-library(caret)
-library(here)
-library(tsibble)
-library(broom)
-library(rsample)
-library(inspectdf)
-library(caTools)
-library(pROC)
-
-#! ===============================================
-#! load data set and set running window size
-load(here('notebooks/data/CL2015.RData'))
-window_size <- 8
-
-```
-
-
-```{r preprocessing}
-
-#! ===============================================
-#!
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R
new file mode 100644
index 0000000..f757596
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-auc.R
@@ -0,0 +1,83 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+library(ppls)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+# candidate formulas; the last assignment wins
+f <- correct ~ n + t + stimulus_type
+f <- correct ~ n + tl + vl + sl + s + stimulus_type
+
+# predictors selected with stepAIC
+f <- correct ~ n + tl + t + l + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+set.seed(10)
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10,
+  classProbs = T,
+  summaryFunction = twoClassSummary,
+  sampling = "down"
+)
+
+grid <- expand.grid(
+  alpha = seq(0,1,length=10),
+  lambda = seq(0.0001, 0.1, length=10)
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "glmnet",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               tuneGrid = grid,
+               metric = "ROC",
+               trControl = ctrl)
+
+seqs.test.y_prob <- predict(model, seqs.test.x, type="prob")
+
+model
+plot(varImp(model, useModel = F))
+plot(model)
+max(model$results$ROC)  # best cross-validated AUC across the tune grid
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 2000,
+    ci = T)
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
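+
+# Held-out AUC as a plain number (sketch; assumes "YES" is the event level
+# of `correct`, matching the roc() call above):
+auc(seqs.test.observed_y, seqs.test.y_prob$YES)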
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+# candidate formulas; the last assignment wins
+f <- rt ~ n + t + v
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)  # best (lowest) cross-validated RMSE across the tune grid
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
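+
+# Baseline check (sketch): RMSE of always predicting the training-set mean;
+# the tuned PLS model should comfortably beat this.
+RMSE(rep(mean(seqs.train.y), length(seqs.test.observed_y)), seqs.test.observed_y)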
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "rt" column (reaction time)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize the split: the train set is rebalanced by ROSE, the test set keeps the raw class ratio
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Class Balance of the Train/Test Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2)   # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares; remember to use the training mean here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method="cv",
+                     number=10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T,   # required for predict(type="prob") below
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by=0.25))  # C must be strictly positive for svmLinear
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
+
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd
new file mode 100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified sequence structure, allow multiple cognitive strategies and produce heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
+x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= \frac{trials}{|V|} \\
+x_{sl} &= \max\!\left(1, \frac{w}{|V|}\right) \\
+x_{g} &= \frac{trials}{w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to a list-column workflow with broom for model fitting and evaluating the fits
+# note: we are using lists nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
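+# Toy sanity check for the helpers above (hypothetical letter stream, not the CL2015 data):
+#   demo <- c("A","B","A","A","C","B","A","B")
+#   with_skewness(with_history(demo, 4))   # sum of the two most frequent symbols per window, minus 1
+#   with_lumpiness(with_history(demo, 4))  # frequency of the modal symbol per window, minus 1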
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
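+
+For intuition, here is a toy sketch (not part of the analysis pipeline) of the shared cost shape used below, $1 - 10 \times dnorm(x, 0, 4)$: a bell curve flipped so that satisfying the constraint sits at the minimum.
+
+```{r cost-shape, eval=F}
+curve(1 - 10 * dnorm(x, mean = 0, sd = 4), from = -10, to = 10,
+      xlab = "deviation from the constrained value", ylab = "cost")
+```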
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
+  ungroup()
+
+pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
+
+NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+
+## participant-level averaged NB; a single row represents an observation for a single subject
+## in a single condition
+NB_avg <- NB_modified %>%
+  group_by(participant, condition) %>%
+  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
+  summarise(
+    targets=sum(targets),
+    lures=sum(lures),
+    skewness=sum(skewness),
+    lumpiness=sum(lumpiness),
+    rt = mean(rt, na.rm=T),
+    correct=sum(correct,na.rm=T)/90) %>%
+  ungroup()
+
+# print
+# NB_modified %>%
+#   filter(participant=='P1') %>%
+#   View()
+#
+
+#TODO dprime (d) is not computed per window yet; add it before extending this model
+fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
+
+```
+
+
+```{r}
+
+# DBSCAN Clustering (RT+ACCURACY against skewness)
+NB_avg <- NB_avg %>%
+  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
+
+NB_avg %>%
+  ggplot(aes(skewness, correct, color=factor(cluster))) +
+  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+```
+
+
+```{r}
+## single-subject figures
+NB_modified %>%
+  ggplot(aes(targets, skewness, color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun="mean")
+
+NB_modified %>%
+  inspect_cor(show_plot = T)
+
+NB_avg %>%
+  inspect_cor(show_plot = T)
+
+NB_modified %>%
+  ggplot(aes(rt, correct, color=lumpiness)) +
+  geom_jitter() +
+  geom_point() +
+  stat_summary(fun="mean")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
+  geom_jitter() +
+  geom_point(alpha=0.1)
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
+  geom_jitter() +
+  geom_point() +
+  facet_wrap(~condition)
+
+# rt/accuracy and lures
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
+  geom_jitter() +
+  geom_point(shape=16) +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1, pc2, color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1, rt, color=correct)) +
+  geom_point(alpha=0.3) +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+NB_modified %>%
+  filter(!is.na(correct)) %>%
+  ggplot(aes(pc1, correct, color=correct)) +
+  geom_point() +
+  geom_smooth(method="lm", se = F) +
+  facet_wrap(~condition, scales="free")
+
+
+```
+
+## TODO
+  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
+    constraint3=fitness(history))
+  - kmeans(NB)
+  - ggplot(kmeans_clusters$accuracy)
+  - ggplot(kmeans_clusters$rt)
+
+
+
+
+---
+title: "PLS Training"
+output: html_notebook
+---
+
+PLS:
+
+
+```{r}
+#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
+library(tidyverse)
+library(pls)
+
+## 1. load sample data
+#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
+
+rm(NB)
+load("./data/CL2015.RData")
+data <- NB
+str(data)
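+
+# NB holds one row per trial; columns include participant, condition, block,
+# trial, stimulus, stimulus_type, choice, rt, and correct (see the select()
+# and commented rename() calls below).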
+
+## 2. clean data (drop design columns that are not used as predictors)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with the lowest CV error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # provides corrplot(), used below
+
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (i.e., lack of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a. foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down Gaussian cost function to be minimized.
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
+x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= \frac{trials}{|V|} \\
+x_{sl} &= \max\!\left(1, \frac{w}{|V|}\right) \\
+x_{g} &= \frac{trials}{w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+#library(plsRglm)
+#library(plsdof)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+
+```
+
+```{r preprocessing}
+
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+seqs <- NB %>%
+  group_by(participant, block, condition) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
+         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
+         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
+         sl = ifelse(is.na(sl), 0, sl),
+         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
+         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
+         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
+  nest(.key='local_stats') %>%
+  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
+  mutate(a = map(local_stats, ~length(which(.x$correct)))) %>%
+  mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
+  mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
+  mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
+  mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>%
+  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
+  ungroup() %>%
+  select(-participant,-block,-condition)
+
+#DEBUG View(seqs)
+inspectdf::inspect_cor(seqs)
+#inspect_cor(NB,show_plot = T)
+```
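+
+A toy illustration (hypothetical numbers, not task data) of how `tsibble::slide_dbl` computes the trailing-window statistics above; with `.partial = TRUE` the early, shorter windows are kept rather than dropped:
+
+```{r slide-demo}
+slide_dbl(c(1, 0, 1, 1, 0), ~sum(.x), .size = 3, .partial = TRUE)
+# 1 1 2 2 2 -- running count of ones within a trailing window of 3 trials
+```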
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data 
= train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable 
Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method 
= "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ 
-library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# 
-# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuracy" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" (criterion) column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + +
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), +
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$rt - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$rt - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$rt - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>% + ggplot(aes(predicted, observed)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("RT: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>% drop_na(rt, correct, tl, sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These biases, under specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating the Generation of N-Back Sequences as a CSP Instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested inside a tibble, so put all new columns in a new list column +# instead of adding a new column for each.
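+# A possible shape for that refactor (sketch only, untested; assumes broom is
+# installed and that the x_* columns from the NB2 pipeline below exist):
+# NB2 %>%
+#   group_by(participant, condition, block) %>%
+#   nest() %>%
+#   mutate(fit     = purrr::map(data, ~lm(rt ~ x_t + x_l + x_sl + x_ul, data = .x)),
+#          glanced = purrr::map(fit, broom::glance)) %>%
+#   tidyr::unnest(glanced)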
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a. foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an upside-down quadratic function to be minimized.
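+As a quick visual check (a sketch only, assuming the same `1 - 10*dnorm(x, sd = 4)` form that the cost definitions in the following chunk use), the cost is lowest at zero deviation and rises toward 1 on either side:
+
+```{r cost-shape-sketch, eval=F}
+# Sketch: shape of the inverted-Gaussian cost; it is minimized when the
+# deviation from the constrained quantity (e.g., target count) is zero.
+x <- seq(-10, 10, by = 0.1)
+plot(x, 1.0 - 10 * dnorm(x, mean = 0, sd = 4), type = "l",
+     xlab = "deviation", ylab = "cost")
+```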
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + # R translation of the Python sketch above (untested assumption: a sequence + # counts as "lumpy" when the top half of the choices covers > 2/3 of trials) + trials <- length(x) + freqs <- sapply(choices, function(c) sum(x == c)) + sum(sort(freqs, decreasing = TRUE)[1:(length(choices) %/% 2)]) > (trials * 2 / 3) +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find the last occurrence of the stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio,
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (the "brand and URLID" note is leftover from the colas tutorial data above; here, drop the non-predictor columns)
+data <- data %>%
+  mutate(n = ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model <- plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with the lowest CV error
+cv <- RMSEP(pls.model)
+# subtract 1 because the first RMSEP entry is the zero-component (intercept-only) model
+best_dims <- which.min(cv$val["adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val["adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # corrplot() is not loaded anywhere else in this notebook
+
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components;
+## cv.modpls is needed below by cvtable(), so the call is left enabled.
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ Skewness score
+- $U$ Uniformity (non-repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a., foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down quadratic cost function to be minimized (implemented in the chunk below as inverted Gaussians, which are locally quadratic around the optimum).
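+
+The commented-out `ga()` lines in the next chunk can be completed along these lines. This is only a sketch under toy assumptions: it optimizes a two-number deviation vector rather than an actual trial sequence, and since `GA::ga` maximizes its fitness, the merged cost is negated:
+
+```{r, eval=F}
+library(GA)
+
+# Same inverted-Gaussian costs as in the next chunk (repeated to keep this self-contained).
+targets_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+lures_cost   <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+
+# Toy merged cost over a deviation vector: x[1] = target-count deviation, x[2] = lure-count deviation.
+merged_cost <- function(x) targets_cost(x[1]) + lures_cost(x[2])
+
+GA <- ga(type = "real-valued",
+         fitness = function(x) -merged_cost(x),  # negated: ga() maximizes
+         lower = c(-10, -10), upper = c(10, 10),
+         maxiter = 100)
+summary(GA)
+plot(GA)
+```
+
+Encoding a real n-back sequence would need a discrete representation (e.g. `type = "permutation"` or a custom decoder), which the TODO notes below leave open.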
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations
+#DEBUG inspect_cor(seqs, show_plot = T)
+
+
+```
+
+```{r remove_highly_correlated_predictors}
+# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+# (restrict cor() to numeric columns; seqs also carries factors such as correct and stimulus)
+cor_matrix <- cor(select(seqs, where(is.numeric)), use = "pairwise.complete.obs")
+cor_high <- findCorrelation(cor_matrix, 0.8)
+high_cor_remove <- row.names(cor_matrix)[cor_high]
+# drop the flagged predictors by column name
+seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
+```
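+
+As a quick sanity check on the step above (a sketch, assuming `seqs.uncorr` keeps at least two numeric columns), the largest remaining pairwise correlation should now fall below the 0.8 cutoff:
+
+```{r verify_uncorrelated, eval=F}
+# Recompute correlations among the surviving numeric predictors.
+cor_rest <- cor(select(seqs.uncorr, where(is.numeric)), use = "pairwise.complete.obs")
+max(abs(cor_rest[upper.tri(cor_rest)]))  # expected < 0.8
+```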
diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - 
ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - 
-rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc 
= c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- 
table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - Local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\\
+
+x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+
+\\\\
+
+x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+
+\\\\
+
+x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+
+\\\\
+
+x_{l} = L \times trials
+\\\\
+
+x_{ll} = L \times w
+\\\\
+
+x_{v} = |V|
+\\
+
+x_{ul} = w
+\\\\
+
+x_{s} = {trials \over |V|}
+\\\\
+
+x_{sl} = max(1, {w \over |V|})
+\\\\
+
+x_{g} = {trials \over w}
+
+\\\\
+
+x_{vl} = min(|V|, w)
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+with_history <- function(stimuli, length=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-length+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, length=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-length):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to a list-column workflow with broom for model fitting and evaluating the fits;
+# we are already nesting a list inside a tibble, so put all new columns in a single list
+# column instead of adding a separate column for each (see the sketch just below).
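+
+# A minimal editorial sketch of that list-column workflow (illustrative only:
+# `NB_nested_demo` and its `local` column are hypothetical names, and nothing
+# below depends on them). Per-window features live in one list column instead
+# of one top-level column each.
+NB_nested_demo <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>%
+  mutate(local = map(data, ~tibble(
+    history = with_history(.x$stimulus, window_size),
+    x_sl    = with_skewness(history),
+    x_ul    = with_lumpiness(history)
+  ))) %>%
+  ungroup()
+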
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
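+
+To make that shape concrete, a quick editorial sketch (assuming the `dnorm`-based
+cost definitions in the next chunk): the cost bottoms out at the constraint's
+target value ($x = 0$) and saturates at $1.0$ far from it.
+
+```{r cost-shape-demo, eval=F}
+# Visualize the Gaussian-shaped constraint cost used below.
+x <- seq(-20, 20, by = 0.1)
+plot(x, 1.0 - 10 * dnorm(x, mean = 0, sd = 4), type = "l",
+     xlab = "deviation from constraint target", ylab = "cost")
+```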
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (drop design columns that should not enter the model)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+# rename(
+#   ev.participant=participant,
+#   ev.n=n,
+#   ev.block=block,
+#   ev.stimulus_type=stimulus_type,
+#   rv.choice=choice,
+#   rv.rt=rt,
+#   rv.correct=correct
+# )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with the lowest CV error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. sort and visualize the top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+library(corrplot)  # corrplot() is not loaded anywhere else in this notebook
+
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components;
+## cv.modpls must be computed here because cvtable() below summarizes it
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+### Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (absence of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a., foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an upside-down quadratic function to be minimized.
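+
+Since the costs are to be minimized while `GA::ga()` maximizes its fitness
+function, a merged cost has to be negated before optimization. A minimal
+editorial sketch, assuming the `targets_cost` and `lures_cost` definitions in
+the next chunk (the two-element encoding and the bounds are illustrative, not
+part of the original analysis):
+
+```{r ga-sketch, eval=F}
+merged_cost <- function(x) targets_cost(x[1]) + lures_cost(x[2])
+GA <- ga(type = "real-valued",
+         fitness = function(x) -merged_cost(x),  # negate: ga() maximizes
+         lower = c(-10, -10), upper = c(10, 10))
+summary(GA)
+```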
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)?
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n - N, sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10 \times dnorm(T \times trials - x_t, sd=4)| \\
+x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times dnorm(x_{tl} - \tfrac{T}{trials} \times w, sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= \frac{trials}{|V|} \\
+x_{sl} &= \max(1, \tfrac{w}{|V|}) \\
+x_{g} &= \frac{trials}{w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+#library(plsRglm)
+#library(plsdof)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+
+```
+
+```{r preprocessing}
+
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+seqs <- NB %>%
+  group_by(participant, block, condition) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
+         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
+         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
+         sl = ifelse(is.na(sl), 0, sl),
+         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
+         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
+         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
+  nest(.key='local_stats') %>%
+  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
+  mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
+  mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
+  mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
+  mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
+  mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
+  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
+  ungroup() %>%
+  select(-participant,-block,-condition)
+
+#DEBUG View(seqs)
+inspectdf::inspect_cor(seqs)
+#inspect_cor(NB,show_plot = T)
+```
+
+```{r}
+# NOTE: NB2 is the trial-level data frame built in an earlier revision of this
+# notebook and is assumed to be in the workspace; it is not created here.
+model1 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a,data=., family = "gaussian")
+aug1 <- augment(model1)
+aug1 %>%
+  ggplot(aes(a,rt)) +
+  geom_point() +
+  geom_smooth(aes(y = .fitted), color = 'red')
+```
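+
+A quick fit-quality readout for `model1` (a sketch; `glance()` comes from
+broom, which the setup chunk already loads):
+
+```{r model1-diagnostics}
+broom::glance(model1)  # deviance, AIC/BIC, residual df
+```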
+
+```{r}
+model2 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian")
+aug2 <- augment(model2)
+aug2 %>%
+  ggplot(aes(jitter(al),rt)) +
+  geom_point(alpha=0.2,shape=18) +
+  xlab("accuracy") +
+  geom_smooth(aes(y=.fitted), color='blue') +
+  geom_smooth(aes(y=aug1$.fitted), color='red')
+
+```
+
+```{r models}
+
+nb_split <- initial_split(NB2, prop = 0.75)
+training_data <- training(nb_split)
+testing_data <- testing(nb_split)
+cv_split <- vfold_cv(training_data, v = 5)
+cv_data <- cv_split %>%
+  mutate(
+    train = map(splits, ~training(.x)),
+    validate = map(splits, ~testing(.x))
+  )
+
+cv_models_lm_a <- cv_data %>%
+  mutate(model = map(train, ~lm(formula = a~., data = .x)),
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_glm_a <- cv_data %>%
+  mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")),
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_pls_a <- cv_data %>%
+  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
+         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>%
+  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y))
+  )
+
+head(cv_models_pls_a)
+
+
+cv_models_pls_a1 <- cv_data[3][[1]]
+
+
+NBx <- NB %>%
+  group_by(participant) %>%
+  count(stimulus, name = "freq")
+
+ggplot(NBx, aes(stimulus, freq, group = participant)) +
+  geom_point()
+
+
+#%>%
+#  mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
+  #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
+  #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# Compile cross-validation settings
+
+#any(is.na(NB2))
+#NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
+#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
+
+```
diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd
new file mode 100644
index 0000000..ba1b312
--- /dev/null
+++ b/ccn2019/ccn2019.rev3.Rmd
@@ -0,0 +1,100 @@
+
+$P=\langle V,D,C,W \rangle$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+#! ===============================================
+#! load required packages
+
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+library(inspectdf)
+library(caTools)
+library(pROC)
+
+#! ===============================================
+#! load data set and set running window size
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+```
+
+
+```{r preprocessing}
+
+#! ===============================================
+#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations
+#DEBUG inspect_cor(seqs, show_plot = T)
+
+
+```
+
+```{r remove_highly_correlated_predictors}
+# WIP: an extra step for non-PLS methods, dropping highly correlated predictors
+cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs")
+cor_high <- findCorrelation(cor_matrix, 0.8)
+high_cor_remove <- row.names(cor_matrix)[cor_high]
+seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove))
+```
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
new file mode 100644
index 0000000..7da6569
--- /dev/null
+++ b/ccn2019/ccn2019_diagrams.R
@@ -0,0 +1,51 @@
+NB_avg %>%
+  mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>%
+  ggplot(aes(targets, accuracy, color=factor(cluster))) +
+  ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+
+NB_avg %>%
+  ggplot(aes(lures, accuracy, color=condition)) +
+  ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(skewness, accuracy, color=condition)) +
+  ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, accuracy, color=condition)) +
+  ggtitle("lumpiness", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, rts, color=condition)) +
+  ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") +
+  xlab("lumpiness") +
+  ylab("Average RT") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+nback <- NB_modified
+
+nback %>%
+  mutate(block=as.factor(block)) %>%
+  mutate(trial=as.factor(trial)) %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  #filter(condition=='3-back') %>%
+  #mutate(correct=as.numeric(correct)) %>%
+  inspect_cor(show_plot = T)
+
+averaged_nback <- NB_avg
+
+averaged_nback %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  inspect_cor(show_plot = T)
+
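+
+# Sketch: eps = 0.5 above is hand-picked. A common heuristic is to plot the
+# sorted k-nearest-neighbour distances (k close to minPts) and place eps at
+# the "elbow"; assumes NB_avg carries the accuracy and rts columns used above.
+dbscan::kNNdistplot(cbind(NB_avg$accuracy, NB_avg$rts), k = 3)
+abline(h = 0.5, lty = 2)  # candidate eps, read off the elbow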
diff --git a/ccn2019-correct.R b/ccn2019-correct.R
deleted file mode 100644
index 9b9d9fb..0000000
--- a/ccn2019-correct.R
+++ /dev/null
@@ -1,148 +0,0 @@
-#==================================================#
-# model the "correct" column
-
-library(here)
-library(tidyverse)
-library(caret)
-library(inspectdf)
-library(pls)
-
-#devtools::install_github("sachsmc/plotROC")
-library(plotROC)
-
-
-load(here("notebooks/data/nback_seqs.Rd"))
-
-set.seed(42)
-
-seqs.imputed <- seqs %>%
-  filter(!is.na(correct), !is.na(rt)) %>%
-  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
-
-inspect_cat(seqs.imputed)
-inspect_num(seqs.imputed)
-
-seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
-
-
-train_indexes <- createDataPartition(seqs.imputed$correct,
-                                     times = 1,
-                                     p = 0.7,
-                                     list = F)
-
-train_data <- seqs.imputed[train_indexes,]
-test_data <- seqs.imputed[-train_indexes,]
-
-control <- trainControl(
-  method = "repeatedcv",
-  number = 5,
-  repeats = 2,
-  classProbs = T,
-  verboseIter = T,
-  savePredictions = T,
-  sampling = "down",
-  selectionFunction = "oneSE"
-)
-
-pls.new_model <- train(
-  correct ~ .-a-al-dp-cr-rt,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.new_model)
-plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)")
-
-pls.common_model <- train(
-  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
-  data = train_data,
-  method = "pls",
-  metric = "Accuracy",
-  tuneLength = 20,
-  preProcess = c("zv","center","scale"),
-  trControl = control
-)
-
-plot(pls.common_model)
-plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
-
-
-#trellis.par.set(caretTheme())
-#densityplot(pls.new_model, pch = "|")
-#densityplot(pls.common_model, pch = "|")
-
-# Compile models and compare performance
-pls.models 
<- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - 
-pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- 
correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - 
SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB, a single row represent an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-fit <- lm(correct ~ t * s * u * l * d, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(t,s,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=u)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(t),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(s),jitter(u),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history),
-  - constraint3=fitness(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-  -
-
-
-
-
-```{python}
-a=2
-p
-```
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
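+# (Editorial sketch, not from the original analysis.) The list-column workflow
+# the TODO above points at would nest per-block trials and let broom collect
+# the fits, e.g. (model formula here is illustrative):
+#   NB %>%
+#     group_by(participant, condition, block) %>%
+#     nest() %>%
+#     mutate(fit     = map(data, ~ lm(rt ~ stimulus_type, data = .x)),
+#            glanced = map(fit, broom::glance)) %>%
+#     unnest(glanced)
+# That keeps every derived statistic in one list column instead of one
+# mutate() per column, as done below.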
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
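+
+The penalties in the next chunk implement this as inverted Gaussians, $1 - 10 \times dnorm(x, 0, 4)$: flat-bottomed bowls that are approximately quadratic near the optimum. A quick sketch of the curve (illustrative only, not part of the original pipeline):
+
+```{r cost-shape, eval=FALSE}
+# Plot the penalty as a function of deviation from the target value.
+# The cost bottoms out near 0.003 at x = 0 (10 * dnorm(0, 0, 4) ~ 0.997)
+# and saturates at 1 for large deviations.
+x <- seq(-20, 20, by = 0.1)
+cost <- 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+plot(x, cost, type = "l",
+     xlab = "deviation from target statistic",
+     ylab = "cost to minimize")
+```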
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
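+
+The commented `merged_cost` and `ga` lines in the chunk below hint at how these costs would drive a search. A minimal wiring of those pieces (an assumption on my part, not the original run: `GA::ga()` maximizes, so the summed cost is negated, and the two-element candidate vector is only a placeholder for a real sequence encoding):
+
+```{r ga-sketch, eval=FALSE}
+library(GA)
+
+# Combine the per-constraint costs defined in the next chunk; the candidate
+# x is a placeholder encoding, one deviation per constraint.
+merged_cost <- function(x) targets_cost(x[1]) + lures_cost(x[2])
+
+GA <- ga(type = "real-valued",
+         fitness = function(x) -merged_cost(x), # negate: ga() maximizes
+         lower = c(-10, -10), upper = c(10, 10),
+         maxiter = 50)
+summary(GA)
+plot(GA)
+```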
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
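+
+To make the problem concrete (an editorial sketch with assumed parameters, 24 trials, an 8-letter vocabulary, 2-back, not from the original analysis): target and lure counts drift widely across naively generated random sequences, which is exactly the variation the constraints below are meant to pin down.
+
+```{r random-sequence-drift, eval=FALSE}
+set.seed(1)
+
+# Count n-back matches (targets) and n±1-back matches (lures, roughly the
+# definition used by with_lures() elsewhere) in one random sequence.
+sequence_stats <- function(trials = 24, vocab = LETTERS[1:8], n = 2) {
+  s <- sample(vocab, trials, replace = TRUE)
+  c(targets = sum(s[(n + 1):trials] == s[1:(trials - n)]),
+    lures   = sum(s[(n + 2):trials] == s[1:(trials - n - 1)]) +
+              sum(s[n:trials]       == s[1:(trials - n + 1)]))
+}
+
+# Across 1000 unconstrained sequences the counts vary widely; this is the
+# heterogeneity the CSP formulation below is meant to control.
+summary(t(replicate(1000, sequence_stats())))
+```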
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties, which one matters the most?
+    - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity
+    - Global: No lures, large vocabulary -> pattern repeats implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints (each statistic $x$ is pinned to a target value, with weights $W$ that fall off as the statistic deviates from that target):
+
+$$
+\begin{aligned}
+x_n &= N, & W_n &= 1 - |10 \times dnorm(x_n-N,sd=4)| \\
+x_t &= T \times trials, & W_t &= 1 - |10\times dnorm(T\times trials-x_t,sd=4)| \\
+x_{tl} &= {T \times w \over trials}, & W_{tl} &= 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| \\
+x_{l} &= L \times trials \\
+x_{ll} &= L \times w \\
+x_{v} &= |V| \\
+x_{ul} &= w \\
+x_{s} &= {trials \over |V|} \\
+x_{sl} &= \max(1, {w \over |V|}) \\
+x_{g} &= {trials \over w} \\
+x_{vl} &= \min(|V|, w)
+\end{aligned}
+$$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+#library(plsRglm)
+#library(plsdof)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+
+```
+
+```{r preprocessing}
+
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+seqs <- NB %>%
+  group_by(participant, block, condition) %>%
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  # windowed (local) statistics over the trailing window_size trials:
+  mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size),
+         ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size),
+         sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size),
+         sl = ifelse(is.na(sl), 0, sl),
+         ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size),
+         vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size),
+         al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>%
+  nest(.key='local_stats') %>%
+  #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
+  # whole-block (global) statistics:
+  mutate(a = map(local_stats, ~length(which(.x$correct)))) %>%
+  mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
+  mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
+  mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
+  mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>%
+  mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
+  ungroup() %>%
+  select(-participant,-block,-condition)
+
+#DEBUG View(seqs)
+inspectdf::inspect_cor(seqs)
+#inspect_cor(NB,show_plot = T)
+```
+
+```{r}
+# baseline GLM: rt as a function of targets (t), n level, and accuracy (a)
+model1 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a,data=., family = "gaussian")
+aug1 <- augment(model1)
+aug1 %>%
+  ggplot(aes(a,rt)) +
+  geom_point() +
+  geom_smooth(aes(y=.fitted,
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = 
pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, 
type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - 
-trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, 
seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# 
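
Two details worth flagging in the commented-out metrics above: they compare the RT model's predictions against `test_data$cr` rather than `test_data$rt` (apparently carried over from the criterion script), and caret already bundles these computations. A sketch using caret's helper, assuming the objects defined above (only the new model in this script is actually fit to `rt`):

```r
# postResample() returns RMSE, Rsquared, and MAE for regression predictions.
caret::postResample(pred = pls.new_predicted, obs = test_data$rt)
```
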
- -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? 
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - 
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
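
As a quick numeric sanity check of that shape, using the same dnorm-based form defined in the chunk below: the cost sits near zero when the sequence statistic is on target and climbs toward one as it deviates.

```r
targets_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
round(targets_cost(c(-8, -4, 0, 4, 8)), 3)
#> [1] 0.865 0.395 0.003 0.395 0.865   # minimized at the target, x = 0
```
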
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs[,-1]) -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -#FIXME remove by column name -seqs.uncorr <- seqs %>% select(-high_cor_remove) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0 
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc()
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" column (response criterion)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
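+# Sketch (not part of the original script): caret::postResample() returns
+# RMSE, Rsquared, and MAE in a single call, which is a compact stand-in for
+# the commented-out SSE/RMSE block above:
+postResample(pred = pls.common_predicted, obs = test_data$cr)
+postResample(pred = pls.new_predicted, obs = test_data$cr)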
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+#f <- rt ~ n + t + v
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "RT" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-cr-dp-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method = "cv",
+                     number = 10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T,
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by=0.25))
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
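+# Sketch (not part of the original script): the AUC alone, without the full
+# plot, can be extracted via pROC::auc:
+#   auc(seqs.test.observed_y, seqs.test.y_prob$YES)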
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd
new file mode 100644
index 0000000..9220ff4
--- /dev/null
+++ b/ccn2019/ccn2019.rev0.Rmd
@@ -0,0 +1,581 @@
+---
+title: "Statistical Properties of the N-Back Sequences"
+output:
+  html_notebook: default
+  pdf_document: default
+editor_options:
+  chunk_output_type: inline
+---
+
+# Problems
+
+Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
+
+# Gaps
+
+- Unclear how to parameterize interesting variations for sequence generation
+- How do we model these multiple strategies (which requires identifying which sequence variations matter)
+  - local vs. global properties: which one matters the most?
+  - Local: lumpiness, short sequence patterns -> could be exploited by "reactive"/automaticity
+  - Global: no lures, large vocabulary -> a pattern repeat implies a target
+
+
+## Formulating N-Back Sequence Generation as a CSP Instance
+
+$P=\langle V,D,C,W\rangle$
+
+$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$
+
+$D=\{\}$
+
+
+Constraints:
+
+$$
+x_n = N, \quad W_n = 1 - |10 \times dnorm(x_n-N,sd=4)|
+\\
+x_t = T \times trials, \quad W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)|
+\\
+x_{tl} = {T \times w \over trials}, \quad W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)|
+\\
+x_{l} = L \times trials
+\\
+x_{ll} = L \times w
+\\
+x_{v} = |V|
+\\
+x_{ul} = w
+\\
+x_{s} = {trials \over |V|}
+\\
+x_{sl} = max(1, {w \over |V|})
+\\
+x_{g} = {trials \over w}
+\\
+x_{vl} = min(|V|, w)
+$$
+
+```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(plsRglm)
+library(plsdof)
+library(caret)
+```
+
+```{r params}
+load('./data/CL2015.RData')
+
+window_size <- 8
+```
+
+
+
+```{r history}
+
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(1:length(stimulus), function(i) {
+    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
+    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
+    ifelse(are_valid_trials && stimulus[i] %in% lures,
+           "lure",
+           as.character(stimulus_type[i]))
+  })
+}
+
+# NOTE: the window parameter is named `size` (not `length`) so it does not
+# shadow base::length() inside the sapply below.
+with_history <- function(stimuli, size=16, fixed=F) {
+  seq <- paste(stimuli, collapse = '')
+
+  sapply(1:length(stimuli), function(i) {
+    stri_reverse(str_sub(seq, max(1,i-size+1), i))
+  })
+  #ifelse(fixed, h[str_length(h)==size], h)
+}
+
+# $x_{s,local}$
+with_skewness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    sum(sort(freqs, decreasing = T)[1:2]) - 1
+  })
+}
+
+# $x_{u,local}$
+with_lumpiness <- function(history) {
+  sapply(history, function(h) {
+    freqs <- table(unlist(str_split(h,"")))
+    max(freqs) - 1
+  })
+}
+
+
+with_targets_ratio <- function(stimulus_type, size=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-size):i]
+    length(trials[trials=="target"]) / length(trials)
+  })
+}
+
+with_lures_ratio <- function(stimulus_type, size=16) {
+  sapply(1:length(stimulus_type), function(i) {
+    trials <- stimulus_type[max(1,i-size):i]
+    length(trials[trials=="lure"]) / length(trials)
+  })
+}
+
+#TODO change to list column workflow with broom for model fitting and evaluating the fits
+# duh! we are using a list nested inside a tibble, so put all new columns in a new list column
+# instead of adding a new column for each.
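+# A quick illustrative check of the helpers above (toy letters, window of 3;
+# values worked out by hand, not from the experiment data):
+#   with_history(c("A","B","A","C"), 3)  # "A"  "BA"  "ABA"  "CAB"
+#   with_skewness(c("ABA"))              # 2 = top-two letter counts (2+1) minus 1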
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
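+
+For intuition: with the Gaussian-based form used in the next chunk, the cost is lowest when the observed statistic matches the constraint (zero deviation) and levels off near 1 for large deviations. A quick sketch of that shape (illustrative only):
+
+```{r cost-shape, eval=F}
+# Shape of the dnorm-based constraint cost; minimum at zero deviation.
+curve(1.0 - 10 * dnorm(x, mean = 0, sd = 4), from = -20, to = 20,
+      xlab = "deviation from constraint", ylab = "cost")
+```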
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
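+
+The `lumpiness_cost` stub in the next chunk still carries its Python pseudocode in comments; a minimal R translation of that check (a sketch; `x` is assumed to be the vector of trials and `choices` the stimulus vocabulary) would be:
+
+```{r lumpiness-sketch, eval=F}
+lumpiness_cost <- function(x, choices) {
+  trials <- length(x)
+  freqs <- sapply(choices, function(ch) sum(x == ch))
+  # "ralph_skewed": TRUE when the most frequent half of the vocabulary
+  # accounts for more than two thirds of all trials
+  top_half <- sort(freqs, decreasing = TRUE)[seq_len(length(choices) %/% 2)]
+  sum(top_half) > (trials * 2 / 3)
+}
+```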
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd new file mode 100644 index 0000000..c566656 --- /dev/null +++ b/ccn2019/pls_playground.Rmd @@ -0,0 +1,104 @@ +--- +title: "PLS Training" +output: html_notebook +editor_options: + chunk_output_type: console +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error + +best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. 
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - 
mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - 
boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "cr" (response criterion) column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# Visualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_bar(position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data,
type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" 
diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# Visualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_bar(position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - rt ~ .-a-al-dp-cr-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$rt - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$rt - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$rt - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>% - ggplot(aes(predicted, observed)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Reaction Time: Predicted vs Actual") + - xlab("Predicted") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - classProbs = T, - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0.25,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter)? - - local vs. global properties, which one matters the most? (see the sketch below) - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target
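As a concrete illustration of the local/global distinction — an editorial sketch, not part of the original notebook; the toy vector `stim` and window size `w` are made up — the same skewness-style score can be computed once over the whole sequence (global) or over a trailing window ending at each trial (local):

```{r, eval=F}
# Editorial sketch: global vs. local (windowed) sequence statistics.
stim <- c("A","B","A","C","B","B","A","D","B","A")  # toy stimulus sequence
w <- 4                                              # toy window size

# global: combined share of the two most frequent stimuli, whole sequence
global_skewness <- (sum(sort(table(stim), decreasing = T)[1:2], na.rm = T) - 1) / length(stim)

# local: the same score inside a trailing window ending at each trial
local_skewness <- sapply(seq_along(stim), function(i) {
  win <- stim[max(1, i - w + 1):i]
  (sum(sort(table(win), decreasing = T)[1:2], na.rm = T) - 1) / length(win)
})
```

A reactive strategy only ever "sees" the local scores, so two sequences with identical global statistics can still afford very different local exploits.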
- - -## Formulating N-Back Sequence Generation as a CSP Instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using a list nested inside a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale.
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (i.e., absence of repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a. foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -}
-with_lag <- function(stimulus, history) { - # lag (in trials) since the last occurrence of the current stimulus; - # left empty in the original, so this body is an editorial sketch - sapply(1:length(history), function(i) { - h <- unlist(str_split(history[i], "")) - prev <- which(head(h, -1) == as.character(stimulus[i])) - ifelse(length(prev) == 0, NA, length(h) - max(prev)) - }) -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(targets,skewness,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=lumpiness)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(targets),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + -
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a = 2 -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (drop id/bookkeeping columns) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross-validation to find the optimal number of dimensions -pls.model <- plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) -
-## 5. sort and visualize the top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -library(corrplot) - -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components -cv.modpls <- cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls <- cvtable(summary(cv.modpls)) - -res6 <- plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (i.e., absence of repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a. foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an upside-down quadratic function to be minimized.
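To see that shape concretely — an editorial sketch, not from the original notebook; it just restates the `targets_cost` defined in the next chunk and plots it:

```{r, eval=F}
# Editorial sketch: the constraint penalty is an inverted Gaussian --
# near zero at the target value and approaching 1 as the deviation
# grows, so minimizing it pulls the sequence statistic toward target.
targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
curve(targets_cost, from = -15, to = 15,
      xlab = "deviation from target count", ylab = "cost")
```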
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # lag (in trials) since the last occurrence of the current stimulus; - # left empty in the original, so this body is an editorial sketch - sapply(1:length(history), function(i) { - h <- unlist(str_split(history[i], "")) - prev <- which(head(h, -1) == as.character(stimulus[i])) - ifelse(length(prev) == 0, NA, length(h) - max(prev)) - }) -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio,
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represents an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(targets,skewness,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=lumpiness)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(targets),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a = 2 -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter)? - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating N-Back Sequence Generation as a CSP Instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -#DEBUG View(seqs) -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted,
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~glm(formula = a~., data = .x, family = "gaussian")), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(stimulus, Freq)) + - geom_point() - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} -
-#! =============================================== -#! A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are global and local (recent) accuracy, respectively -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd"))
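 -#! =============================================== -#! Editorial note (an assumption about the pre-slider tsibble API used -#! above): slide_dbl(x, f, .size = w, .partial = TRUE) applies f to a -#! trailing window of w elements ending at each position, e.g. -#! slide_dbl(c(1, 0, 1, 1), ~sum(.x), .size = 2, .partial = TRUE) -#! returns c(1, 1, 1, 2); wrapping the inputs in lag(), as above, keeps -#! each window strictly causal, so trial i only sees trials 1..i-1.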
 -#! =============================================== -#! visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - - -``` - -```{r remove_highly_correlated_predictors} -# WIP: This is an extra step for non-pls methods to remove highly correlated predictors -cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs") -cor_high <- findCorrelation(cor_matrix, 0.8) -high_cor_remove <- row.names(cor_matrix)[cor_high] -seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove)) -``` diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R new file mode 100644 index 0000000..adfb442 --- /dev/null +++ b/ccn2019/ccn2019-accuracy.R @@ -0,0 +1,108 @@ +#==================================================# +# model the "accuracy" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T +) + +pls.new_model <- train( + a ~ .-al-dp-cr-rt-correct, + data = train_data.imbalanced, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + +plot(varImp(pls.new_model), main="Variable Importance for Accuracy") + + +pls.old_model <- train( + a ~ t + n + v, + data = train_data, + method = "pls", + preProcess = c("center","scale"), + trControl = control +) + + +pls.old_model +pls.new_model +plot(varImp(pls.old_model)) + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.old_model, pch = "|") + +resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) +summary(resamps) +dotplot(resamps, metric = "Rsquared") +difValues <- diff(resamps) +bwplot(difValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.old_predicted <- predict(pls.old_model, test_data, type="raw") + + +summary(pls.new_model) + + +# SSE and RMSE + +SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors +SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.new_predicted)^2) +RMSE <- sqrt(SSE/length(pls.new_predicted)) + + +SSE <- sum((test_data$a - pls.old_predicted)^2) +R_square <- 1 - SSE/SST +SSE <- sum((test_data$a - pls.old_predicted)^2) +RMSE <- sqrt(SSE/length(pls.old_predicted)) + + +as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=lm,se=F) + + ggtitle("Accuracy: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed") diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R new file mode 100644 index 0000000..9b9d9fb --- /dev/null +++ b/ccn2019/ccn2019-correct.R @@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(pls.models) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgrey"), lty=c(1,3),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "cr" (response criterion) column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# Visualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_bar(position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predicted") + + ylab("Observed")
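The commented-out SSE/RMSE block above can also be condensed into one call; an editorial sketch using caret's `postResample` helper, not part of the original script:

```r
# Editorial sketch: held-out RMSE / R^2 / MAE for the criterion model.
library(caret)
postResample(pred = pls.common_predicted, obs = test_data$cr)
```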
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
+
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- rt ~ n + t + v
+
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced  # plain split; ROSE oversampling below is optional
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "rt" (reaction time) column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_bar(position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$rt - pls.new_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method="glm") +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method = "cv",
+                     number = 10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T,   # needed for type="prob" predictions below
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by=0.25)) # SVM cost must be positive
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
+
diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
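+# A minimal sketch of that list-column + broom workflow (the names `NB_fits`,
+# `fit`, and `fit_stats` are illustrative, and the lm() formula is only a
+# placeholder for whatever model is fit per group):
+library(broom)
+NB_fits <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>%
+  mutate(fit       = map(data, ~lm(rt ~ correct, data = .x)),
+         fit_stats = map(fit, glance)) %>%
+  select(-data, -fit) %>%
+  unnest(fit_stats)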
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
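+A quick numeric sanity check of that cost shape (a sketch; `targets_cost` mirrors the definition in the next chunk, and the printed values are approximate). Strictly, the `dnorm`-based penalty is an inverted Gaussian rather than a quadratic: it is close to 0 at zero deviation and approaches 1 as the deviation grows.
+
+```{r, eval=F}
+targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
+round(sapply(c(0, 2, 4, 8), targets_cost), 2)  # ~ 0.00 0.12 0.40 0.87
+```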
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
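+The `lumpiness_cost` stub in the next chunk still carries its original Python pseudocode and returns `NA`; a speculative R translation (assuming `x` is a character vector of trials and `choices` is the stimulus vocabulary) could be:
+
+```{r, eval=F}
+lumpiness_cost <- function(x, choices) {
+  trials <- length(x)
+  freqs <- sapply(choices, function(c) sum(x == c))
+  # "ralph_skewed": TRUE when the top half of the vocabulary covers > 2/3 of trials
+  sum(sort(freqs, decreasing = TRUE)[1:(length(choices) %/% 2)]) > (trials * 2 / 3)
+}
+```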
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd new file mode 100644 index 0000000..c566656 --- /dev/null +++ b/ccn2019/pls_playground.Rmd @@ -0,0 +1,104 @@ +--- +title: "PLS Training" +output: html_notebook +editor_options: + chunk_output_type: console +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error + +best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. 
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019_diagrams.R b/ccn2019_diagrams.R deleted file mode 100644 index 7da6569..0000000 --- a/ccn2019_diagrams.R +++ /dev/null @@ -1,51 +0,0 @@ -NB_avg %>% - mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% - ggplot(aes(targets, accuracy, color=factor(cluster))) + - ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - - -NB_avg %>% - ggplot(aes(lures, accuracy, color=condition)) + - ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(skewness, accuracy, color=condition)) + - ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, accuracy, color=condition)) + - ggtitle("lumpiness", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, rts, color=condition)) + - ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + - xlab("lumpiness") + - ylab("Average RT") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -nback <- NB_modified - -nback %>% - mutate(block=as.factor(block)) %>% - mutate(trial=as.factor(trial)) %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - #filter(condition=='3-back') %>% - #mutate(correct=as.numeric(correct)) %>% - inspect_cor(show_plot = T) - -averaged_nback <- NB_avg - -averaged_nback %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - inspect_cor(show_plot = T) - diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs 
%>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - -pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = 
"down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- 
seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] 
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion 
- Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-svm.R b/ccn2019-svm.R deleted file mode 100644 index a1590a5..0000000 --- a/ccn2019-svm.R +++ /dev/null @@ -1,79 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -seqs <- seqs %>%drop_na(rt, correct, tl,sl) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - -ctrl <- trainControl(method="cv", - number=10, -# repeats = 1, #repeatedcv - sampling = "up", - savePredictions = T, - verbose = T) - -tune <- expand.grid(C = seq(0,5,by=0.25)) - -model <- train(seqs.train.x, - seqs.train.y, - method = "svmLinear", - preProc = c("center", "scale"), - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd deleted file mode 100644 index 9220ff4..0000000 --- a/ccn2019.rev0.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Statistical Properties of the N-Back 
Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: inline ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. - -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. 
-NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
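To make the weighting concrete: each $W$ above is a cost that is smallest when the realized statistic matches its intended value and approaches 1 as the deviation grows. A minimal sketch with hypothetical numbers (a 24-trial sequence with target ratio $T = 1/3$, so the ideal target count is 8):

```r
trials  <- 24
T_ratio <- 1/3                      # "T" spelled out to avoid masking TRUE
x_t     <- c(8, 12, 16)             # candidate target counts
W_t     <- 1 - abs(10 * dnorm(T_ratio * trials - x_t, sd = 4))
round(W_t, 2)                       # 0.00 0.40 0.87 -- lowest at the ideal count
```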
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. 
clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -```{r plsrglm} -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` - - - diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd deleted file mode 100644 index 9074227..0000000 --- a/ccn2019.rev1.Rmd +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. 
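One detail worth noting before the cost functions below: `GA::ga()` maximizes its fitness function, so a cost meant to be minimized has to be negated when handed to the GA (the commented-out `ga()` call in the next chunk passes the summed cost directly). A self-contained sketch with a toy one-dimensional cost:

```r
library(GA)
toy_cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)  # minimum at x = 0
GA <- ga(type = "real-valued", fitness = function(x) -toy_cost(x),
         lower = -10, upper = 10, maxiter = 50, seed = 1)
GA@solution  # close to 0, where the cost is smallest
```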
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% - ungroup() - -pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) - -NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - - -## participant-level averaged NB, a single row represent an observation for a single subject -## in a single condition -NB_avg <- NB_modified %>% - group_by(participant, condition) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - summarise( - targets=sum(targets), - lures=sum(lures), - skewness=sum(skewness), - lumpiness=sum(lumpiness), - rt = mean(rt, na.rm=T), - correct=sum(correct,na.rm=T)/90) %>% - ungroup() - -# print -# NB_modified %>% -# filter(participant=='P1') %>% -# View() -# - - -fit <- lm(correct ~ t * s * u * l * d, NB_modified) - -``` - - -```{r} - -# DBSCAN Clustering (RT+ACCURACY against skewness) -NB_avg <- NB_avg %>% - mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) - -NB_avg %>% - ggplot(aes(skewness, correct, color=factor(cluster))) + - ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - -``` - - -```{r} -## single-subject figures -NB_modified %>% - ggplot(aes(t,s,color=correct)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - inspect_cor(show_plot = T) - -NB_avg %>% - inspect_cor(show_plot = T) - -NB_modified %>% - ggplot(aes(rt,correct,color=u)) + - geom_jitter() + - geom_point() + - stat_summary(fun.y="mean") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(t),color=correct)) + - geom_jitter() + - geom_point(alpha=0.1) - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(s),jitter(u),color=correct)) + - geom_jitter() + - geom_point() + - facet_wrap(~condition) - -# rt/accuracy and lures -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + - geom_jitter() + - geom_point(shape=16) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd deleted file mode 100644 index 7e19f45..0000000 --- a/ccn2019.rev2.Rmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: "Statistical Properties of the N-Back Sequences" -output: - html_notebook: default - pdf_document: default -editor_options: - chunk_output_type: console ---- - -# Problems - -Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-cor_matrix <- cor(seqs[,-1])
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-#FIXME remove by column name
-seqs.uncorr <- seqs %>% select(-high_cor_remove)
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" columns (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
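The SSE/RMSE block in the accuracy script repeats the same arithmetic for each model; a small helper (hypothetical name, same formulas as the block above) keeps the two evaluations consistent:

```r
eval_fit <- function(predicted, observed, baseline) {
  SSE <- sum((observed - predicted)^2)
  SST <- sum((observed - baseline)^2)  # baseline: mean of the training response
  c(R_square = 1 - SSE / SST, RMSE = sqrt(SSE / length(predicted)))
}
eval_fit(pls.new_predicted, test_data$a, mean(train_data$a))
eval_fit(pls.old_predicted, test_data$a, mean(train_data$a))
```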
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0 +1,148 @@
+#==================================================#
+# model the "correct" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(pls)
+
+#devtools::install_github("sachsmc/plotROC")
+library(plotROC)
+
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_cat(seqs.imputed)
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  classProbs = T,
+  verboseIter = T,
+  savePredictions = T,
+  sampling = "down",
+  selectionFunction = "oneSE"
+)
+
+pls.new_model <- train(
+  correct ~ .-a-al-dp-cr-rt,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+plot(varImp(pls.new_model), main="Variable Importance for Correctness (New Model)")
+
+pls.common_model <- train(
+  correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l,
+  data = train_data,
+  method = "pls",
+  metric = "Accuracy",
+  tuneLength = 20,
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.common_model)
+plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)")
+
+
+#trellis.par.set(caretTheme())
+#densityplot(pls.new_model, pch = "|")
+#densityplot(pls.common_model, pch = "|")
+
+# Compile models and compare performance
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+#DEBUG summary(pls.models)
+#DEBUG dotplot(pls.models)
+#DEBUG diffValues <- diff(pls.models)
+bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance")
+
+
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob")
+
+confusionMatrix(pls.new_predicted, test_data$correct)
+confusionMatrix(pls.common_predicted, test_data$correct)
+
+library(pROC)
+par(pty="s")
+
+roc(test_data$correct,
+    pls.common_predicted_prob$CORRECT,
+    plot = T,
+    legacy.axes=T,
+    lwd=2,
+    col="darkgrey",
+    lty = 3,
+    print.auc = T,
+    print.auc.y = 45,
+    print.auc.x = 55,
+    percent = T,
+    ci = T,
+    boot.n = 100
+    )
+
+
+
+# roc_test_indices <- createDataPartition(test_data$correct,
+#                                         times = 10,
+#                                         p = 0.9,
+#                                         list = F)
+
+#for (i in 1:ncol(roc_test_indices)) {
+#  test_sample_correct <- test_data[roc_test_indices[,i],]$correct
+#  predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT
+
+#  plot.roc(test_sample_correct,
+#           predprob_sample_correct,
+roc(test_data$correct,
+    pls.new_predicted_prob$CORRECT,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    add=T,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
+#}
+
+legend(100,100, legend=c("New Model", "Common Model"),
+       col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9)
+
+# requires plotROC package
+#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) +
+#DEBUG   geom_roc()
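The overlaid ROC plot above compares the two correctness models visually; pROC can also test the AUC difference directly. A minimal sketch reusing the predicted probabilities from the script (DeLong's test is pROC's default for paired curves):

```r
roc.new    <- roc(test_data$correct, pls.new_predicted_prob$CORRECT, percent = TRUE)
roc.common <- roc(test_data$correct, pls.common_predicted_prob$CORRECT, percent = TRUE)
roc.test(roc.new, roc.common)  # paired test of the AUC difference
```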
diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R
new file mode 100644
index 0000000..5bcefba
--- /dev/null
+++ b/ccn2019/ccn2019-criterion.R
@@ -0,0 +1,119 @@
+#==================================================#
+# model the "cr" column (the signal-detection criterion)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  cr ~ .-a-al-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+
+ggplot(varImp(pls.new_model)) +
+  labs(title="Criterion - Variable Importance")
+
+pls.common_model <- train(
+  cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$cr - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Criterion: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R
new file mode 100644
index 0000000..421a43f
--- /dev/null
+++ b/ccn2019/ccn2019-feature-selection.R
@@ -0,0 +1,55 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type
+#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+#if (toString(f[[2]]) == "correct")
+#  seqs.train <- ROSE(f, data = seqs.train.balanced)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]])
+
+
+# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"])
+# seqs.train.x <- upSamples %>% dplyr::select(-correct)
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+ctrl <- trainControl(method = "cv",
+                     number = 3,
+                     verboseIter = T)
+
+model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl)
+#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl)
+
+ctrl <- rfeControl(functions = rfFuncs,
+                   method = "cv",
+                   number = 3,
+                   verbose = T)
+
+rmProfile <- rfe(seqs.train.x, seqs.train.y,
+                 rfeControl = ctrl)
+
+summary(model)
+rmProfile
diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R
new file mode 100644
index 0000000..f757596
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-auc.R
@@ -0,0 +1,83 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+library(ppls)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- correct ~ n + t + stimulus_type
+f <- correct ~ n + tl + vl + sl + s + stimulus_type
+
+# predictors selected with stepAIC
+f <- correct ~ n + tl + t + l + stimulus_type
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+set.seed(10)
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10,
+  classProbs = T,
+  summaryFunction = twoClassSummary,
+  sampling = "down"
+)
+
+grid <- expand.grid(
+  alpha = seq(0,1,length=10),
+  lambda = seq(0.0001, 0.1, length=10)
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "glmnet",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               tuneGrid = grid,
+               metric = "ROC",
+               trControl = ctrl)
+
+seqs.test.y_prob <- predict(model, seqs.test.x, type="prob")
+
+model
+plot(varImp(model, useModel = F))
+plot(model)
+max(model$results$ROC)
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 2000,
+    ci = T)
+
+
+
+# PPLS
+#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T)
diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R
new file mode 100644
index 0000000..73e2fd3
--- /dev/null
+++ b/ccn2019/ccn2019-penalized-rt.R
@@ -0,0 +1,59 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+library(pROC)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+f <- rt ~ n + t + v
+
+f <- rt ~ n + tl + v + s + l
+
+set.seed(654321)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(
+  method = "cv",
+  number = 10
+)
+
+grid <- expand.grid(
+  ncomp = 1:3
+)
+
+model <- train(seqs.train.x, seqs.train.y,
+               method = "pls",
+               preProcess = c("nzv","center","scale"), # or c("knnImpute"),
+               #tuneGrid = grid,
+               tuneLength = 10,
+               metric = "RMSE",
+               trControl = ctrl)
+
+seqs.test.predicted_y <- predict(model, seqs.test.x)
+
+model
+plot(varImp(model, useModel = F, scale=F))
+plot(model)
+min(model$results$RMSE)
+
+# RT
+data.frame(
+  RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y),
+  Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y)
+)
diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R
new file mode 100644
index 0000000..4f6defb
--- /dev/null
+++ b/ccn2019/ccn2019-rt.R
@@ -0,0 +1,117 @@
+#==================================================#
+# model the "RT" column
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(skimr)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+#DEBUG inspect_num(seqs.imputed)
+
+#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1]
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+# Visualize split
+train_data.imbalanced$grp <- "train"
+test_data$grp <- "test"
+
+rbind(train_data.imbalanced, test_data) %>%
+  ggplot(aes(x=correct, fill=grp)) +
+  geom_histogram(stat="count", position='dodge') +
+  labs(title="Imbalanced Split")
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T,
+  savePredictions = T
+)
+
+train_data <- train_data.imbalanced %>% select(-grp)
+
+pls.new_model <- train(
+  rt ~ .-a-al-cr-dp-rt-correct,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+plot(pls.new_model)
+summary(pls.new_model)
+plot(varImp(pls.new_model), main="Reaction Time - Variable Importance")
+
+pls.common_model <- train(
+  rt ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl,
+  data = train_data,
+  method = "pls",
+  preProcess = c("zv","center","scale"),
+  trControl = control
+)
+
+summary(pls.common_model)
+plot(varImp(pls.common_model), main="Reaction Time - Variable Importance (Common Model)")
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.common_model, pch = "|")
+
+pls.models <- resamples(list(new = pls.new_model, common = pls.common_model))
+summary(pls.models)
+dotplot(pls.models, metric = "Rsquared")
+diffValues <- diff(pls.models)
+bwplot(diffValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.common_predicted <- predict(pls.common_model, test_data, type="raw")
+
+# SSE and RMSE
+#
+# SSE <- sum((test_data$rt - pls.new_predicted)^2) # sum of squared errors
+# SST <- sum((test_data$rt - mean(train_data$rt))^2) # total sum of squares, remember to use training data here
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.new_predicted))
+#
+#
+# SSE <- sum((test_data$rt - pls.common_predicted)^2)
+# R_square <- 1 - SSE/SST
+# RMSE <- sqrt(SSE/length(pls.common_predicted))
+#
+
+as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$rt)) %>%
+  ggplot(aes(predicted, observed)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=glm) +
+  ggtitle("Reaction Time: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R
new file mode 100644
index 0000000..a8f4a30
--- /dev/null
+++ b/ccn2019/ccn2019-svm.R
@@ -0,0 +1,79 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
+
+set.seed(654321)
+
+seqs <- seqs %>% drop_na(rt, correct, tl, sl)
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+ctrl <- trainControl(method = "cv",
+                     number = 10,
+#                    repeats = 1, #repeatedcv
+                     classProbs = T, # needed for predict(..., type="prob") below
+                     sampling = "up",
+                     savePredictions = T,
+                     verboseIter = T)
+
+tune <- expand.grid(C = seq(0.25, 5, by = 0.25)) # C must be positive for ksvm
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "svmLinear",
+               preProc = c("nzv","center", "scale"),
+               tuneLength = 10,
+               tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
+
+# RT
+# data.frame(
+#   RMSE = RMSE(y.test, seqs.test$correct),
+#   Rsquare = R2(y.test, seqs.test$correct)
+# )
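As a compact summary of the SVM's test performance, a threshold-based and a threshold-free metric can sit side by side; a minimal sketch reusing the predictions above:

```r
postResample(seqs.test.y, seqs.test.observed_y)       # Accuracy and Kappa
auc(roc(seqs.test.observed_y, seqs.test.y_prob$YES))  # AUC from the same fit
```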
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
+NB2 <- NB %>% + group_by(participant, condition, block) %>% + nest() %>% unnest(data) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(history = with_history(stimulus, window_size)) %>% + mutate(x_sl = with_skewness(history)) %>% + mutate(x_ul = with_lumpiness(history)) %>% + mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% + mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% + ungroup() + +pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude) +NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# caret +library(caret) +# Compile cross-validation settings + + +any(is.na(NB2)) +NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + + +plsResult +``` + + + +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +```{r plsrglm} +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` + + + diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd new file mode 100644 index 0000000..9074227 --- /dev/null +++ b/ccn2019/ccn2019.rev1.Rmd @@ -0,0 +1,281 @@ +--- +title: "Evaluating N-Back Sequences" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(tidyverse) +library(ggplot2) +library(stringi) +library(GA) +library(dbscan) +library(inspectdf) + +load('./data/CL2015.RData') +``` + +### Variables +- $T$ number of targets +- $L$ number of lures +- $S$ Skewness score +- $U$ Uniformity (!repetition) +- $RT_{mean}$ +- $Accuracy_{mean}$ +- $dprime$ +- $criterion$ + +## Constraints + +- fixed number of targets +- fixed number of lures (a.k.a, foils) +- uniform distribution of choices +- controlled local lumpiness + + +Each constraint is an up side down quadratic function to be minimized. 
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red')) +``` + +```{r} +model2 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") +aug2 <- augment(model2) +aug2 %>% + ggplot(aes(jitter(al),rt)) + + geom_point(alpha=0.2,shape=18) + + xlab("accuracy") + + geom_smooth(aes(y=.fitted), color='blue') + + geom_smooth(aes(y=aug1$.fitted), color='red') + +``` + +```{r models} + +nb_split <- initial_split(NB2, prop = 0.75) +training_data <- training(nb_split) +testing_data <- testing(nb_split) +cv_split <- vfold_cv(training_data, v = 5) +cv_data <- cv_split %>% + mutate( + train = map(splits, ~training(.x)), + validate = map(splits, ~testing(.x)) + ) + +cv_models_lm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_glm_a <- cv_data %>% + mutate(model = map(train, ~lm(formula = a~., data = .x)), + tidied = map(model, tidy), + glanced = map(model, glance), + augment = map(model, augment)) + +cv_models_pls_a <- cv_data %>% + mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), + best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% + mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) + ) + +head(cv_models_pls_a) + + +cv_models_pls_a1 <- cv_data[3][[1]] + + +NBx <- NB %>% + group_by(participant) %>% + summarise(freq = as.data.frame(table(stimulus))) + +ggplot(NBx$freq, aes(, group=participant)) + + geom_point(se = F) + + +#%>% +# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) + #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% + #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + +# Compile cross-validation settings + +#any(is.na(NB2)) +#NB2 <- na.omit(NB2) + +# set.seed(100) +# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) +# +# # PLS +# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], +# method = "pls", +# metric = "Accuracy", +# tuneLength = 20, +# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), +# preProc = c("zv","center","scale")) +# +# # Check CV +# plot(mod1) + + +#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) +#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) + +``` diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd new file mode 100644 index 0000000..ba1b312 --- /dev/null +++ b/ccn2019/ccn2019.rev3.Rmd @@ -0,0 +1,100 @@ + +$P=\langle V,D,C,W \rangle$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +#! =============================================== +#! load required packages + +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) +library(inspectdf) +library(caTools) +library(pROC) + +#! =============================================== +#! load data set and set running window size +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +``` + + +```{r preprocessing} + +#! =============================================== +#! 
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations +#DEBUG inspect_cor(seqs, show_plot = T) + + +``` + +```{r remove_highly_correlated_predictors} +# WIP: This is an extra step for non-pls methods to remove highly correlated predictors +cor_matrix <- cor(seqs[,-1]) +cor_high <- findCorrelation(cor_matrix, 0.8) +high_cor_remove <- row.names(cor_matrix)[cor_high] +#FIXME remove by column name +seqs.uncorr <- seqs %>% select(-high_cor_remove) +``` diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R new file mode 100644 index 0000000..7da6569 --- /dev/null +++ b/ccn2019/ccn2019_diagrams.R @@ -0,0 +1,51 @@ +NB_avg %>% + mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% + ggplot(aes(targets, accuracy, color=factor(cluster))) + + ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + + +NB_avg %>% + ggplot(aes(lures, accuracy, color=condition)) + + ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(skewness, accuracy, color=condition)) + + ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, accuracy, color=condition)) + + ggtitle("lumpiness", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +NB_avg %>% + ggplot(aes(lumpiness, rts, color=condition)) + + ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + + xlab("lumpiness") + + ylab("Average RT") + + geom_point(alpha=0.3) + + geom_smooth(method='lm', se = F) + +nback <- NB_modified + +nback %>% + mutate(block=as.factor(block)) %>% + mutate(trial=as.factor(trial)) %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + #filter(condition=='3-back') %>% + #mutate(correct=as.numeric(correct)) %>% + inspect_cor(show_plot = T) + +averaged_nback <- NB_avg + +averaged_nback %>% + mutate(condition=ifelse(condition=='2-back',2,3)) %>% + inspect_cor(show_plot = T) + diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R new file mode 100644 index 0000000..9bbe23a --- /dev/null +++ b/ccn2019/dummy-vars-playground.R @@ -0,0 +1,111 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +# seqs %>% +# ggplot(aes(x=v,y=a,col=correct)) + +# geom_jitter() + +# geom_point(alpha=0.1) + +# geom_smooth() + +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus +f <- correct ~ n + t + v + stimulus + +set.seed(654321) + +# 1. 
dummy vars +# INPUTS : seqs +# OUTPUTS: seqs.dmy + +seqs <- seqs %>% + drop_na(rt, correct, tl,sl) + + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +# ROC for each var +filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) + +# model <- cv.glmnet(seqs.train.x, +# seqs.train.y, +# alpha = 1, +# nfolds = 5, +# family = "binomial", +# type.measure = "auc") +# +# model$lambda.min + +ctrl <- trainControl(method="cv", + number=1, + classProbs=T, + verbose = T, +# sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +# glmnet tune +tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) + +max_components <- n_distinct(attr(terms(f),"term.labels")) +# pls tune +tune <- expand.grid(ncomp=1:max_components) + +model <- train(seqs.train.x, + seqs.train.y, + method = "glmnet", + #family = "binomial", + #metric = "ROC", + preProc = c("nzv","center", "scale"), + #verboseIter = TRUE, + tuneLength = 2, + #tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "auc", + boot.n = 100, + ci = T) + + + diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd new file mode 100644 index 0000000..c566656 --- /dev/null +++ b/ccn2019/pls_playground.Rmd @@ -0,0 +1,104 @@ +--- +title: "PLS Training" +output: html_notebook +editor_options: + chunk_output_type: console +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. clean data (remove brand and URLID) +data <- data %>% + mutate(n=ifelse(condition=='2-back', 2, 3)) %>% + select(-condition, + -stimulus, + -block, + -trial) +# %>% +# rename( +# ev.participant=participant, +# ev.n=n, +# ev.block=block, +# ev.stimulus_type=stimulus_type, +# rv.choice=choice, +# rv.rt=rt, +# rv.correct=correct +# ) + +## 3. use cross validatation to find the optimal number of dimensions +pls.model = plsr(rt ~ ., data = data, validation = "CV") + +## 3.1. find the model with lowest cv error + +best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 + +## 4. rebuild the model +pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) + +## 5. 
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019_diagrams.R b/ccn2019_diagrams.R deleted file mode 100644 index 7da6569..0000000 --- a/ccn2019_diagrams.R +++ /dev/null @@ -1,51 +0,0 @@ -NB_avg %>% - mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% - ggplot(aes(targets, accuracy, color=factor(cluster))) + - ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - - -NB_avg %>% - ggplot(aes(lures, accuracy, color=condition)) + - ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(skewness, accuracy, color=condition)) + - ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, accuracy, color=condition)) + - ggtitle("lumpiness", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, rts, color=condition)) + - ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + - xlab("lumpiness") + - ylab("Average RT") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -nback <- NB_modified - -nback %>% - mutate(block=as.factor(block)) %>% - mutate(trial=as.factor(trial)) %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - #filter(condition=='3-back') %>% - #mutate(correct=as.numeric(correct)) %>% - inspect_cor(show_plot = T) - -averaged_nback <- NB_avg - -averaged_nback %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - inspect_cor(show_plot = T) - diff --git a/dummy-vars-playground.R b/dummy-vars-playground.R deleted file mode 100644 index d79b629..0000000 --- a/dummy-vars-playground.R +++ /dev/null @@ -1,114 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -# seqs %>% -# ggplot(aes(x=v,y=a,col=correct)) + -# geom_jitter() + -# geom_point(alpha=0.1) + -# geom_smooth() - -f <- 
correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -# 1. dummy vars -# INPUTS : seqs -# OUTPUTS: seqs.dmy - -seqs <- seqs %>% - drop_na(rt, correct, tl,sl) - - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced -# seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -# ROC for each var -filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) - -# model <- cv.glmnet(seqs.train.x, -# seqs.train.y, -# alpha = 1, -# nfolds = 5, -# family = "binomial", -# type.measure = "auc") -# -# model$lambda.min - -ctrl <- trainControl(method="cv", - number=5, - classProbs=T, - verbose = T, -# sampling = "up", - savePredictions = T, - summaryFunction=twoClassSummary) - -# glmnet tune -tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) - -max_components <- n_distinct(attr(terms(f),"term.labels")) -# pls tune -tune <- expand.grid(ncomp=1:max_components) - -model <- train(seqs.train.x, - seqs.train.y, - method = "pls", - family = "binomial", - metric = "ROC", - preProc = c("center", "scale"), - verboseIter = TRUE, - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/ccn2019-accuracy.R b/ccn2019-accuracy.R deleted file mode 100644 index adfb442..0000000 --- a/ccn2019-accuracy.R +++ /dev/null @@ -1,108 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T -) - -pls.new_model <- train( - a ~ .-al-dp-cr-rt-correct, - data = train_data.imbalanced, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - -plot(varImp(pls.new_model), main="Variable Importance for Accuracy") - - -pls.old_model <- train( - a ~ t + n + v, - data = train_data, - method = "pls", - preProcess = c("center","scale"), - trControl = control -) - - 
-pls.old_model -pls.new_model -plot(varImp(pls.old_model)) - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.old_model, pch = "|") - -resamps <- resamples(list(old = pls.old_model, new = pls.new_model)) -summary(resamps) -dotplot(resamps, metric = "Rsquared") -difValues <- diff(resamps) -bwplot(difValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.old_predicted <- predict(pls.old_model, test_data, type="raw") - - -summary(pls.new_model) - - -# SSE and RMSE - -SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors -SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.new_predicted)^2) -RMSE <- sqrt(SSE/length(pls.new_predicted)) - - -SSE <- sum((test_data$a - pls.old_predicted)^2) -R_square <- 1 - SSE/SST -SSE <- sum((test_data$a - pls.old_predicted)^2) -RMSE <- sqrt(SSE/length(pls.old_predicted)) - - -as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=lm,se=F) + - ggtitle("Accuracy: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-correct.R b/ccn2019-correct.R deleted file mode 100644 index 9b9d9fb..0000000 --- a/ccn2019-correct.R +++ /dev/null @@ -1,148 +0,0 @@ -#==================================================# -# model the "correct" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(pls) - -#devtools::install_github("sachsmc/plotROC") -library(plotROC) - - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -inspect_cat(seqs.imputed) -inspect_num(seqs.imputed) - -seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) - -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - classProbs = T, - verboseIter = T, - savePredictions = T, - sampling = "down", - selectionFunction = "oneSE" -) - -pls.new_model <- train( - correct ~ .-a-al-dp-cr-rt, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") - -pls.common_model <- train( - correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, - data = train_data, - method = "pls", - metric = "Accuracy", - tuneLength = 20, - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.common_model) -plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") - - -#trellis.par.set(caretTheme()) -#densityplot(pls.new_model, pch = "|") -#densityplot(pls.common_model, pch = "|") - -# Compile models and compare performance -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -#DEBUG summary(pls.models) -#DEBUG 
dotplot(pls.models) -#DEBUG diffValues <- diff(resamps) -bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") - - -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") -pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") - -confusionMatrix(pls.new_predicted, test_data$correct) -confusionMatrix(pls.common_predicted, test_data$correct) - -library(pROC) -par(pty="s") - -roc(test_data$correct, - pls.common_predicted_prob$CORRECT, - plot = T, - legacy.axes=T, - lwd=2, - col="darkgrey", - lty = 3, - print.auc = T, - print.auc.y = 45, - print.auc.x = 55, - percent = T, - ci = T, - boot.n = 100 - ) - - - -# roc_test_indices <- createDataPartition(test_data$correct, -# times = 10, -# p = 0.9, -# list = F) - -#for (i in 1:ncol(roc_test_indices)) { -# test_sample_correct <- test_data[roc_test_indices[,i],]$correct -# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT - -# plot.roc(test_sample_correct, -# predprob_sample_correct, -roc(test_data$correct, - pls.new_predicted_prob$CORRECT, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - add=T, - of = "se", - boot.n = 100, - ci = T) - -#} - -legend(100,100, legend=c("New Model", "Common Model"), - col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) - -# requires plotROC package -#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + -#DEBUG geom_roc() diff --git a/ccn2019-criterion.R b/ccn2019-criterion.R deleted file mode 100644 index 5bcefba..0000000 --- a/ccn2019-criterion.R +++ /dev/null @@ -1,119 +0,0 @@ -#==================================================# -# model the "accuract" column (a for global, and al for local accuracy) - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - cr ~ .-a-al-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) - -ggplot(varImp(pls.new_model)) + - labs(title="Criterion - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = 
"pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.new_predicted)) -# -# -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.common_predicted)^2) -# RMSE <- sqrt(SSE/length(pls.common_predicted)) -# - -as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% - ggplot(aes(predicted, observed)) + - coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + - geom_point(alpha = 0.1,shape=16) + - geom_smooth(method=glm) + - ggtitle("Criterion: Predicted vs Actual") + - xlab("Predecited") + - ylab("Observed") diff --git a/ccn2019-elasticnet.R b/ccn2019-elasticnet.R deleted file mode 100644 index 7eea931..0000000 --- a/ccn2019-elasticnet.R +++ /dev/null @@ -1,52 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -set.seed(10) - -ctrl <- trainControl( - method = "cv", - number = 10, - classProbs = T, - summaryFunction = twoClassSummary -) - -grid <- expand.grid( - alpha = seq(0,1,length=10), - lambda = seq(0.0001, 0.1, length=10) -) - -model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - preProcess = c("nzv","center","scale"), # or c("knnImpute"), - tuneGrid = grid, - trControl = ctrl) - -model -plot(varImp(model, useModel = F)) -plot(model) -max(model$results$ROC) diff --git a/ccn2019-featsel.R b/ccn2019-featsel.R deleted file mode 100644 index ae5dc98..0000000 --- a/ccn2019-featsel.R +++ /dev/null @@ -1,38 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) -seqs <- seqs %>% drop_na(rt, correct, tl,sl) - -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -train.indices <- 
createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -ctrl <- rfeControl(functions = lrFuncs, - method = "cv", - number = 10, - verbose = T) - -rmProfile <- rfe(seqs.train.x, seqs.train.y, - preProcess = c("nzv"), - rfeControl = ctrl) - -rmProfile diff --git a/ccn2019-rt.R b/ccn2019-rt.R deleted file mode 100644 index 4f6defb..0000000 --- a/ccn2019-rt.R +++ /dev/null @@ -1,117 +0,0 @@ -#==================================================# -# model the "RT" column - -library(here) -library(tidyverse) -library(caret) -library(inspectdf) -library(skimr) -library(ROSE) - -load(here("notebooks/data/nback_seqs.Rd")) - -set.seed(42) - -seqs.imputed <- seqs %>% - filter(!is.na(correct), !is.na(rt)) %>% - mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) - -#DEBUG inspect_num(seqs.imputed) - -#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) - - -#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] - -train_indexes <- createDataPartition(seqs.imputed$correct, - times = 1, - p = 0.7, - list = F) -train_data <- seqs.imputed[train_indexes,] -test_data <- seqs.imputed[-train_indexes,] - -train_data.imbalanced <- ROSE(correct ~ ., - data = train_data, - seed = 1)$data - -# VIsualize split -train_data.imbalanced$grp <- "train" -test_data$grp <- "test" - -rbind(train_data.imbalanced, test_data) %>% - ggplot(aes(x=correct, fill=grp)) + - geom_histogram(stat="count", position='dodge') + - labs(title="Imbalanced Split") - -control <- trainControl( - method = "repeatedcv", - number = 5, - repeats = 2, - verboseIter = T, - savePredictions = T -) - -train_data <- train_data.imbalanced %>% select(-grp) - -pls.new_model <- train( - rt ~ .-a-al-cr-dp-rt-correct, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -plot(pls.new_model) -summary(pls.new_model) -plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") - -pls.common_model <- train( - cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, - data = train_data, - method = "pls", - preProcess = c("zv","center","scale"), - trControl = control -) - -summary(pls.common_model) -plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") - -trellis.par.set(caretTheme()) -densityplot(pls.new_model, pch = "|") -densityplot(pls.common_model, pch = "|") - -pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) -summary(pls.models) -dotplot(pls.models, metric = "Rsquared") -diffValues <- diff(pls.models) -bwplot(diffValues, layout=c(1,3)) - - -pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") -pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") -pls.new_predicted <- predict(pls.new_model, test_data, type="raw") -pls.common_predicted <- predict(pls.common_model, test_data, type="raw") - -# SSE and RMSE -# -# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors -# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here -# R_square <- 1 - SSE/SST -# SSE <- sum((test_data$cr - pls.new_predicted)^2) -# RMSE <- 
sqrt(SSE/length(pls.new_predicted))
-#
-#
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# R_square <- 1 - SSE/SST
-# SSE <- sum((test_data$cr - pls.common_predicted)^2)
-# RMSE <- sqrt(SSE/length(pls.common_predicted))
-#
-
-as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>%
-  ggplot(aes(predicted, observed)) +
-  coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) +
-  geom_point(alpha = 0.1,shape=16) +
-  geom_smooth(method=glm) +
-  ggtitle("Criterion: Predicted vs Actual") +
-  xlab("Predicted") +
-  ylab("Observed")
diff --git a/ccn2019-svm.R b/ccn2019-svm.R
deleted file mode 100644
index a1590a5..0000000
--- a/ccn2019-svm.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(tidyverse)
-library(caret)
-library(here)
-library(inspectdf)
-library(glmnet)
-library(ROSE)
-
-rm(seqs)
-load(here("notebooks/data/nback_seqs.Rd"))
-
-f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll
-
-set.seed(654321)
-
-seqs <- seqs %>% drop_na(rt, correct, tl, sl)
-
-train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
-
-seqs.train.balanced <- seqs[train.indices,]
-seqs.train <- seqs.train.balanced
-
-seqs.train.x <- model.matrix(f, seqs.train)[,-1]
-seqs.train.y <- seqs.train[[toString(f[[2]])]]
-
-seqs.test <- seqs[-train.indices,]
-seqs.test.x <- model.matrix(f, seqs.test)[,-1]
-seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
-
-ctrl <- trainControl(method="cv",
-                     number=10,
-#                    repeats = 1, #repeatedcv
-                     sampling = "up",
-                     savePredictions = T,
-                     verbose = T)
-
-tune <- expand.grid(C = seq(0.25,5,by=0.25)) # the SVM cost must be positive, so the grid starts at 0.25 rather than 0
-
-model <- train(seqs.train.x,
-               seqs.train.y,
-               method = "svmLinear",
-               preProc = c("center", "scale"),
-               tuneLength = 10,
-               tuneGrid = tune,
-               trControl = ctrl)
-
-model$bestTune
-plot(model)
-
-seqs.test.y <- model %>% predict(seqs.test.x)
-seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
-
-confusionMatrix(seqs.test.y, seqs.test.observed_y)
-
-plot(varImp(model, scale = F, useModel = F))
-
-library(pROC)
-
-roc(seqs.test.observed_y,
-    seqs.test.y_prob$YES,
-    legacy.axes=T,
-    plot = T,
-    lwd=2,
-    col="black",
-    print.auc=T,
-    percent = T,
-    print.auc.y = 40,
-    print.auc.x = 55,
-    lty = 1,
-    of = "se",
-    boot.n = 200,
-    ci = T)
-
-
-# RT
-# data.frame(
-#   RMSE = RMSE(y.test, seqs.test$correct),
-#   Rsquare = R2(y.test, seqs.test$correct)
-# )
-
diff --git a/ccn2019.rev0.Rmd b/ccn2019.rev0.Rmd
deleted file mode 100644
index 9220ff4..0000000
--- a/ccn2019.rev0.Rmd
+++ /dev/null
@@ -1,581 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: inline
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
-
-# Gaps
-
-- Unclear how to parameterize interesting variations for sequence generation
-- How do we model these multiple strategies (which requires identifying which sequence variations matter)
-  - local vs. global properties: which one matters the most?
- - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(plsRglm) -library(plsdof) -library(caret) -``` - -```{r params} -load('./data/CL2015.RData') - -window_size <- 8 -``` - - - -```{r history} - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -with_history <- function(stimuli, length=16, fixed=F) { - seq <- paste(stimuli, collapse = '') - - sapply(1:length(stimuli), function(i) { - stri_reverse(str_sub(seq, max(1,i-length+1), i)) - }) - #ifelse(fixed, h[str_length(h)==size], h) -} - -# $x_{s,local}$ -with_skewness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - sum(sort(freqs, decreasing = T)[1:2]) - 1 - }) -} - -# $x_{u,local}$ -with_lumpiness <- function(history) { - sapply(history, function(h) { - freqs <- table(unlist(str_split(h,""))) - max(freqs) - 1 - }) -} - - -with_targets_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="target"]) / length(trials) - }) -} - -with_lures_ratio <- function(stimulus_type, length=16) { - sapply(1:length(stimulus_type), function(i) { - trials <- stimulus_type[max(1,i-length):i] - length(trials[trials=="lure"]) / length(trials) - }) -} - -#TODO change to list column workflow with broom for model fitting and evaluating the fits -# duh! we are using list nested insided a tibble, so put all new columns in a new list column -# instead of adding a new column for each. -NB2 <- NB %>% - group_by(participant, condition, block) %>% - nest() %>% unnest(data) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(history = with_history(stimulus, window_size)) %>% - mutate(x_sl = with_skewness(history)) %>% - mutate(x_ul = with_lumpiness(history)) %>% - mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>% - mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>% - ungroup() - -pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. 
= TRUE, na.action=na.exclude) -NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# caret -library(caret) -# Compile cross-validation settings - - -any(is.na(NB2)) -NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - - -plsResult -``` - - - ---- -title: "Evaluating N-Back Sequences" ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE) -library(tidyverse) -library(ggplot2) -library(stringi) -library(GA) -library(dbscan) -library(inspectdf) - -load('./data/CL2015.RData') -``` - -### Variables -- $T$ number of targets -- $L$ number of lures -- $S$ Skewness score -- $U$ Uniformity (!repetition) -- $RT_{mean}$ -- $Accuracy_{mean}$ -- $dprime$ -- $criterion$ - -## Constraints - -- fixed number of targets -- fixed number of lures (a.k.a, foils) -- uniform distribution of choices -- controlled local lumpiness - - -Each constraint is an up side down quadratic function to be minimized. - -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- 
function(stimulus, history) {
-  # find the last occurrence of the stimulus
-}
-
-with_skewness_score <- function(stimulus, history) {
-  sapply(1:length(history), function(i) {
-    trials <- stimulus[(i-str_length(history[i])):i]
-    trials <- unlist(trials, use.names=FALSE)
-    sum(sort(table(trials), decreasing = T)[1:2]) - 1
-  })
-}
-
-with_history <- function(stims, size=16) {
-  res <- c('')
-  for (i in 2:length(stims)) {
-    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
-  }
-  #res <- ifelse(stri_length(res)==size, res, NA)
-  res
-}
-
-normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
-  #TODO
-  sapply(1:length(targets_ratio), function(i) 0)
-}
-
-window_size <- 8
-
-NB_modified <- NB %>%
-  group_by(participant, condition, block) %>%
-  mutate(history = with_history(stimulus, size=window_size)) %>%
-  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
-  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
-  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
-  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
-  mutate(skewness = with_skewness_score(stimulus, history)) %>%
-  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
-  filter(stri_length(history)==window_size) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: the shorthand predictors (t, s, u, l, d) are not columns of NB_modified; the computed
-# columns are used instead, and d (d-prime) is omitted because it is never computed here
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-
geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,pc2,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,rt,color=correct)) + - geom_point(alpha=0.3) + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - -NB_modified %>% - filter(!is.na(correct)) %>% - ggplot(aes(pc1,correct,color=correct)) + - geom_point() + - geom_smooth(method="lm",se = F) + - facet_wrap(~condition, scales="free") - - -``` - -## TODO - - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), - - constraint3=fitness(history)) - - kmeans(NB) - - ggplot(kmeans_clusters$accuracy) - - ggplot(kmeans_clusters$rt) - - - - - - -```{python} -a=2 -p -``` - - - ---- -title: "PLS Training" -output: html_notebook ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients
-coefs <- coef(pls.model)
-
-barplot(sort(coefs[,1,1], decreasing = T)[1:4])
-```
-
-
-```{r simulate}
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-pls.model <- plsr(Y ~ X, validation = "CV")
-
-cv <- RMSEP(pls.model)
-best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
-pls.model <- plsr(Y ~ X, ncomp = best_dims)
-coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
-
-barplot(coefs)
-
-```
-
-
-```{r cca-simulate}
-library(corrplot) # needed for corrplot() below
-
-X <- matrix(rnorm(1100), 100, 11)
-Y <- matrix(rnorm(400), 100, 4)
-
-M <- cor(cbind(X,Y))
-corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
-cc <- cancor(X, Y)
-
-#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
-
-```
-
-
-```{r plsrglm}
-rm(list = ls())
-library(plsRglm)
-
-data(Cornell)
-df <- Cornell
-x <- subset(df, select = -c(Y))
-y <- df$Y
-## K is the number of folds in CV, and nt is the maximum number of components
-cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
-
-modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE)
-res.cv.modpls <- cvtable(summary(cv.modpls))
-
-res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
-
-```
-
-
-
diff --git a/ccn2019.rev1.Rmd b/ccn2019.rev1.Rmd
deleted file mode 100644
index 9074227..0000000
--- a/ccn2019.rev1.Rmd
+++ /dev/null
@@ -1,281 +0,0 @@
----
-title: "Evaluating N-Back Sequences"
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE)
-library(tidyverse)
-library(ggplot2)
-library(stringi)
-library(GA)
-library(dbscan)
-library(inspectdf)
-
-load('./data/CL2015.RData')
-```
-
-### Variables
-- $T$ number of targets
-- $L$ number of lures
-- $S$ Skewness score
-- $U$ Uniformity (i.e., non-repetition)
-- $RT_{mean}$
-- $Accuracy_{mean}$
-- $dprime$
-- $criterion$
-
-## Constraints
-
-- fixed number of targets
-- fixed number of lures (a.k.a., foils)
-- uniform distribution of choices
-- controlled local lumpiness
-
-
-Each constraint is an upside-down quadratic function to be minimized.
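
As a sketch of the cost shape (the notation here is illustrative; the constants come from the `dnorm` call in the chunk below): for a sequence statistic $x$ with target value $\mu$, each soft constraint is scored as

$$
C(x) = 1 - 10\,\phi(x - \mu;\ \sigma = 4),
\qquad
\phi(z; \sigma) = \frac{1}{\sigma\sqrt{2\pi}}\, e^{-z^2 / 2\sigma^2},
$$

an inverted Gaussian rather than a literal quadratic: it attains its minimum ($\approx 0.003$) at $x = \mu$ and approaches $1$ as $x$ moves away, so minimizing it pulls the statistic toward its target, which is what the `targets_cost` and `lures_cost` functions below implement.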
- -```{r, eval=F} -targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) -skewness_cost <- function(x, choices) { - uniform_ratio <- length(x) / length(choices) - deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) - for (c in choices) { - deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) - } - #TODO convert to gaussian loss - max(deviation_from_uniform) -} - -lumpiness_cost <- function(x, choices) { - #trials = len(seq) - #freqs = [float(seq.count(c)) for c in choices] - #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) - #return ralph_skewed - NA -} - -#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) -#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) -#plot(GA) -``` - - -```{r} - -with_lures <- function(condition, stim, stim_type, history = NA) { - sapply(1:length(stim), - function(i) { - switch(as.character(condition[i]), - "2-back" = { - ifelse( - stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), - 'lure', - as.character(stim_type[i]) - )}, - "3-back" = { - ifelse( - stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), - 'lure', - as.character(stim_type[i]) - )} - ) - - }) -} - -with_targets_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-stri_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="target"]) - }) -} - -with_lures_ratio <- function(stimulus_type, history) { - sapply(1:length(history), function(i) { - trials <- stimulus_type[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - length(trials[trials=="lure"]) - }) -} - -with_lumpiness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - max(table(trials)) - 1 - }) -} - -with_lag <- function(stimulus, history) { - # find last occurance the of stimulus -} - -with_skewness_score <- function(stimulus, history) { - sapply(1:length(history), function(i) { - trials <- stimulus[(i-str_length(history[i])):i] - trials <- unlist(trials, use.names=FALSE) - sum(sort(table(trials), decreasing = T)[1:2]) - 1 - }) -} - -with_history <- function(stims, size=16) { - res <- c('') - for (i in 2:length(stims)) { - res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) - } - #res <- ifelse(stri_length(res)==size, res, NA) - res -} - -normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { - #TODO - sapply(1:length(targets_ratio), function(i) 0) -} - -window_size <- 8 - -NB_modified <- NB %>% - group_by(participant, condition, block) %>% - mutate(history = with_history(stimulus, size=window_size)) %>% - #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) - mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% - mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% - mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% - mutate(skewness = with_skewness_score(stimulus, history)) %>% - mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% - filter(stri_length(history)==window_size) %>% - mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% - #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>%
-  ungroup()
-
-pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)
-
-NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
-
-
-## participant-level averaged NB; a single row represents an observation for a single subject
-## in a single condition
-NB_avg <- NB_modified %>%
-  group_by(participant, condition) %>%
-  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
-  summarise(
-    targets=sum(targets),
-    lures=sum(lures),
-    skewness=sum(skewness),
-    lumpiness=sum(lumpiness),
-    rt = mean(rt, na.rm=T),
-    correct=sum(correct,na.rm=T)/90) %>%
-  ungroup()
-
-# print
-# NB_modified %>%
-#   filter(participant=='P1') %>%
-#   View()
-#
-
-# NOTE: the shorthand predictors (t, s, u, l, d) are not columns of NB_modified; the computed
-# columns are used instead, and d (d-prime) is omitted because it is never computed here
-fit <- lm(correct ~ targets * skewness * lumpiness * lures, NB_modified)
-
-```
-
-
-```{r}
-
-# DBSCAN Clustering (RT+ACCURACY against skewness)
-NB_avg <- NB_avg %>%
-  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)
-
-NB_avg %>%
-  ggplot(aes(skewness, correct, color=factor(cluster))) +
-  ggtitle("Clusters (window = 8 trials)", "NOTE: each point is a single participant") +
-  geom_point(alpha=0.3) +
-  #geom_smooth(method='lm', se = F) +
-  facet_wrap(~condition)
-
-```
-
-
-```{r}
-## single-subject figures
-NB_modified %>%
-  ggplot(aes(targets,skewness,color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  inspect_cor(show_plot = T)
-
-NB_avg %>%
-  inspect_cor(show_plot = T)
-
-NB_modified %>%
-  ggplot(aes(rt,correct,color=lumpiness)) +
-  geom_jitter() +
-  geom_point() +
-  stat_summary(fun.y="mean")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(targets),color=correct)) +
-  geom_jitter() +
-  geom_point(alpha=0.1)
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(skewness),jitter(lumpiness),color=correct)) +
-  geom_jitter() +
-  geom_point() +
-  facet_wrap(~condition)
-
-# rt/accuracy and lures
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(jitter(lures),rt,color=correct,alpha=0.01)) +
-  geom_jitter() +
-  geom_point(shape=16) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,pc2,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,rt,color=correct)) +
-  geom_point(alpha=0.3) +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-NB_modified %>%
-  filter(!is.na(correct)) %>%
-  ggplot(aes(pc1,correct,color=correct)) +
-  geom_point() +
-  geom_smooth(method="lm",se = F) +
-  facet_wrap(~condition, scales="free")
-
-
-```
-
-## TODO
-  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history),
-    constraint3=fitness(history))
-  - kmeans(NB)
-  - ggplot(kmeans_clusters$accuracy)
-  - ggplot(kmeans_clusters$rt)
-
-
diff --git a/ccn2019.rev2.Rmd b/ccn2019.rev2.Rmd
deleted file mode 100644
index 7e19f45..0000000
--- a/ccn2019.rev2.Rmd
+++ /dev/null
@@ -1,229 +0,0 @@
----
-title: "Statistical Properties of the N-Back Sequences"
-output:
-  html_notebook: default
-  pdf_document: default
-editor_options:
-  chunk_output_type: console
----
-
-# Problems
-
-Statistical properties of n-back sequences bias behaviors. These biases, under a specified structure, allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.
- -# Gaps - -- Unclear how to parameterize interesting variations for sequence generation -- How do we model these multiple strategies (which requires identifying which sequence variations matter) - - local vs. global properties, which one matters the most? - - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - - Global: No lures, large vocabulary -> pattern repeats implies a target - - -## Formulating Generating the N-Back Sequences as a CSP instance - -$P=\langle V,D,C,W\rangle$ - -$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ - -$D=\{\}$ - - -Constraints: - -$$ -\\ - -x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| - -\\\\ - -x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| - -\\\\ - -x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| - -\\\\ - -x_{l} = L \times trials -\\\\ - -x_{ll} = L \times w -\\\\ - -x_{v} = |V| -\\ - -x_{ul} = w -\\\\ - -x_{s} = {trials \over |V|} -\\\\ - -x_{sl} = max(1, {w \over |V|}) -\\\\ - -x_{g} = {trials \over w} - -\\\\ - -x_{vl} = min(|V|, w) -$$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -#library(plsRglm) -#library(plsdof) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) - -``` - -```{r preprocessing} - -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), - ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), - sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), 0, sl), - ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), - vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), - al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) - -View() -inspectdf::inspect_cor(seqs) -#inspect_cor(NB,show_plot = T) -``` - -```{r} -model1 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a,data=., family = "gaussian") -aug1 <- augment(model1) -aug1 %>% - ggplot(aes(a,rt)) + - geom_point() + - geom_smooth(aes(y=.fitted, 
color='red')) -``` - -```{r} -model2 <- NB2 %>% - select(-participant, -stimulus) %>% - glm(rt~t+n+a+al+s+sl+ll+l,data=., family = "gaussian") -aug2 <- augment(model2) -aug2 %>% - ggplot(aes(jitter(al),rt)) + - geom_point(alpha=0.2,shape=18) + - xlab("accuracy") + - geom_smooth(aes(y=.fitted), color='blue') + - geom_smooth(aes(y=aug1$.fitted), color='red') - -``` - -```{r models} - -nb_split <- initial_split(NB2, prop = 0.75) -training_data <- training(nb_split) -testing_data <- testing(nb_split) -cv_split <- vfold_cv(training_data, v = 5) -cv_data <- cv_split %>% - mutate( - train = map(splits, ~training(.x)), - validate = map(splits, ~testing(.x)) - ) - -cv_models_lm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_glm_a <- cv_data %>% - mutate(model = map(train, ~lm(formula = a~., data = .x)), - tidied = map(model, tidy), - glanced = map(model, glance), - augment = map(model, augment)) - -cv_models_pls_a <- cv_data %>% - mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")), - best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>% - mutate(model = map(train, ~plsr(a ~ ., data = .x, ncomp = best_dims)) - ) - -head(cv_models_pls_a) - - -cv_models_pls_a1 <- cv_data[3][[1]] - - -NBx <- NB %>% - group_by(participant) %>% - summarise(freq = as.data.frame(table(stimulus))) - -ggplot(NBx$freq, aes(, group=participant)) + - geom_point(se = F) - - -#%>% -# mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV"))) - #mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>% - #mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) - -# Compile cross-validation settings - -#any(is.na(NB2)) -#NB2 <- na.omit(NB2) - -# set.seed(100) -# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10) -# -# # PLS -# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")], -# method = "pls", -# metric = "Accuracy", -# tuneLength = 20, -# trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"), -# preProc = c("zv","center","scale")) -# -# # Check CV -# plot(mod1) - - -#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3) -#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3) - -``` diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd deleted file mode 100644 index ba1b312..0000000 --- a/ccn2019.rev3.Rmd +++ /dev/null @@ -1,100 +0,0 @@ - -$P=\langle V,D,C,W \rangle$ - -```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} -#! =============================================== -#! load required packages - -library(ggplot2) -library(tidyverse) -library(stringi) -library(pls) -library(caret) -library(here) -library(tsibble) -library(broom) -library(rsample) -library(inspectdf) -library(caTools) -library(pROC) - -#! =============================================== -#! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) -window_size <- 8 - -``` - - -```{r preprocessing} - -#! =============================================== -#! 
A function to mark lures in a sequence -with_lures <- function(stimulus, stimulus_type, n) { - sapply(1:length(stimulus), function(i) { - lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) - are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) - ifelse(are_valid_trials && stimulus[i] %in% lures, - "lure", - as.character(stimulus_type[i])) - }) -} - -#! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al -#! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% - group_by(participant, block, condition) %>% - mutate(n = ifelse(condition=='2-back',2,3)) %>% - mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% - mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), - ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), - ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), - vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), - al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), - sl = ifelse(is.na(sl), NA, sl), - tl = ifelse(is.na(tl), NA, tl), - ll = ifelse(is.na(ll), NA, ll), - al = ifelse(is.na(al), NA, al), - ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) -) %>% - nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% - mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% - mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% - mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% - mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% - mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% - ungroup() %>% - select(-participant,-block,-condition) %>% - unnest(local_stats) %>% - mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% - mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) - -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) - -#! =============================================== -#! 
visualize correlations
-#DEBUG inspect_cor(seqs, show_plot = T)
-
-
-```
-
-```{r remove_highly_correlated_predictors}
-# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
-# correlations are only defined for numeric columns, and seqs may still contain NAs
-cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "pairwise.complete.obs")
-cor_high <- findCorrelation(cor_matrix, 0.8)
-high_cor_remove <- row.names(cor_matrix)[cor_high]
-seqs.uncorr <- seqs %>% select(-one_of(high_cor_remove)) # drop the flagged predictors by column name
-```
diff --git a/ccn2019/ccn2019-accuracy.R b/ccn2019/ccn2019-accuracy.R
new file mode 100644
index 0000000..adfb442
--- /dev/null
+++ b/ccn2019/ccn2019-accuracy.R
@@ -0,0 +1,108 @@
+#==================================================#
+# model the "accuracy" column (a for global, and al for local accuracy)
+
+library(here)
+library(tidyverse)
+library(caret)
+library(inspectdf)
+library(ROSE)
+
+load(here("notebooks/data/nback_seqs.Rd"))
+
+set.seed(42)
+
+seqs.imputed <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT")))
+
+inspect_num(seqs.imputed)
+
+seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed)
+
+
+train_indexes <- createDataPartition(seqs.imputed$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)
+
+train_data <- seqs.imputed[train_indexes,]
+test_data <- seqs.imputed[-train_indexes,]
+
+train_data.imbalanced <- ROSE(correct ~ .,
+                              data = train_data,
+                              seed = 1)$data
+
+control <- trainControl(
+  method = "repeatedcv",
+  number = 5,
+  repeats = 2,
+  verboseIter = T
+)
+
+pls.new_model <- train(
+  a ~ .-al-dp-cr-rt-correct,
+  data = train_data.imbalanced,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+plot(varImp(pls.new_model), main="Variable Importance for Accuracy")
+
+
+pls.old_model <- train(
+  a ~ t + n + v,
+  data = train_data,
+  method = "pls",
+  preProcess = c("center","scale"),
+  trControl = control
+)
+
+
+pls.old_model
+pls.new_model
+plot(varImp(pls.old_model))
+
+trellis.par.set(caretTheme())
+densityplot(pls.new_model, pch = "|")
+densityplot(pls.old_model, pch = "|")
+
+resamps <- resamples(list(old = pls.old_model, new = pls.new_model))
+summary(resamps)
+dotplot(resamps, metric = "Rsquared")
+difValues <- diff(resamps)
+bwplot(difValues, layout=c(1,3))
+
+
+pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw")
+pls.old_train_predicted <- predict(pls.old_model, train_data, type="raw")
+pls.new_predicted <- predict(pls.new_model, test_data, type="raw")
+pls.old_predicted <- predict(pls.old_model, test_data, type="raw")
+
+
+summary(pls.new_model)
+
+
+# SSE and RMSE
+
+SSE <- sum((test_data$a - pls.new_predicted)^2) # sum of squared errors
+SST <- sum((test_data$a - mean(train_data$a))^2) # total sum of squares, remember to use training data here
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.new_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.new_predicted))
+
+
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+R_square <- 1 - SSE/SST
+SSE <- sum((test_data$a - pls.old_predicted)^2)
+RMSE <- sqrt(SSE/length(pls.old_predicted))
+
+
+as.data.frame(cbind(predicted = pls.old_predicted, observed = test_data$a)) %>%
+  ggplot(aes(predicted, observed)) +
+  coord_cartesian(xlim = c(20, 30), ylim = c(20, 30)) +
+  geom_point(alpha = 0.1,shape=16) +
+  geom_smooth(method=lm,se=F) +
+  ggtitle("Accuracy: Predicted vs Actual") +
+  xlab("Predicted") +
+  ylab("Observed")
diff --git a/ccn2019/ccn2019-correct.R b/ccn2019/ccn2019-correct.R
new file mode 100644
index 0000000..9b9d9fb
--- /dev/null
+++ b/ccn2019/ccn2019-correct.R
@@ -0,0
+1,148 @@ +#==================================================# +# model the "correct" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(pls) + +#devtools::install_github("sachsmc/plotROC") +library(plotROC) + + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +inspect_cat(seqs.imputed) +inspect_num(seqs.imputed) + +seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) + +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + classProbs = T, + verboseIter = T, + savePredictions = T, + sampling = "down", + selectionFunction = "oneSE" +) + +pls.new_model <- train( + correct ~ .-a-al-dp-cr-rt, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +plot(varImp(pls.new_model), main="Variables Importance for Correctness (New Model)") + +pls.common_model <- train( + correct ~ .-a-al-dp-cr-rt-tl-ul-sl-s-ll-vl-l, + data = train_data, + method = "pls", + metric = "Accuracy", + tuneLength = 20, + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.common_model) +plot(varImp(pls.common_model), main="Variable Importance for Correctness (Common Model)") + + +#trellis.par.set(caretTheme()) +#densityplot(pls.new_model, pch = "|") +#densityplot(pls.common_model, pch = "|") + +# Compile models and compare performance +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +#DEBUG summary(pls.models) +#DEBUG dotplot(pls.models) +#DEBUG diffValues <- diff(resamps) +bwplot(pls.models, metric = "Accuracy", layout=c(1,1), main="Correctness Model Performance") + + +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.new_predicted_prob <- predict(pls.new_model, test_data, type="prob") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") +pls.common_predicted_prob <- predict(pls.common_model, test_data, type="prob") + +confusionMatrix(pls.new_predicted, test_data$correct) +confusionMatrix(pls.common_predicted, test_data$correct) + +library(pROC) +par(pty="s") + +roc(test_data$correct, + pls.common_predicted_prob$CORRECT, + plot = T, + legacy.axes=T, + lwd=2, + col="darkgrey", + lty = 3, + print.auc = T, + print.auc.y = 45, + print.auc.x = 55, + percent = T, + ci = T, + boot.n = 100 + ) + + + +# roc_test_indices <- createDataPartition(test_data$correct, +# times = 10, +# p = 0.9, +# list = F) + +#for (i in 1:ncol(roc_test_indices)) { +# test_sample_correct <- test_data[roc_test_indices[,i],]$correct +# predprob_sample_correct <- pls.new_predicted_prob[roc_test_indices[,i],]$CORRECT + +# plot.roc(test_sample_correct, +# predprob_sample_correct, +roc(test_data$correct, + pls.new_predicted_prob$CORRECT, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + add=T, + of = "se", + boot.n = 100, + ci = T) + +#} + +legend(100,100, legend=c("New Model", "Common Model"), + col=c("black", "darkgray"), lty=c(1,1),lwd=2, cex=0.9) + +# requires plotROC package +#DEBUG ggplot(pls.common_model, aes(d = pred$obs, m = pred$CORRECT)) + +#DEBUG 
geom_roc() diff --git a/ccn2019/ccn2019-criterion.R b/ccn2019/ccn2019-criterion.R new file mode 100644 index 0000000..5bcefba --- /dev/null +++ b/ccn2019/ccn2019-criterion.R @@ -0,0 +1,119 @@ +#==================================================# +# model the "accuract" column (a for global, and al for local accuracy) + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + cr ~ .-a-al-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) + +ggplot(varImp(pls.new_model)) + + labs(title="Criterion - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + 
xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R new file mode 100644 index 0000000..421a43f --- /dev/null +++ b/ccn2019/ccn2019-feature-selection.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +#f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +#if (toString(f[[2]]) == "correct") +# seqs.train <- ROSE(f, data = seqs.train.balanced)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +#upSamples <- upSample(seqs.train.x, seqs.train[["stimulus_type"]]) + + +# upSamples <- upSample(seqs.train.x, seqs.train[,"stimulus_type"]) +# seqs.train.x <- upSamples %>% dplyr::select(-correct) + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +ctrl <- trainControl(method="cv", + number = 3, + verbose = T) + +model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) + +rmProfile <- rfe(seqs.train.x, seqs.train.y, + rfeControl = ctrl) + +summary(model) +rmProfile diff --git a/ccn2019/ccn2019-penalized-auc.R b/ccn2019/ccn2019-penalized-auc.R new file mode 100644 index 0000000..f757596 --- /dev/null +++ b/ccn2019/ccn2019-penalized-auc.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) +library(ppls) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- correct ~ n + t + stimulus_type +f <- correct ~ n + tl + vl + sl + s + stimulus_type + +# predictors selected with stepAIC +f <- correct ~ n + tl + t + l + stimulus_type +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(f, data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + + +set.seed(10) + +ctrl <- trainControl( + method = "cv", + number = 10, + classProbs = T, + summaryFunction = twoClassSummary, + sampling = "down" +) + +grid <- expand.grid( + alpha = seq(0,1,length=10), + lambda = seq(0.0001, 0.1, length=10) +) + +model <- train(seqs.train.x, seqs.train.y, + method = "glmnet", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + tuneGrid = grid, + metric = "ROC", + trControl = ctrl) + +seqs.test.y_prob <- predict(model, seqs.test.x, type="prob") + +model +plot(varImp(model, useModel = F)) +plot(model) +max(model$results$ROC) + + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + 
print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 2000, + ci = T) + + + +# PPLS +#penalized.pls.cv(seqs.train.y, seqs.train.x, kernel = T, scale=T) + diff --git a/ccn2019/ccn2019-penalized-rt.R b/ccn2019/ccn2019-penalized-rt.R new file mode 100644 index 0000000..73e2fd3 --- /dev/null +++ b/ccn2019/ccn2019-penalized-rt.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) +library(pROC) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) +seqs <- seqs %>% drop_na(rt, correct, tl,sl) + +f <- rt ~ n + t + v + +f <- rt ~ n + tl + v + s + l + +set.seed(654321) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +#seqs.train <- ROSE(correct~., data = seqs.train.balanced, N=100)$data + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl( + method = "cv", + number = 10 +) + +grid <- expand.grid( + ncomp = 1:3 +) + +model <- train(seqs.train.x, seqs.train.y, + method = "pls", + preProcess = c("nzv","center","scale"), # or c("knnImpute"), + #tuneGrid = grid, + tuneLength = 10, + metric = "RMSE", + trControl = ctrl) + +seqs.test.predicted_y <- predict(model, seqs.test.x) + +model +plot(varImp(model, useModel = F, scale=F)) +plot(model) +max(model$results$RMSE) + +# RT +data.frame( + RMSE = RMSE(seqs.test.predicted_y, seqs.test.observed_y), + Rsquare = R2(seqs.test.predicted_y, seqs.test.observed_y) +) diff --git a/ccn2019/ccn2019-rt.R b/ccn2019/ccn2019-rt.R new file mode 100644 index 0000000..4f6defb --- /dev/null +++ b/ccn2019/ccn2019-rt.R @@ -0,0 +1,117 @@ +#==================================================# +# model the "RT" column + +library(here) +library(tidyverse) +library(caret) +library(inspectdf) +library(skimr) +library(ROSE) + +load(here("notebooks/data/nback_seqs.Rd")) + +set.seed(42) + +seqs.imputed <- seqs %>% + filter(!is.na(correct), !is.na(rt)) %>% + mutate(correct=factor(correct,labels=c("INCORRECT","CORRECT"))) + +#DEBUG inspect_num(seqs.imputed) + +#seqs.dummy <- predict(dummyVars(~.,data=seqs.imputed),seqs.imputed) + + +#DEBUG train_indexes <- createResample(seqs.imputed$cr,list=F)[,1] + +train_indexes <- createDataPartition(seqs.imputed$correct, + times = 1, + p = 0.7, + list = F) +train_data <- seqs.imputed[train_indexes,] +test_data <- seqs.imputed[-train_indexes,] + +train_data.imbalanced <- ROSE(correct ~ ., + data = train_data, + seed = 1)$data + +# VIsualize split +train_data.imbalanced$grp <- "train" +test_data$grp <- "test" + +rbind(train_data.imbalanced, test_data) %>% + ggplot(aes(x=correct, fill=grp)) + + geom_histogram(stat="count", position='dodge') + + labs(title="Imbalanced Split") + +control <- trainControl( + method = "repeatedcv", + number = 5, + repeats = 2, + verboseIter = T, + savePredictions = T +) + +train_data <- train_data.imbalanced %>% select(-grp) + +pls.new_model <- train( + rt ~ .-a-al-cr-dp-rt-correct, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + trControl = control +) + +plot(pls.new_model) +summary(pls.new_model) +plot(varImp(pls.new_model), main="Reaction Time - Variable Importance") + +pls.common_model <- train( + cr ~ .-a-al-dp-cr-rt-correct-tl-l-ll-s-sl-ul-vl, + data = train_data, + method = "pls", + preProcess = c("zv","center","scale"), + 
trControl = control +) + +summary(pls.common_model) +plot(varImp(pls.common_model), main="Criterion - Variable Importance (Common Model)") + +trellis.par.set(caretTheme()) +densityplot(pls.new_model, pch = "|") +densityplot(pls.common_model, pch = "|") + +pls.models <- resamples(list(new = pls.new_model, common = pls.common_model)) +summary(pls.models) +dotplot(pls.models, metric = "Rsquared") +diffValues <- diff(pls.models) +bwplot(diffValues, layout=c(1,3)) + + +pls.new_train_predicted <- predict(pls.new_model, train_data, type="raw") +pls.common_train_predicted <- predict(pls.common_model, train_data, type="raw") +pls.new_predicted <- predict(pls.new_model, test_data, type="raw") +pls.common_predicted <- predict(pls.common_model, test_data, type="raw") + +# SSE and RMSE +# +# SSE <- sum((test_data$cr - pls.new_predicted)^2) # sum of squared errors +# SST <- sum((test_data$cr - mean(train_data$cr))^2) # total sum of squares, remember to use training data here +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.new_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.new_predicted)) +# +# +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# R_square <- 1 - SSE/SST +# SSE <- sum((test_data$cr - pls.common_predicted)^2) +# RMSE <- sqrt(SSE/length(pls.common_predicted)) +# + +as.data.frame(cbind(predicted = pls.common_predicted, observed = test_data$cr)) %>% + ggplot(aes(predicted, observed)) + + coord_cartesian(xlim = c(-5, -3), ylim = c(-5, -3)) + + geom_point(alpha = 0.1,shape=16) + + geom_smooth(method=glm) + + ggtitle("Criterion: Predicted vs Actual") + + xlab("Predecited") + + ylab("Observed") diff --git a/ccn2019/ccn2019-svm.R b/ccn2019/ccn2019-svm.R new file mode 100644 index 0000000..a8f4a30 --- /dev/null +++ b/ccn2019/ccn2019-svm.R @@ -0,0 +1,79 @@ +library(tidyverse) +library(caret) +library(here) +library(inspectdf) +library(glmnet) +library(ROSE) + +rm(seqs) +load(here("notebooks/data/nback_seqs.Rd")) + +f <- correct ~ n + stimulus_type + stimulus + t + s + v + l + vl + sl + tl + ul + ll + +set.seed(654321) + +seqs <- seqs %>%drop_na(rt, correct, tl,sl) + +train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) + +seqs.train.balanced <- seqs[train.indices,] +seqs.train <- seqs.train.balanced + +seqs.train.x <- model.matrix(f, seqs.train)[,-1] +seqs.train.y <- seqs.train[[toString(f[[2]])]] + +seqs.test <- seqs[-train.indices,] +seqs.test.x <- model.matrix(f, seqs.test)[,-1] +seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] + +ctrl <- trainControl(method="cv", + number=10, +# repeats = 1, #repeatedcv + sampling = "up", + savePredictions = T, + verbose = T) + +tune <- expand.grid(C = seq(0,5,by=0.25)) + +model <- train(seqs.train.x, + seqs.train.y, + method = "svmLinear", + preProc = c("nzv","center", "scale"), + tuneLength = 10, + tuneGrid = tune, + trControl = ctrl) + +model$bestTune +plot(model) + +seqs.test.y <- model %>% predict(seqs.test.x) +seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") + +confusionMatrix(seqs.test.y, seqs.test.observed_y) + +plot(varImp(model, scale = F, useModel = F)) + +library(pROC) + +roc(seqs.test.observed_y, + seqs.test.y_prob$YES, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + + +# RT +# data.frame( +# RMSE = RMSE(y.test, seqs.test$correct), +# Rsquare = R2(y.test, seqs.test$correct) +# ) + diff --git a/ccn2019/ccn2019.rev0.Rmd b/ccn2019/ccn2019.rev0.Rmd new file mode 
100644 index 0000000..9220ff4 --- /dev/null +++ b/ccn2019/ccn2019.rev0.Rmd @@ -0,0 +1,581 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: inline +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. + +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(plsRglm) +library(plsdof) +library(caret) +``` + +```{r params} +load('./data/CL2015.RData') + +window_size <- 8 +``` + + + +```{r history} + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +with_history <- function(stimuli, length=16, fixed=F) { + seq <- paste(stimuli, collapse = '') + + sapply(1:length(stimuli), function(i) { + stri_reverse(str_sub(seq, max(1,i-length+1), i)) + }) + #ifelse(fixed, h[str_length(h)==size], h) +} + +# $x_{s,local}$ +with_skewness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + sum(sort(freqs, decreasing = T)[1:2]) - 1 + }) +} + +# $x_{u,local}$ +with_lumpiness <- function(history) { + sapply(history, function(h) { + freqs <- table(unlist(str_split(h,""))) + max(freqs) - 1 + }) +} + + +with_targets_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="target"]) / length(trials) + }) +} + +with_lures_ratio <- function(stimulus_type, length=16) { + sapply(1:length(stimulus_type), function(i) { + trials <- stimulus_type[max(1,i-length):i] + length(trials[trials=="lure"]) / length(trials) + }) +} + +#TODO change to list column workflow with broom for model fitting and evaluating the fits +# duh! we are using list nested insided a tibble, so put all new columns in a new list column +# instead of adding a new column for each. 
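# A sketch of that list-column workflow (illustrative only: the formula, the use of the
# NB2 columns defined below, and the fit_stats name are assumptions; broom supplies glance()):
#
#   library(broom)
#   NB2 %>%
#     group_by(participant, condition, block) %>%
#     nest() %>%
#     mutate(fit       = map(data, ~ lm(rt ~ x_t + x_l, data = .x)),
#            fit_stats = map(fit, glance)) %>%
#     unnest(fit_stats)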
+NB2 <- NB %>%
+  group_by(participant, condition, block) %>%
+  nest() %>% unnest(data) %>%   # nest/unnest round-trip leaves the grouped data unchanged
+  mutate(n = ifelse(condition=='2-back',2,3)) %>%
+  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
+  mutate(history = with_history(stimulus, window_size)) %>%
+  mutate(x_sl = with_skewness(history)) %>%
+  mutate(x_ul = with_lumpiness(history)) %>%
+  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
+  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
+  ungroup()
+
+pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
+NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# caret
+library(caret)
+# Compile cross-validation settings
+
+
+any(is.na(NB2))
+NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], nt=3)
+plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], nt=3)  # overwrites the rt fit above
+
+
+plsResult
+```
+
+
+
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+## Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (lack of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a. foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an inverted-Gaussian cost function to be minimized.
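+A quick numeric check of that shape (a minimal, standalone sketch; `cost` mirrors the `targets_cost`/`lures_cost` definition in the next chunk):
+
+```{r cost-shape, eval=FALSE}
+# inverted-Gaussian cost: cheapest at zero deviation, saturating toward 1
+cost <- function(x) 1.0 - 10 * dnorm(x, mean = 0, sd = 4)
+cost(0)   # ~0.003 (minimum)
+cost(4)   # ~0.40  (one sd of deviation)
+cost(10)  # ~0.96  (large deviations cost nearly the full unit)
+```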
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` + + + +--- +title: "PLS Training" +output: html_notebook +--- + +PLS: + + +```{r} +#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select +library(tidyverse) +library(pls) + +## 1. load sample data +#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") + +rm(NB) +load("./data/CL2015.RData") +data <- NB +str(data) + +## 2. 
clean data (drop non-predictor columns)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with lowest cv error
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5. Sort and visualize top coefficients
+coefs <- coef(pls.model)
+
+barplot(sort(coefs[,1,1], decreasing = T)[1:4])
+```
+
+
+```{r simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+pls.model <- plsr(Y ~ X, validation = "CV")
+
+cv <- RMSEP(pls.model)
+best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
+pls.model <- plsr(Y ~ X, ncomp = best_dims)
+coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
+
+barplot(coefs)
+
+```
+
+
+```{r cca-simulate}
+X <- matrix(rnorm(1100), 100, 11)
+Y <- matrix(rnorm(400), 100, 4)
+
+library(corrplot)  # needed for corrplot() below
+M <- cor(cbind(X,Y))
+corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
+cc <- cancor(X, Y)
+
+#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
+
+```
+
+
+```{r plsrglm}
+rm(list = ls())
+library(plsRglm)
+
+data(Cornell)
+df <- Cornell
+x <- subset(df, select = -c(Y))
+y <- df$Y
+## K is the number of folds in CV, and nt is the maximum number of components
+cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-logistic", K=8)
+
+modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE, sparseStop=TRUE)
+res.cv.modpls <- cvtable(summary(cv.modpls))
+
+res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
+
+```
+
+
+
diff --git a/ccn2019/ccn2019.rev1.Rmd b/ccn2019/ccn2019.rev1.Rmd
new file mode 100644
index 0000000..9074227
--- /dev/null
+++ b/ccn2019/ccn2019.rev1.Rmd
@@ -0,0 +1,281 @@
+---
+title: "Evaluating N-Back Sequences"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+library(tidyverse)
+library(ggplot2)
+library(stringi)
+library(GA)
+library(dbscan)
+library(inspectdf)
+
+load('./data/CL2015.RData')
+```
+
+## Variables
+- $T$ number of targets
+- $L$ number of lures
+- $S$ skewness score
+- $U$ uniformity (lack of repetition)
+- $RT_{mean}$
+- $Accuracy_{mean}$
+- $dprime$
+- $criterion$
+
+## Constraints
+
+- fixed number of targets
+- fixed number of lures (a.k.a. foils)
+- uniform distribution of choices
+- controlled local lumpiness
+
+
+Each constraint is an inverted-Gaussian cost function to be minimized.
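+The next chunk leaves `lumpiness_cost()` as a stub around commented-out Python. A possible R translation of that Python (an assumption about the intended rule, not a confirmed implementation) flags a window as lumpy when the most frequent half of the choices covers more than two-thirds of the trials:
+
+```{r lumpiness-cost-sketch, eval=FALSE}
+# hypothetical completion of the lumpiness_cost() stub below
+lumpiness_cost <- function(x, choices) {
+  trials <- length(x)
+  freqs <- sapply(choices, function(c) sum(x == c))
+  top_half <- sort(freqs, decreasing = TRUE)[seq_len(length(choices) %/% 2)]
+  as.numeric(sum(top_half) > trials * 2 / 3)  # 1 = too lumpy, 0 = acceptable
+}
+```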
+ +```{r, eval=F} +targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4) +skewness_cost <- function(x, choices) { + uniform_ratio <- length(x) / length(choices) + deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices) + for (c in choices) { + deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio) + } + #TODO convert to gaussian loss + max(deviation_from_uniform) +} + +lumpiness_cost <- function(x, choices) { + #trials = len(seq) + #freqs = [float(seq.count(c)) for c in choices] + #ralph_skewed = sum(heapq.nlargest(int(len(choices) / 2), freqs)) > (trials * 2 / 3) + #return ralph_skewed + NA +} + +#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x) +#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10) +#plot(GA) +``` + + +```{r} + +with_lures <- function(condition, stim, stim_type, history = NA) { + sapply(1:length(stim), + function(i) { + switch(as.character(condition[i]), + "2-back" = { + ifelse( + stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4), + 'lure', + as.character(stim_type[i]) + )}, + "3-back" = { + ifelse( + stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5), + 'lure', + as.character(stim_type[i]) + )} + ) + + }) +} + +with_targets_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-stri_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="target"]) + }) +} + +with_lures_ratio <- function(stimulus_type, history) { + sapply(1:length(history), function(i) { + trials <- stimulus_type[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + length(trials[trials=="lure"]) + }) +} + +with_lumpiness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + max(table(trials)) - 1 + }) +} + +with_lag <- function(stimulus, history) { + # find last occurance the of stimulus +} + +with_skewness_score <- function(stimulus, history) { + sapply(1:length(history), function(i) { + trials <- stimulus[(i-str_length(history[i])):i] + trials <- unlist(trials, use.names=FALSE) + sum(sort(table(trials), decreasing = T)[1:2]) - 1 + }) +} + +with_history <- function(stims, size=16) { + res <- c('') + for (i in 2:length(stims)) { + res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size) + } + #res <- ifelse(stri_length(res)==size, res, NA) + res +} + +normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) { + #TODO + sapply(1:length(targets_ratio), function(i) 0) +} + +window_size <- 8 + +NB_modified <- NB %>% + group_by(participant, condition, block) %>% + mutate(history = with_history(stimulus, size=window_size)) %>% + #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures)) + mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>% + mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>% + mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>% + mutate(skewness = with_skewness_score(stimulus, history)) %>% + mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>% + filter(stri_length(history)==window_size) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + #normalize_scores(targets_ratio, 
lures_ratio, skewness, lumpiness) %>% + ungroup() + +pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE) + +NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2']) + + +## participant-level averaged NB, a single row represent an observation for a single subject +## in a single condition +NB_avg <- NB_modified %>% + group_by(participant, condition) %>% + mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>% + summarise( + targets=sum(targets), + lures=sum(lures), + skewness=sum(skewness), + lumpiness=sum(lumpiness), + rt = mean(rt, na.rm=T), + correct=sum(correct,na.rm=T)/90) %>% + ungroup() + +# print +# NB_modified %>% +# filter(participant=='P1') %>% +# View() +# + + +fit <- lm(correct ~ t * s * u * l * d, NB_modified) + +``` + + +```{r} + +# DBSCAN Clustering (RT+ACCURACY against skewness) +NB_avg <- NB_avg %>% + mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster) + +NB_avg %>% + ggplot(aes(skewness, correct, color=factor(cluster))) + + ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") + + geom_point(alpha=0.3) + + #geom_smooth(method='lm', se = F) + + facet_wrap(~condition) + +``` + + +```{r} +## single-subject figures +NB_modified %>% + ggplot(aes(t,s,color=correct)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + inspect_cor(show_plot = T) + +NB_avg %>% + inspect_cor(show_plot = T) + +NB_modified %>% + ggplot(aes(rt,correct,color=u)) + + geom_jitter() + + geom_point() + + stat_summary(fun.y="mean") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(t),color=correct)) + + geom_jitter() + + geom_point(alpha=0.1) + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(s),jitter(u),color=correct)) + + geom_jitter() + + geom_point() + + facet_wrap(~condition) + +# rt/accuracy and lures +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) + + geom_jitter() + + geom_point(shape=16) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,pc2,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,rt,color=correct)) + + geom_point(alpha=0.3) + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + +NB_modified %>% + filter(!is.na(correct)) %>% + ggplot(aes(pc1,correct,color=correct)) + + geom_point() + + geom_smooth(method="lm",se = F) + + facet_wrap(~condition, scales="free") + + +``` + +## TODO + - data %>% mutate(constraint1=fitness1(history), constrain2=fitness2(history), + - constraint3=fitness(history)) + - kmeans(NB) + - ggplot(kmeans_clusters$accuracy) + - ggplot(kmeans_clusters$rt) + - + + + + +```{python} +a=2 +p +``` diff --git a/ccn2019/ccn2019.rev2.Rmd b/ccn2019/ccn2019.rev2.Rmd new file mode 100644 index 0000000..7e19f45 --- /dev/null +++ b/ccn2019/ccn2019.rev2.Rmd @@ -0,0 +1,229 @@ +--- +title: "Statistical Properties of the N-Back Sequences" +output: + html_notebook: default + pdf_document: default +editor_options: + chunk_output_type: console +--- + +# Problems + +Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. 
+ +# Gaps + +- Unclear how to parameterize interesting variations for sequence generation +- How do we model these multiple strategies (which requires identifying which sequence variations matter) + - local vs. global properties, which one matters the most? + - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity + - Global: No lures, large vocabulary -> pattern repeats implies a target + + +## Formulating Generating the N-Back Sequences as a CSP instance + +$P=\langle V,D,C,W\rangle$ + +$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ + +$D=\{\}$ + + +Constraints: + +$$ +\\ + +x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| + +\\\\ + +x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| + +\\\\ + +x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| + +\\\\ + +x_{l} = L \times trials +\\\\ + +x_{ll} = L \times w +\\\\ + +x_{v} = |V| +\\ + +x_{ul} = w +\\\\ + +x_{s} = {trials \over |V|} +\\\\ + +x_{sl} = max(1, {w \over |V|}) +\\\\ + +x_{g} = {trials \over w} + +\\\\ + +x_{vl} = min(|V|, w) +$$ + +```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} +library(ggplot2) +library(tidyverse) +library(stringi) +library(pls) +#library(plsRglm) +#library(plsdof) +library(pls) +library(caret) +library(here) +library(tsibble) +library(broom) +library(rsample) + +``` + +```{r preprocessing} + +load(here('notebooks/data/CL2015.RData')) +window_size <- 8 + +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide_dbl(stimulus_type, ~length(which(.=='target')), .partial=T,.size=window_size), + ll = slide_dbl(stimulus_type, ~length(which(.=='lure')), .partial=T, .size=window_size), + sl = slide_dbl(stimulus, ~(sum(sort(table(.), decreasing = T)[1:2]) - 1), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), 0, sl), + ul = slide_dbl(stimulus, ~(max(table(.))-1), .partial=T, .size=window_size), + vl = slide_dbl(stimulus, ~(length(unique(.))), .partial=T, .size=window_size), + al = slide_dbl(correct, ~(length(which(.))), .partial=T, .size=window_size)) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map(local_stats, ~length(unique(.x$stimulus)))) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) + +View() +inspectdf::inspect_cor(seqs) +#inspect_cor(NB,show_plot = T) +``` + +```{r} +model1 <- NB2 %>% + select(-participant, -stimulus) %>% + glm(rt~t+n+a,data=., family = "gaussian") +aug1 <- augment(model1) +aug1 %>% + ggplot(aes(a,rt)) + + geom_point() + + geom_smooth(aes(y=.fitted, 
color='red'))
+```
+
+```{r}
+model2 <- NB2 %>%
+  select(-participant, -stimulus) %>%
+  glm(rt~t+n+a+al+s+sl+ll+l, data=., family = "gaussian")
+aug2 <- augment(model2)
+aug2 %>%
+  ggplot(aes(jitter(al),rt)) +
+  geom_point(alpha=0.2, shape=18) +
+  xlab("accuracy") +
+  geom_smooth(aes(y=.fitted), color='blue') +
+  geom_smooth(aes(y=aug1$.fitted), color='red')
+
+```
+
+```{r models}
+
+nb_split <- initial_split(NB2, prop = 0.75)
+training_data <- training(nb_split)
+testing_data <- testing(nb_split)
+cv_split <- vfold_cv(training_data, v = 5)
+cv_data <- cv_split %>%
+  mutate(
+    train = map(splits, ~training(.x)),
+    validate = map(splits, ~testing(.x))
+  )
+
+cv_models_lm_a <- cv_data %>%
+  mutate(model = map(train, ~lm(formula = a~., data = .x)),
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_glm_a <- cv_data %>%
+  mutate(model = map(train, ~glm(formula = a~., data = .x)),  # gaussian glm counterpart of the lm fit above
+         tidied = map(model, tidy),
+         glanced = map(model, glance),
+         augment = map(model, augment))
+
+cv_models_pls_a <- cv_data %>%
+  mutate(model = map(train, ~plsr(a~., data = .x, validation = "CV")),
+         best_dims = map_dbl(model, ~which.min(RMSEP(.x)$val[estimate="adjCV",,]) - 1)) %>%
+  mutate(model = map2(train, best_dims, ~plsr(a ~ ., data = .x, ncomp = .y))  # refit with each fold's own best ncomp
+  )
+
+head(cv_models_pls_a)
+
+
+cv_models_pls_a1 <- cv_data[3][[1]]  # first training fold
+
+
+NBx <- NB %>%
+  group_by(participant) %>%
+  summarise(freq = as.data.frame(table(stimulus)))
+
+ggplot(bind_cols(NBx["participant"], NBx$freq), aes(stimulus, Freq, group=participant)) +  # table() yields stimulus/Freq columns
+  geom_point()
+
+
+#%>%
+#  mutate(model.pls = map(variables, ~plsr(rt ~ ., data = .x, validation = "CV")))
+#  mutate(model.pca = map(design_matrix, ~prcomp(~rt,.x, center=T,scale.=T, na.action=na.exclude))) %>%
+#  mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])
+
+# Compile cross-validation settings
+
+#any(is.na(NB2))
+#NB2 <- na.omit(NB2)
+
+# set.seed(100)
+# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
+#
+# # PLS
+# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
+#               method = "pls",
+#               metric = "Accuracy",
+#               tuneLength = 20,
+#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
+#               preProc = c("zv","center","scale"))
+#
+# # Check CV
+# plot(mod1)
+
+
+#plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")],3)
+#plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")],3)
+
+```
diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd
new file mode 100644
index 0000000..ba1b312
--- /dev/null
+++ b/ccn2019/ccn2019.rev3.Rmd
@@ -0,0 +1,100 @@
+
+$P=\langle V,D,C,W \rangle$
+
+```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+#! ===============================================
+#! load required packages
+
+library(ggplot2)
+library(tidyverse)
+library(stringi)
+library(pls)
+library(caret)
+library(here)
+library(tsibble)
+library(broom)
+library(rsample)
+library(inspectdf)
+library(caTools)
+library(pROC)
+
+#! ===============================================
+#! load data set and set running window size
+load(here('notebooks/data/CL2015.RData'))
+window_size <- 8
+
+```
+
+
+```{r preprocessing}
+
+#! ===============================================
+#!
A function to mark lures in a sequence +with_lures <- function(stimulus, stimulus_type, n) { + sapply(1:length(stimulus), function(i) { + lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) + are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) + ifelse(are_valid_trials && stimulus[i] %in% lures, + "lure", + as.character(stimulus_type[i])) + }) +} + +#! =============================================== +#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! a and al are respectively accuracy and recent accuracy +seqs <- NB %>% + group_by(participant, block, condition) %>% + mutate(n = ifelse(condition=='2-back',2,3)) %>% + mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% + mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), + ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), + ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size), + vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size), + al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), + sl = ifelse(is.na(sl), NA, sl), + tl = ifelse(is.na(tl), NA, tl), + ll = ifelse(is.na(ll), NA, ll), + al = ifelse(is.na(al), NA, al), + ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) +) %>% + nest(.key='local_stats') %>% + #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% + mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% + mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% + mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% + mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% + mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% + # mutate(dp = map_dbl(local_stats, + # ~length(which(.x$stimulus_type=="target" & .x$correct)) - + # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% + # mutate(cr = map_dbl(local_stats, + # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + + # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% + mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% + #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + ungroup() %>% + select(-participant,-block,-condition) %>% + unnest(local_stats) %>% + mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% + mutate(stimulus = factor(stimulus)) %>% + mutate(stimulus_type = factor(stimulus_type)) + +save(seqs,file=here("notebooks/data/nback_seqs.Rd")) + +#! =============================================== +#! 
visualize correlations
+#DEBUG inspect_cor(seqs, show_plot = T)
+
+
+```
+
+```{r remove_highly_correlated_predictors}
+# WIP: an extra step for non-PLS methods that removes highly correlated predictors
+cor_matrix <- cor(seqs %>% select_if(is.numeric), use = "complete.obs")  # cor() needs numeric, complete columns
+cor_high <- findCorrelation(cor_matrix, 0.8)
+high_cor_remove <- row.names(cor_matrix)[cor_high]
+seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))  # drop the flagged columns by name
+```
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
new file mode 100644
index 0000000..7da6569
--- /dev/null
+++ b/ccn2019/ccn2019_diagrams.R
@@ -0,0 +1,51 @@
+NB_avg %>%
+  mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>%
+  ggplot(aes(targets, accuracy, color=factor(cluster))) +
+  ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  #geom_smooth(method='lm', se = F) +
+  facet_wrap(~condition)
+
+
+NB_avg %>%
+  ggplot(aes(lures, accuracy, color=condition)) +
+  ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(skewness, accuracy, color=condition)) +
+  ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, accuracy, color=condition)) +
+  ggtitle("lumpiness", "NOTE: each point is a single participant") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+NB_avg %>%
+  ggplot(aes(lumpiness, rts, color=condition)) +
+  ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") +
+  xlab("lumpiness") +
+  ylab("Average RT") +
+  geom_point(alpha=0.3) +
+  geom_smooth(method='lm', se = F)
+
+nback <- NB_modified
+
+nback %>%
+  mutate(block=as.factor(block)) %>%
+  mutate(trial=as.factor(trial)) %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  #filter(condition=='3-back') %>%
+  #mutate(correct=as.numeric(correct)) %>%
+  inspect_cor(show_plot = T)
+
+averaged_nback <- NB_avg
+
+averaged_nback %>%
+  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
+  inspect_cor(show_plot = T)
+
diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R
new file mode 100644
index 0000000..9bbe23a
--- /dev/null
+++ b/ccn2019/dummy-vars-playground.R
@@ -0,0 +1,111 @@
+library(tidyverse)
+library(caret)
+library(here)
+library(inspectdf)
+library(glmnet)
+library(ROSE)
+
+rm(seqs)
+load(here("notebooks/data/nback_seqs.Rd"))
+
+# seqs %>%
+#   ggplot(aes(x=v,y=a,col=correct)) +
+#   geom_jitter() +
+#   geom_point(alpha=0.1) +
+#   geom_smooth()
+
+f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus
+f <- correct ~ n + t + v + stimulus  # the short formula overrides the full one above
+
+set.seed(654321)
+
+# 1.
dummy vars
+# INPUTS : seqs
+# OUTPUTS: seqs.dmy
+
+seqs <- seqs %>%
+  drop_na(rt, correct, tl, sl)
+
+
+train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list = FALSE)
+
+seqs.train.balanced <- seqs[train.indices,]
+seqs.train <- seqs.train.balanced
+# seqs.train <- ROSE(f, data = seqs.train.balanced)$data
+
+seqs.train.x <- model.matrix(f, seqs.train)[,-1]
+seqs.train.y <- seqs.train[[toString(f[[2]])]]
+
+seqs.test <- seqs[-train.indices,]
+seqs.test.x <- model.matrix(f, seqs.test)[,-1]
+seqs.test.observed_y <- seqs.test[[toString(f[[2]])]]
+
+
+# ROC for each var
+filterVarImp(as.data.frame(seqs.train.x), seqs.train.y)
+
+# model <- cv.glmnet(seqs.train.x,
+#                    seqs.train.y,
+#                    alpha = 1,
+#                    nfolds = 5,
+#                    family = "binomial",
+#                    type.measure = "auc")
+#
+# model$lambda.min
+
+ctrl <- trainControl(method = "cv",
+                     number = 5,
+                     classProbs = T,
+                     verboseIter = T,
+#                    sampling = "up",
+                     savePredictions = T,
+                     summaryFunction = twoClassSummary)
+
+# glmnet tune
+tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100))
+
+max_components <- n_distinct(attr(terms(f),"term.labels"))
+# pls tune (overwrites the glmnet grid above; both are kept for switching methods)
+tune <- expand.grid(ncomp=1:max_components)
+
+model <- train(seqs.train.x,
+               seqs.train.y,
+               method = "glmnet",
+               #family = "binomial",
+               metric = "ROC",  # twoClassSummary reports ROC/Sens/Spec
+               preProc = c("nzv","center", "scale"),
+               #verboseIter = TRUE,
+               tuneLength = 2,
+               #tuneGrid = tune,
+               trControl = ctrl)
+
+model$bestTune
+plot(model)
+
+seqs.test.y <- model %>% predict(seqs.test.x)
+seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob")
+
+confusionMatrix(seqs.test.y, seqs.test.observed_y)
+
+plot(varImp(model, scale = F, useModel = F))
+
+library(pROC)
+
+
+roc(seqs.test.observed_y,
+    seqs.test.y_prob$YES,
+    legacy.axes = T,
+    plot = T,
+    lwd = 2,
+    col = "black",
+    print.auc = T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "auc",
+    boot.n = 100,
+    ci = T)
+
+
+
diff --git a/ccn2019/pls_playground.Rmd b/ccn2019/pls_playground.Rmd
new file mode 100644
index 0000000..c566656
--- /dev/null
+++ b/ccn2019/pls_playground.Rmd
@@ -0,0 +1,104 @@
+---
+title: "PLS Training"
+output: html_notebook
+editor_options:
+  chunk_output_type: console
+---
+
+PLS:
+
+
+```{r}
+#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
+library(tidyverse)
+library(pls)
+
+## 1. load sample data
+#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
+
+rm(NB)
+load("./data/CL2015.RData")
+data <- NB
+str(data)
+
+## 2. clean data (drop non-predictor columns)
+data <- data %>%
+  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
+  select(-condition,
+         -stimulus,
+         -block,
+         -trial)
+# %>%
+#   rename(
+#     ev.participant=participant,
+#     ev.n=n,
+#     ev.block=block,
+#     ev.stimulus_type=stimulus_type,
+#     rv.choice=choice,
+#     rv.rt=rt,
+#     rv.correct=correct
+#   )
+
+## 3. use cross-validation to find the optimal number of dimensions
+pls.model = plsr(rt ~ ., data = data, validation = "CV")
+
+## 3.1. find the model with lowest cv error
+
+best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1
+
+## 4. rebuild the model
+pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)
+
+## 5.
Sort, and visualize top coefficients +coefs <- coef(pls.model) + +barplot(sort(coefs[,1,1], decreasing = T)[1:4]) +``` + + +```{r simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +pls.model <- plsr(Y ~ X, validation = "CV") + +cv <- RMSEP(pls.model) +best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 +pls.model <- plsr(Y ~ X, ncomp = best_dims) +coefs <- sort(coef(pls.model)[,1,1], decreasing = T) + +barplot(coefs) + +``` + + +```{r cca-simulate} +X <- matrix(rnorm(1100), 100, 11) +Y <- matrix(rnorm(400), 100, 4) + +M <- cor(cbind(X,Y)) +corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") +cc <- cancor(X, Y) + +#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) + +``` + + +``` +rm(list = ls()) +library(plsRglm) + +data(Cornell) +df <- Cornell +x <- subset(df, select = -c(Y)) +y <- df$Y +## K is the number of folds in CV, and nt is the maximum number of components, +#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) + +modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) +res.cv.modpls<-cvtable(summary(cv.modpls)) + +res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) + +``` \ No newline at end of file diff --git a/ccn2019_diagrams.R b/ccn2019_diagrams.R deleted file mode 100644 index 7da6569..0000000 --- a/ccn2019_diagrams.R +++ /dev/null @@ -1,51 +0,0 @@ -NB_avg %>% - mutate(cluster = dbscan::dbscan(cbind(accuracy,rts), eps = 0.5, minPts = 3)$cluster) %>% - ggplot(aes(targets, accuracy, color=factor(cluster))) + - ggtitle("targets (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - #geom_smooth(method='lm', se = F) + - facet_wrap(~condition) - - -NB_avg %>% - ggplot(aes(lures, accuracy, color=condition)) + - ggtitle("lures (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(skewness, accuracy, color=condition)) + - ggtitle("skewness (window = 8 trials)", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, accuracy, color=condition)) + - ggtitle("lumpiness", "NOTE: each point is a single participant") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -NB_avg %>% - ggplot(aes(lumpiness, rts, color=condition)) + - ggtitle("lumpiness (window = 8 trials)", "NOTE: each point is a single participant") + - xlab("lumpiness") + - ylab("Average RT") + - geom_point(alpha=0.3) + - geom_smooth(method='lm', se = F) - -nback <- NB_modified - -nback %>% - mutate(block=as.factor(block)) %>% - mutate(trial=as.factor(trial)) %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - #filter(condition=='3-back') %>% - #mutate(correct=as.numeric(correct)) %>% - inspect_cor(show_plot = T) - -averaged_nback <- NB_avg - -averaged_nback %>% - mutate(condition=ifelse(condition=='2-back',2,3)) %>% - inspect_cor(show_plot = T) - diff --git a/dummy-vars-playground.R b/dummy-vars-playground.R deleted file mode 100644 index d79b629..0000000 --- a/dummy-vars-playground.R +++ /dev/null @@ -1,114 +0,0 @@ -library(tidyverse) -library(caret) -library(here) -library(inspectdf) -library(glmnet) -library(ROSE) - -rm(seqs) -load(here("notebooks/data/nback_seqs.Rd")) - -# seqs %>% -# ggplot(aes(x=v,y=a,col=correct)) + -# geom_jitter() + -# geom_point(alpha=0.1) + -# geom_smooth() - -f <- 
correct ~ n + stimulus + stimulus_type + t + s + v + l + vl + sl + tl + ul + ll - -set.seed(654321) - -# 1. dummy vars -# INPUTS : seqs -# OUTPUTS: seqs.dmy - -seqs <- seqs %>% - drop_na(rt, correct, tl,sl) - - -train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) - -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced -# seqs.train <- ROSE(f, data = seqs.train.balanced)$data - -seqs.train.x <- model.matrix(f, seqs.train)[,-1] -seqs.train.y <- seqs.train[[toString(f[[2]])]] - -seqs.test <- seqs[-train.indices,] -seqs.test.x <- model.matrix(f, seqs.test)[,-1] -seqs.test.observed_y <- seqs.test[[toString(f[[2]])]] - - -# ROC for each var -filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) - -# model <- cv.glmnet(seqs.train.x, -# seqs.train.y, -# alpha = 1, -# nfolds = 5, -# family = "binomial", -# type.measure = "auc") -# -# model$lambda.min - -ctrl <- trainControl(method="cv", - number=5, - classProbs=T, - verbose = T, -# sampling = "up", - savePredictions = T, - summaryFunction=twoClassSummary) - -# glmnet tune -tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) - -max_components <- n_distinct(attr(terms(f),"term.labels")) -# pls tune -tune <- expand.grid(ncomp=1:max_components) - -model <- train(seqs.train.x, - seqs.train.y, - method = "pls", - family = "binomial", - metric = "ROC", - preProc = c("center", "scale"), - verboseIter = TRUE, - tuneLength = 10, - tuneGrid = tune, - trControl = ctrl) - -model$bestTune -plot(model) - -seqs.test.y <- model %>% predict(seqs.test.x) -seqs.test.y_prob <- model %>% predict(seqs.test.x, type="prob") - -confusionMatrix(seqs.test.y, seqs.test.observed_y) - -plot(varImp(model, scale = F, useModel = F)) - -library(pROC) - -roc(seqs.test.observed_y, - seqs.test.y_prob$YES, - legacy.axes=T, - plot = T, - lwd=2, - col="black", - print.auc=T, - percent = T, - print.auc.y = 40, - print.auc.x = 55, - lty = 1, - of = "se", - boot.n = 200, - ci = T) - - -# RT -# data.frame( -# RMSE = RMSE(y.test, seqs.test$correct), -# Rsquare = R2(y.test, seqs.test$correct) -# ) - diff --git a/pls_playground.Rmd b/pls_playground.Rmd deleted file mode 100644 index c566656..0000000 --- a/pls_playground.Rmd +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: "PLS Training" -output: html_notebook -editor_options: - chunk_output_type: console ---- - -PLS: - - -```{r} -#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select -library(tidyverse) -library(pls) - -## 1. load sample data -#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv") - -rm(NB) -load("./data/CL2015.RData") -data <- NB -str(data) - -## 2. clean data (remove brand and URLID) -data <- data %>% - mutate(n=ifelse(condition=='2-back', 2, 3)) %>% - select(-condition, - -stimulus, - -block, - -trial) -# %>% -# rename( -# ev.participant=participant, -# ev.n=n, -# ev.block=block, -# ev.stimulus_type=stimulus_type, -# rv.choice=choice, -# rv.rt=rt, -# rv.correct=correct -# ) - -## 3. use cross validatation to find the optimal number of dimensions -pls.model = plsr(rt ~ ., data = data, validation = "CV") - -## 3.1. find the model with lowest cv error - -best_dims <- which.min(RMSEP(pls.model)$val[estimate = "adjCV", , ]) - 1 - -## 4. rebuild the model -pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims) - -## 5. 
Sort, and visualize top coefficients -coefs <- coef(pls.model) - -barplot(sort(coefs[,1,1], decreasing = T)[1:4]) -``` - - -```{r simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -pls.model <- plsr(Y ~ X, validation = "CV") - -cv <- RMSEP(pls.model) -best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1 -pls.model <- plsr(Y ~ X, ncomp = best_dims) -coefs <- sort(coef(pls.model)[,1,1], decreasing = T) - -barplot(coefs) - -``` - - -```{r cca-simulate} -X <- matrix(rnorm(1100), 100, 11) -Y <- matrix(rnorm(400), 100, 4) - -M <- cor(cbind(X,Y)) -corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black") -cc <- cancor(X, Y) - -#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data) - -``` - - -``` -rm(list = ls()) -library(plsRglm) - -data(Cornell) -df <- Cornell -x <- subset(df, select = -c(Y)) -y <- df$Y -## K is the number of folds in CV, and nt is the maximum number of components, -#cv.modpls<-cv.plsRglm(dataY=y,dataX=x ,nt=10,modele="pls-glm-logistic",K=8) - -modpls <- plsRglm(dataY = y,dataX = x, nt = 10, modele = "pls-glm-logistic", sparse=TRUE,sparseStop=TRUE) -res.cv.modpls<-cvtable(summary(cv.modpls)) - -res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE) - -``` \ No newline at end of file