--- title: "Statistical Properties of the N-Back Sequences" output: html_notebook: default pdf_document: default editor_options: chunk_output_type: inline --- # Problems Statistical properties of n-back sequences bias behaviors. These bias, under specified structure, allows multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task. # Gaps - Unclear how to parameterize interesting variations for sequence generation - How do we model these multiple strategies (which requires identifying which sequence variations matter) - local vs. global properties, which one matters the most? - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automaticity - Global: No lures, large vocabulary -> pattern repeats implies a target ## Formulating Generating the N-Back Sequences as a CSP instance $P=\langle V,D,C,W\rangle$ $V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$ $D=\{\}$ Constraints: $$ \\ x_n = N, W_n = 1 - |10 \times dnorm(x_n-N,sd=4)| \\\\ x_t = T \times trials, W_t = 1 - |10\times dnorm(T\times trials-x_t,sd=4)| \\\\ x_{tl} = {T \times w \over trials}, W_{tl} = 1 - |10\times dnorm(x_{tl} - {T \over trials} \times w,sd=4)| \\\\ x_{l} = L \times trials \\\\ x_{ll} = L \times w \\\\ x_{v} = |V| \\ x_{ul} = w \\\\ x_{s} = {trials \over |V|} \\\\ x_{sl} = max(1, {w \over |V|}) \\\\ x_{g} = {trials \over w} \\\\ x_{vl} = min(|V|, w) $$ ```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE} library(ggplot2) library(tidyverse) library(stringi) library(plsRglm) library(plsdof) library(caret) ``` ```{r params} load('./data/CL2015.RData') window_size <- 8 ``` ```{r history} with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) ifelse(are_valid_trials && stimulus[i] %in% lures, "lure", as.character(stimulus_type[i])) }) } with_history <- function(stimuli, length=16, fixed=F) { seq <- paste(stimuli, collapse = '') sapply(1:length(stimuli), function(i) { stri_reverse(str_sub(seq, max(1,i-length+1), i)) }) #ifelse(fixed, h[str_length(h)==size], h) } # $x_{s,local}$ with_skewness <- function(history) { sapply(history, function(h) { freqs <- table(unlist(str_split(h,""))) sum(sort(freqs, decreasing = T)[1:2]) - 1 }) } # $x_{u,local}$ with_lumpiness <- function(history) { sapply(history, function(h) { freqs <- table(unlist(str_split(h,""))) max(freqs) - 1 }) } with_targets_ratio <- function(stimulus_type, length=16) { sapply(1:length(stimulus_type), function(i) { trials <- stimulus_type[max(1,i-length):i] length(trials[trials=="target"]) / length(trials) }) } with_lures_ratio <- function(stimulus_type, length=16) { sapply(1:length(stimulus_type), function(i) { trials <- stimulus_type[max(1,i-length):i] length(trials[trials=="lure"]) / length(trials) }) } #TODO change to list column workflow with broom for model fitting and evaluating the fits # duh! we are using list nested insided a tibble, so put all new columns in a new list column # instead of adding a new column for each. 
NB2 <- NB %>%
  group_by(participant, condition, block) %>%
  nest() %>% unnest(data) %>%
  mutate(n = ifelse(condition=='2-back', 2, 3)) %>%
  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
  mutate(history = with_history(stimulus, window_size)) %>%
  mutate(x_sl = with_skewness(history)) %>%
  mutate(x_ul = with_lumpiness(history)) %>%
  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
  ungroup()

pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE, scale. = TRUE, na.action=na.exclude)
NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])

# caret
library(caret)
# Compile cross-validation settings

any(is.na(NB2))
NB2 <- na.omit(NB2)

# set.seed(100)
# trainingfold <- createMultiFolds(NB2$correct, k = 5, times = 10)
#
# # PLS
# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
#               method = "pls",
#               metric = "Accuracy",
#               tuneLength = 20,
#               trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
#               preProc = c("zv","center","scale"))
#
# # Check CV
# plot(mod1)

plsResult <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], nt=3)
plsResult <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], nt=3)
plsResult
```

<!--chapter:end:ccn2019.rev2.Rmd-->


---
title: "Evaluating N-Back Sequences"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(ggplot2)
library(stringi)
library(GA)
library(dbscan)
library(inspectdf)
load('./data/CL2015.RData')
```

### Variables

- $T$ number of targets
- $L$ number of lures
- $S$ skewness score
- $U$ uniformity (lack of repetition)
- $RT_{mean}$
- $Accuracy_{mean}$
- $dprime$
- $criterion$

## Constraints

- fixed number of targets
- fixed number of lures (a.k.a. foils)
- uniform distribution of choices
- controlled local lumpiness

Each constraint is an upside-down, bell-shaped cost function to be minimized.
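For example, the target and lure constraints in the chunk below penalize a deviation $x$ from the desired count with an inverted Gaussian; the scale $10$ and width $sd = 4$ simply mirror `targets_cost`/`lures_cost` in the code and are not tuned here:

$$
\mathrm{cost}(x) = 1 - 10 \times \mathrm{dnorm}(x;\, \mu = 0,\, \sigma = 4),
$$

which is close to $0$ when the deviation is zero and approaches $1$ as the deviation grows.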
```{r, eval=F}
targets_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)
lures_cost <- function(x) 1.0 - 10*dnorm(x, mean=0, sd=4)

skewness_cost <- function(x, choices) {
  uniform_ratio <- length(x) / length(choices)
  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
  for (c in choices) {
    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
  }
  #TODO convert to gaussian loss
  max(deviation_from_uniform)
}

lumpiness_cost <- function(x, choices) {
  # "Ralph-skewed": the most frequent half of the choices accounts for
  # more than two thirds of the trials
  freqs <- sapply(choices, function(c) sum(x == c))
  top_half <- sort(freqs, decreasing = TRUE)[1:max(1, length(choices) %/% 2)]
  sum(top_half) > (length(x) * 2 / 3)
}

#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
#plot(GA)
```

```{r}
# flag lures by comparing the stimulus to the items n-1 and n+1 positions back in the history string
with_lures <- function(condition, stim, stim_type, history = NA) {
  sapply(1:length(stim), function(i) {
    switch(as.character(condition[i]),
      "2-back" = {
        ifelse(
          stim[i]==stri_sub(history[i],-2,-2) || stim[i]==stri_sub(history[i],-4,-4),
          'lure',
          as.character(stim_type[i])
        )},
      "3-back" = {
        ifelse(
          stim[i]==stri_sub(history[i],-3,-3) || stim[i]==stri_sub(history[i],-5,-5),
          'lure',
          as.character(stim_type[i])
        )}
    )
  })
}

# count of target trials within the current history window
with_targets_ratio <- function(stimulus_type, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus_type[(i-stri_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    length(trials[trials=="target"])
  })
}

# count of lure trials within the current history window
with_lures_ratio <- function(stimulus_type, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus_type[(i-str_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    length(trials[trials=="lure"])
  })
}

# repetitions of the most frequent stimulus within the window
with_lumpiness_score <- function(stimulus, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus[(i-str_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    max(table(trials)) - 1
  })
}

with_lag <- function(stimulus, history) {
  # TODO find the last occurrence of the stimulus
}

# combined count of the two most frequent stimuli within the window
with_skewness_score <- function(stimulus, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus[(i-str_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    sum(sort(table(trials), decreasing = T)[1:2]) - 1
  })
}

# running history string of the last `size` stimuli (oldest to newest)
with_history <- function(stims, size=16) {
  res <- c('')
  for (i in 2:length(stims)) {
    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''), from=-size, length=size)
  }
  #res <- ifelse(stri_length(res)==size, res, NA)
  res
}

normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
  #TODO
  sapply(1:length(targets_ratio), function(i) 0)
}

window_size <- 8
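# Illustrative sanity check on a toy sequence (the letters and window size are
# made up; not part of the analysis below): `with_history` builds the running
# window for each trial, and the score helpers operate on that window.
toy_stim <- c("A", "B", "A", "A", "C", "A", "B", "A")
toy_hist <- with_history(toy_stim, size = 4)
toy_hist
with_lumpiness_score(toy_stim, toy_hist)
with_skewness_score(toy_stim, toy_hist)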
NB_modified <- NB %>%
  group_by(participant, condition, block) %>%
  mutate(history = with_history(stimulus, size=window_size)) %>%
  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history, .f=with_lures))
  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
  mutate(skewness = with_skewness_score(stimulus, history)) %>%
  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
  filter(stri_length(history)==window_size) %>%
  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
  #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
  ungroup()

pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE, scale. = TRUE)

NB_modified <- NB_modified %>%
  mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])

## participant-level averaged NB; a single row represents an observation for a single subject
## in a single condition
NB_avg <- NB_modified %>%
  group_by(participant, condition) %>%
  mutate(correct = ifelse(stimulus_type=='burn-in', NA, correct)) %>%
  summarise(
    targets=sum(targets),
    lures=sum(lures),
    skewness=sum(skewness),
    lumpiness=sum(lumpiness),
    rt = mean(rt, na.rm=T),
    correct=sum(correct, na.rm=T)/90) %>%
  ungroup()

# print
# NB_modified %>%
#   filter(participant=='P1') %>%
#   View()

# fit <- lm(correct ~ t * s * u * l * d, NB_modified)
```

```{r}
# DBSCAN clustering of participants on accuracy and RT, plotted against skewness
NB_avg <- NB_avg %>%
  mutate(cluster = dbscan(cbind(correct, rt), eps = 0.3, minPts = 3)$cluster)

NB_avg %>%
  ggplot(aes(skewness, correct, color=factor(cluster))) +
  ggtitle(paste0("clusters (window = ", window_size, " trials)"),
          "NOTE: each point is a single participant") +
  geom_point(alpha=0.3) +
  #geom_smooth(method='lm', se = F) +
  facet_wrap(~condition)
```

```{r}
## single-subject figures
NB_modified %>%
  ggplot(aes(targets, skewness, color=correct)) +
  geom_jitter() +
  geom_point() +
  stat_summary(fun.y="mean")

NB_modified %>%
  inspect_cor(show_plot = T)

NB_avg %>%
  inspect_cor(show_plot = T)

NB_modified %>%
  ggplot(aes(rt, correct, color=lumpiness)) +
  geom_jitter() +
  geom_point() +
  stat_summary(fun.y="mean")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(jitter(skewness), jitter(targets), color=correct)) +
  geom_jitter() +
  geom_point(alpha=0.1)

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(jitter(skewness), jitter(lumpiness), color=correct)) +
  geom_jitter() +
  geom_point() +
  facet_wrap(~condition)

# rt/accuracy and lures
NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(jitter(lures), rt, color=correct, alpha=0.01)) +
  geom_jitter() +
  geom_point(shape=16) +
  geom_smooth(method="lm", se = F) +
  facet_wrap(~condition, scales="free")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(pc1, pc2, color=correct)) +
  geom_point() +
  geom_smooth(method="lm", se = F) +
  facet_wrap(~condition, scales="free")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(pc1, rt, color=correct)) +
  geom_point(alpha=0.3) +
  geom_smooth(method="lm", se = F) +
  facet_wrap(~condition, scales="free")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(pc1, correct, color=correct)) +
  geom_point() +
  geom_smooth(method="lm", se = F) +
  facet_wrap(~condition, scales="free")
```

## TODO

- data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), constraint3=fitness3(history))
- kmeans(NB) (see the sketch below)
- ggplot(kmeans_clusters$accuracy)
- ggplot(kmeans_clusters$rt)
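A minimal sketch of the k-means item above, assuming the intended input is the participant-level `NB_avg` (accuracy and RT, as in the DBSCAN plot) with an arbitrary `k = 3`; the names `km_data`, `kmeans_clusters`, and `kcluster` exist only for this sketch:

```{r kmeans-sketch}
# k-means counterpart of the DBSCAN clustering above; k = 3 is an arbitrary choice
set.seed(1)
km_data <- NB_avg %>% filter(!is.na(correct), !is.na(rt))
kmeans_clusters <- kmeans(scale(km_data[, c("correct", "rt")]), centers = 3)

km_data %>%
  mutate(kcluster = factor(kmeans_clusters$cluster)) %>%
  ggplot(aes(rt, correct, color = kcluster)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~condition)
```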
<!--chapter:end:ccn2019.Rmd-->


---
title: "PLS Training"
output: html_notebook
---

PLS:

```{r}
#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
library(tidyverse)
library(pls)

## 1. load sample data
#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")
rm(NB)
load("./data/CL2015.RData")
data <- NB
str(data)

## 2. clean data (recode the n-back level, drop unused identifier columns)
data <- data %>%
  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
  select(-condition, -stimulus, -block, -trial)
# %>%
#   rename(
#     ev.participant=participant,
#     ev.n=n,
#     ev.block=block,
#     ev.stimulus_type=stimulus_type,
#     rv.choice=choice,
#     rv.rt=rt,
#     rv.correct=correct
#   )

## 3. use cross-validation to find the optimal number of dimensions
pls.model = plsr(rt ~ ., data = data, validation = "CV")

## 3.1. find the model with the lowest CV error
cv <- RMSEP(pls.model)
best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1

## 4. rebuild the model
pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)

## 5. sort and visualize the top coefficients
coefs <- coef(pls.model)
barplot(sort(coefs[,1,1], decreasing = T)[1:4])
```

```{r simulate}
X <- matrix(rnorm(1100), 100, 11)
Y <- matrix(rnorm(400), 100, 4)

pls.model <- plsr(Y ~ X, validation = "CV")
cv <- RMSEP(pls.model)
best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
pls.model <- plsr(Y ~ X, ncomp = best_dims)
coefs <- sort(coef(pls.model)[,1,1], decreasing = T)
barplot(coefs)
```

```{r cca-simulate}
library(corrplot) # for the correlation-matrix plot below

X <- matrix(rnorm(1100), 100, 11)
Y <- matrix(rnorm(400), 100, 4)

M <- cor(cbind(X,Y))
corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
cc <- cancor(X, Y)

#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)
```

```{r plsrglm}
rm(list = ls())

library(plsRglm)

data(Cornell)
df <- Cornell
x <- subset(df, select = -c(Y))
y <- df$Y

## K is the number of folds in CV, and nt is the maximum number of components;
## Cornell's Y is continuous, so a gaussian (rather than logistic) PLS-GLM is fitted here
cv.modpls <- cv.plsRglm(dataY=y, dataX=x, nt=10, modele="pls-glm-gaussian", K=8)

modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-gaussian", sparse=TRUE, sparseStop=TRUE)
res.cv.modpls <- cvtable(summary(cv.modpls))

res6 <- plsR(Y~., data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)
```

<!--chapter:end:pls_playground.Rmd-->