$P=\langle V,D,C,W \rangle$ ```{r setup, message=FALSE, include=FALSE, paged.print=FALSE} #! =============================================== #! load required packages library(ggplot2) library(tidyverse) library(stringi) library(pls) library(caret) library(here) library(tsibble) library(broom) library(rsample) library(inspectdf) library(caTools) library(pROC) #! =============================================== #! load data set and set running window size load(here('data/CL2015.RData')) window_size <- 8 ``` ```{r preprocessing} #! =============================================== #! A function to mark lures in a sequence with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i]))) ifelse(are_valid_trials && stimulus[i] %in% lures, "lure", as.character(stimulus_type[i])) }) } #! =============================================== #! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al #! a and al are respectively accuracy and recent accuracy seqs <- NB %>% group_by(participant, block, condition) %>% mutate(n = ifelse(condition=='2-back',2,3)) %>% mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% mutate(tl = slide2_dbl(stimulus_type, rt, ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size), ll = slide2_dbl(stimulus_type, rt, ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size), sl = slide_dbl(stimulus_type, ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size), ul = slide_dbl(stimulus, ~max(table(.))-1, .partial=T, .size=window_size), vl = slide_dbl(stimulus, ~length(unique(.)), .partial=T, .size=window_size), al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size), sl = ifelse(is.na(sl), 0, sl), tl = ifelse(is.na(tl), NA, tl), ll = ifelse(is.na(ll), NA, ll), al = ifelse(is.na(al), NA, al) ) %>% nest(.key='local_stats') %>% #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% mutate(dp = map_dbl(local_stats, ~length(which(.x$stimulus_type=="target" & .x$correct)) - length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% mutate(cr = map_dbl(local_stats, ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>% #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% ungroup() %>% select(-participant,-block,-condition) %>% unnest(local_stats) #! =============================================== #! visualize correlations #DEBUG inspect_cor(seqs, show_plot = T) save(seqs,file=here("data/nback_seqs.Rd")) ``` ```{r remove_highly_correlated_predictors} # WIP: This is an extra step for non-pls methods to remove highly correlated predictors cor_matrix <- cor(seqs[,-1]) cor_high <- findCorrelation(cor_matrix, 0.8) high_cor_remove <- row.names(cor_matrix)[cor_high] #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` ```{r models} #! =============================================== #! prepare data for modeling (remove na, etc) seqs <- seqs %>% filter(!is.na(correct), !is.na(rt)) %>% mutate(correct=factor(as.numeric(correct),labels=c("INCORRECT","CORRECT"))) #mutate(correct=as.numeric(correct)) #FIXME remove outcomes before dummy out the data and imputing missing values # replace factors with dummy data #seqs.dummy <- predict(dummyVars(~.,data=seqs[,-1]),seqs[,-1]) # impute missing values #seqs.imputed <- predict(preProcess(seqs.dummy, "bagImpute"), seqs.dummy) #DEBUG View(seqs.imputed) #! =============================================== #! split into train/test (consider class-inbalence for the correct) set.seed(42) train_indexes <- createDataPartition(seqs$correct, times = 1, p = 0.7, list = F) seqs.train <- seqs[train_indexes,] seqs.test <- seqs[-train_indexes,] #! =============================================== #! training parameters for the PLS models plsTrControl <- trainControl( method = "cv", number = 5, verboseIter = TRUE ) #==================================================# # Train PLS model (block-level accuracy) pls.fit.accuracy <- train( a ~ .-rt-al-correct-dp-cr, data = seqs.train, method = "pls", tuneLength = 20, trControl = plsTrControl, preProc = c("center","scale")) # Check CV profile plot(pls.fit.accuracy) # PLS variable importance plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance") pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw") ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) + geom_point() #==================================================# # Train PLS model (rt) train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt train_data_y <- (seqs.train %>% select(rt))$rt # only rt pls.fit.rt <- train( train_data_x, train_data_y, method = "pls", tuneLength = 20, trControl = plsTrControl, preProc = c("center","scale")) # Check CV profile plot(pls.fit.rt) # PLS variable importance plot(varImp(pls.fit.rt), main="RT - Variable Importance") pls.predicted.rt <- predict(pls.fit.rt, seqs.test) #FIXME measure performance with accuracy, etc. #confusionMatrix(pls.predicted.rt, seqs.test$rt) #colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T) #==================================================# ## OLD MODEL (only global features) pls.fit.accuracy.old <- train( a ~ n+t+v, data = seqs.train, method = "pls", tuneLength = 20, trControl = plsTrControl, preProc = c("center","scale")) # Check CV profile plot(pls.fit.accuracy.old) # PLS variable importance plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance") pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw") ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) + geom_point() #FIXME confusionMatrix(glm.fit.predicted.old, test_data$correct, ) colAUC(glm.fit.predicted.old, test_data$correct, plotROC=T) #TODO use pROC to viz AUX roc(test_data$correct,glm.fit.predicted.old) ```