```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
#! ===============================================
#! load required packages
library(ggplot2)
library(tidyverse)
library(stringi)
library(pls)
library(caret)
library(here)
library(tsibble)
library(broom)
library(rsample)
library(inspectdf)
library(caTools)
library(pROC)
#! ===============================================
#! load data set and set running window size
load(here('data/CL2015.RData'))
window_size <- 8
```
```{r preprocessing}
#! ===============================================
#! A function to mark lures in a sequence
with_lures <- function(stimulus, stimulus_type, n) {
sapply(1:length(stimulus), function(i) {
lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
ifelse(are_valid_trials && stimulus[i] %in% lures,
"lure",
as.character(stimulus_type[i]))
})
}
#! ===============================================
#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al
#! a and al are respectively accuracy and recent accuracy
seqs <- NB %>%
group_by(participant, block, condition) %>%
mutate(n = ifelse(condition=='2-back',2,3)) %>%
mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
mutate(tl = slide2_dbl(stimulus_type, rt, ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size),
ll = slide2_dbl(stimulus_type, rt, ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
sl = slide_dbl(stimulus_type, ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
ul = slide_dbl(stimulus, ~max(table(.))-1, .partial=T, .size=window_size),
vl = slide_dbl(stimulus, ~length(unique(.)), .partial=T, .size=window_size),
al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
sl = ifelse(is.na(sl), 0, sl),
tl = ifelse(is.na(tl), NA, tl),
ll = ifelse(is.na(ll), NA, ll),
al = ifelse(is.na(al), NA, al)
) %>%
nest(.key='local_stats') %>%
#mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
mutate(dp = map_dbl(local_stats,
~length(which(.x$stimulus_type=="target" & .x$correct)) -
length(which(.x$stimulus_type!="target" & !.x$correct)))) %>%
mutate(cr = map_dbl(local_stats,
~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) +
length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>%
mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
#mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>%
ungroup() %>%
select(-participant,-block,-condition) %>%
unnest(local_stats)
#! ===============================================
#! visualize correlations
#DEBUG inspect_cor(seqs, show_plot = T)
save(seqs,file=here("data/nback_seqs.Rd"))
```
```{r remove_highly_correlated_predictors}
# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
cor_matrix <- cor(seqs[,-1])
cor_high <- findCorrelation(cor_matrix, 0.8)
high_cor_remove <- row.names(cor_matrix)[cor_high]
#FIXME remove by column name
seqs.uncorr <- seqs %>% select(-high_cor_remove)
```
```{r models}
#! ===============================================
#! prepare data for modeling (remove na, etc)
seqs <- seqs %>%
filter(!is.na(correct), !is.na(rt)) %>%
mutate(correct=factor(as.numeric(correct),labels=c("I","C")))
#mutate(correct=as.numeric(correct))
#FIXME remove outcomes before dummy out the data and imputing missing values
# replace factors with dummy data
#seqs.dummy <- predict(dummyVars(~.,data=seqs[,-1]),seqs[,-1])
# impute missing values
#seqs.imputed <- predict(preProcess(seqs.dummy, "bagImpute"), seqs.dummy)
#DEBUG View(seqs.imputed)
#! ===============================================
#! split into train/test (consider class-inbalence for the correct)
set.seed(42)
train_indexes <- createDataPartition(seqs$correct,
times = 1,
p = 0.7,
list = F)
seqs.train <- seqs[train_indexes,]
seqs.test <- seqs[-train_indexes,]
#! ===============================================
#! training parameters for the PLS models
plsTrControl <- trainControl(
method = "cv",
number = 5,
verboseIter = TRUE
)
#==================================================#
# Train PLS model (block-level accuracy)
pls.fit.accuracy <- train(
a ~ .-rt-al-correct-dp-cr,
data = seqs.train,
method = "pls",
tuneLength = 20,
trControl = plsTrControl,
preProc = c("center","scale"))
# Check CV profile
plot(pls.fit.accuracy)
# PLS variable importance
plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance")
pls.fit.accuracy.predicted <- predict(pls.fit.accuracy, seqs.test, type="raw")
ggplot(seqs.test, aes(pls.fit.accuracy.predicted, a)) +
geom_point()
#==================================================#
# Train PLS model (rt)
train_data_x <- seqs.train %>% select(-rt,-a,-correct) # all except rt
train_data_y <- (seqs.train %>% select(rt))$rt # only rt
pls.fit.rt <- train(
train_data_x,
train_data_y,
method = "pls",
tuneLength = 20,
trControl = plsTrControl,
preProc = c("center","scale"))
# Check CV profile
plot(pls.fit.rt)
# PLS variable importance
plot(varImp(pls.fit.rt), main="RT - Variable Importance")
pls.predicted.rt <- predict(pls.fit.rt, seqs.test)
#FIXME measure performance with accuracy, etc.
#confusionMatrix(pls.predicted.rt, seqs.test$rt)
#colAUC(pls.predicted.rt, seqs.test$rt, plotROC=T)
#==================================================#
## OLD MODEL (only global features)
pls.fit.accuracy.old <- train(
a ~ n+t+v,
data = seqs.train,
method = "pls",
tuneLength = 20,
trControl = plsTrControl,
preProc = c("center","scale"))
# Check CV profile
plot(pls.fit.accuracy.old)
# PLS variable importance
plot(varImp(pls.fit.accuracy.old), main="Accuracy (old) - Variable Importance")
pls.fit.accuracy.old.predicted <- predict(pls.fit.accuracy.old, seqs.test, type="raw")
ggplot(seqs.test, aes(pls.fit.accuracy.old.predicted, a)) +
geom_point()
#FIXME
confusionMatrix(glm.fit.predicted.old, test_data$correct, )
colAUC(glm.fit.predicted.old, test_data$correct, plotROC=T)
#TODO use pROC to viz AUX roc(test_data$correct,glm.fit.predicted.old)
```