Newer
Older
notebooks / ccn2019 / ccn2019.rev3.Rmd
---
output: html_document
editor_options: 
  chunk_output_type: inline
---

```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
#! ===============================================
#! load required packages

# install.packages(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE"))
library(ggplot2)
library(tidyverse)
library(stringi)
library(pls)
library(caret)
library(here)
library(tsibble)
library(broom)
library(rsample)
library(inspectdf)
library(caTools)
library(pROC)
library(glmnet)
library(ROSE)

#! ===============================================
#! load data set and set running window size
load(here('data/CL2015.RData'))
window_size <- 8

```

# Preprocessing

The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before.

```{r}
with_lures <- function(stimulus, stimulus_type, n) {
  sapply(1:length(stimulus), function(i) {
    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
    ifelse(are_valid_trials && stimulus[i] %in% lures,
           "lure", 
           as.character(stimulus_type[i]))
  })
}
```

After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences.
The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.

The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.

```{r preprocessing, message=FALSE, warning=FALSE, include=FALSE}

#! ===============================================
#! A function to mark lures in a sequence

#! ===============================================
#! Extract t,tl,l,ll,u,ul,s,sl,a,al
#! a and al are respectively accuracy and recent accuracy
seqs.raw <- NB %>%
  group_by(participant) %>%
  mutate(rt = scale(rt)) %>%
  ungroup() %>%
  group_by(participant, block, condition) %>%
  mutate(n = ifelse(condition=='2-back',2,3)) %>%
  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
  mutate(tl = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size),
         ll = slide2_dbl(lag(stimulus_type), lag(rt), ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
         sl = slide_dbl(lag(stimulus_type), ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
         ul = slide_dbl(lag(stimulus), ~max(table(.x))-1, .partial=T, .size=window_size),
         vl = slide_dbl(lag(stimulus), ~n_distinct(.x,na.rm=T), .partial=T, .size=window_size),
         al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
         sl = ifelse(is.na(sl), NA, sl),
         tl = ifelse(is.na(tl), NA, tl),
         ll = ifelse(is.na(ll), NA, ll),
         al = ifelse(is.na(al), NA, al),
         ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul)
) %>%
  nest(.key='local_stats') %>%
  mutate(a  = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
  mutate(t  = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>%
  mutate(l  = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>%
  mutate(s  = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>%
  mutate(v  = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
  mutate(local_stats = map(local_stats, ~.x %>% dplyr::select(-trial,-choice))) %>%
  ungroup() %>%
  #dplyr::select(-block,-condition) %>%
  unnest(local_stats) %>%
  mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>%
  mutate(stimulus = factor(stimulus)) %>%
  mutate(stimulus_type = factor(stimulus_type)) %>%
  mutate(participant = factor(participant)) %>%
  ungroup()

save(seqs.raw,file=here("data/nback_seqs.Rd"))
#load(here("notebooks/data/nback_seqs.Rd"))

# seqs is the main dataset of sequences including all stats extracted from a block or the sliding history window.
## classify RTs (mid and high), low RTs will be removed for now
seqs <- seqs.raw %>% 
  mutate(
    rt_cat=cut(rt,breaks=c(-Inf,-1,+1,+Inf), labels=c("low","mid","high")),
  ) %>%
  filter(rt_cat!="low") %>%
  mutate(rt_cat = droplevels(rt_cat)) %>%
  drop_na(rt, rt_cat, correct, tl,sl) %>%
  filter(participant != "P25", participant != "P40") # participants with low accuracy are removed (P25, P40)

```

The following chunk marks highly correlated predictors, but does not work anymore.
```{r remove_highly_correlated_predictors}
# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
#cor_matrix <- cor(seqs[,-1])
#cor_high <- findCorrelation(cor_matrix, 0.8)
#high_cor_remove <- row.names(cor_matrix)[cor_high]
#FIXME remove by column name
#seqs.uncorr <- seqs %>% select(-high_cor_remove)
```


`split_data()` is a function to split the cleaned up dataset into training and test. If `.balance` parameter is set to `TRUE` then the training set is oversampled with regard to the formula.

```{r split_data}

split_data <- function(f, data, .balance=F, p = 0.8) {
  train.indices <- createDataPartition(data[[toString(f[[2]])]], 
                                       p = p, 
                                       list =F)
  
  .train.imbalanced <- data[train.indices,]
  
  .train <- ifelse(.balance, 
                   ROSE(f, data = .train.imbalanced)$data,
                   seqs.train.imbalanced)
  
  .train.x <- model.matrix(f, seqs.train)[,-1]

  .train.y <- seqs.train[[toString(f[[2]])]]
  .test  <- seqs[-train.indices,]
  .test.x <-  model.matrix(f, seqs.test)[,-1]
  .test.observed_y <- seqs.test[[toString(f[[2]])]]
  return(list(
    "train" = .train,
    "train.x" = .train.x,
    "train.y" = .train.y,
    "test" = .test,
    "test.x" = .test.x,
    "test.observed_y" = .test.observed_y
  ))
}

```


## Feature Selection

the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe `rt_cat`. To change parameters, modify the `f` formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor.

```{r rfe_feature_selection}
#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll# + rt_cat
#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat

splt <- split_data(f, seqs)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "cv",
                   number = 3,
                   verbose = T)
rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)

message("Optimal Variables: ")
rfeProfile$optVariables

# ROC for each categories
filterVarImp(as.data.frame(splt$train.x), splt$train.y)

```


## Analysis of Correctness

Correctness refers to the `correct` column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence).

The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice.

In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models.

Note: uncomment desired `old.f` and `new.f` formulas to run respective analysis.

```{r comapre_correct_models}

#1. atypical local features lead to longer response time.
#old.f <- rt_cat ~ n + t + v + rt_cat
#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat

old.f <- correct ~ n + t
new.f <- correct ~ n + t + rt_cat

#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat

#2. local features provide a better prediction for the correctness of choice
#old.f <- correct ~ n + t + v
#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll

#3. Longer 

ctrl <- trainControl(method="cv",
                     number=5, 
                     classProbs=T,
                     verbose = T,
                     sampling = "up",
                     savePredictions = T,
                     summaryFunction=twoClassSummary)
  
train_model <- function(f, data, ctrl) {
  splt <- split_data(f, data, .balance = T)
  model <- train(splt$train.x,
               splt$train.y, 
               method = "pls",
               family = "binomial",
               metric = "ROC",
               preProc = c("nzv","center", "scale"),
               verboseIter = TRUE,
               tuneLength = 5,
               #tuneGrid = tune,
               trControl = ctrl)
  list(model  = model,
       test.y = model %>% predict(splt$test.x),
       test.y_prob = model %>% predict(splt$test.x, type="prob"),
       test.observed_y = splt$test.observed_y
  )
}

old.model <- seqs %>%
  #filter(rt_cat == "high") %>%
  train_model(old.f, ., ctrl)

new.model <- seqs %>%
  #filter(rt_cat == "high") %>%
  train_model(new.f, ., ctrl)

bwplot(resamples(list(old=old.model$model, new=new.model$model)))
densityplot(resamples(list(old=old.model$model, new=new.model$model)))

confusionMatrix(old.model$test.y, old.model$test.observed_y)
plot(varImp(old.model$model, scale = F, useModel = F))

confusionMatrix(new.model$test.y, new.model$test.observed_y)
plot(varImp(new.model$model, scale = F, useModel = F))

roc(old.model$test.observed_y,
    old.model$test.y_prob$YES,
    legacy.axes=T,
    plot = T,
    lwd=2,
    col="gray",
    print.auc=T,
    percent = T,
    print.auc.y = 40,
    print.auc.x = 55,
    lty = 1,
    of = "se",
    boot.n = 100,
    ci = T)

roc(new.model$test.observed_y,
    new.model$test.y_prob$YES,
    legacy.axes=T,
    add = T,
    plot = T,
    lwd=2,
    col="black",
    print.auc=T,
    percent = T,
    print.auc.y = 50,
    print.auc.x = 55,
    lty = 1,
    of = "se",
    boot.n = 100,
    ci = T)

```


## RT Analysis

```{r comapre_rt_models}
f <- rt_cat ~ n + t + v
f <- rt_cat ~ n + v + s + tl

splt <- split_data(f, seqs)

ctrl <- trainControl(method="cv",
                     number=5, 
                     classProbs=T,
                     verbose = T,
                     #sampling = "up",
                     savePredictions = T,
                     summaryFunction=twoClassSummary)

max_components <- n_distinct(attr(terms(f),"term.labels")) - 2

# grid tune
tune <- expand.grid(ncomp=2:max_components)

model1 <- train(splt$train.x,
               splt$train.y, 
               method = "pls",
               family = "binomial",
               metric = "ROC",
               preProc = c("nzv","center", "scale"),
               verboseIter = TRUE,
               #tuneLength = 7,
               tuneGrid = tune,
               trControl = ctrl)

# validate model1
splt$test.y <- model1 %>% predict(splt$test.x)
splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")

confusionMatrix(splt$test.y, splt$test.observed_y)
plot(varImp(model1, scale = F, useModel = F))

roc(splt$test.observed_y,
    splt$test.y_prob$mid,
    legacy.axes=T,
    plot = T,
    lwd=2,
    col="black",
    print.auc=T,
    percent = T,
    print.auc.y = 0,
    print.auc.x = 80,
    lty = 1,
    of = "se",
    boot.n = 200,
    ci = T)

```



```{r predict_rt}
```