diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R index 421a43f..51bdc8c 100644 --- a/ccn2019/ccn2019-feature-selection.R +++ b/ccn2019/ccn2019-feature-selection.R @@ -9,15 +9,15 @@ load(here("notebooks/data/nback_seqs.Rd")) seqs <- seqs %>% drop_na(rt, correct, tl,sl) -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + participant #f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type set.seed(654321) train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced +seqs.train.imbalanced <- seqs[train.indices,] +seqs.train <- seqs.train.imbalanced #if (toString(f[[2]]) == "correct") # seqs.train <- ROSE(f, data = seqs.train.balanced)$data @@ -40,7 +40,7 @@ number = 3, verbose = T) -model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) #model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) ctrl <- rfeControl(functions = rfFuncs, diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R index 421a43f..51bdc8c 100644 --- a/ccn2019/ccn2019-feature-selection.R +++ b/ccn2019/ccn2019-feature-selection.R @@ -9,15 +9,15 @@ load(here("notebooks/data/nback_seqs.Rd")) seqs <- seqs %>% drop_na(rt, correct, tl,sl) -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + participant #f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type set.seed(654321) train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced +seqs.train.imbalanced <- seqs[train.indices,] +seqs.train <- seqs.train.imbalanced #if (toString(f[[2]]) == "correct") # seqs.train <- ROSE(f, data = seqs.train.balanced)$data @@ -40,7 +40,7 @@ number = 3, verbose = T) -model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) #model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) ctrl <- rfeControl(functions = rfFuncs, diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd index ba1b312..61a178e 100644 --- a/ccn2019/ccn2019.rev3.Rmd +++ b/ccn2019/ccn2019.rev3.Rmd @@ -1,3 +1,8 @@ +--- +output: html_document +editor_options: + chunk_output_type: inline +--- $P=\langle V,D,C,W \rangle$ @@ -5,6 +10,7 @@ #! =============================================== #! load required packages +# install.packages(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE")) library(ggplot2) library(tidyverse) library(stringi) @@ -17,19 +23,21 @@ library(inspectdf) library(caTools) library(pROC) +library(glmnet) +library(ROSE) #! =============================================== #! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) +load(here('data/CL2015.RData')) window_size <- 8 ``` +# Preprocessing -```{r preprocessing} +The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before. -#! =============================================== -#! A function to mark lures in a sequence +```{r} with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) @@ -39,11 +47,25 @@ as.character(stimulus_type[i])) }) } +``` + +After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. +The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1. + +The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times. + +```{r preprocessing} #! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! A function to mark lures in a sequence + +#! =============================================== +#! Extract t,tl,l,ll,u,ul,s,sl,a,al #! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% +seqs.raw <- NB %>% + group_by(participant) %>% + mutate(rt = scale(rt)) %>% + ungroup() %>% group_by(participant, block, condition) %>% mutate(n = ifelse(condition=='2-back',2,3)) %>% mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% @@ -60,36 +82,37 @@ ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) ) %>% nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + mutate(local_stats = map(local_stats, ~.x %>% dplyr::select(-trial,-choice))) %>% ungroup() %>% - select(-participant,-block,-condition) %>% + #dplyr::select(-block,-condition) %>% unnest(local_stats) %>% mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) + mutate(stimulus_type = factor(stimulus_type)) %>% + mutate(participant = factor(participant)) %>% + ungroup() -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) +save(seqs.raw,file=here("data/nback_seqs.Rd")) +#load(here("notebooks/data/nback_seqs.Rd")) -#! =============================================== -#! visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - +## classify RTs (mid and high), low RTs will be removed for now +seqs <- seqs.raw %>% + mutate( + rt_cat=cut(rt,breaks=c(-Inf,-1,+1,+Inf), labels=c("low","mid","high")), + ) %>% + filter(rt_cat!="low") %>% + mutate(rt_cat = droplevels(rt_cat)) %>% + drop_na(rt, rt_cat, correct, tl,sl) %>% + filter(participant != "P25", participant != "P40") # participants with low accuracy are removed (P25, P40) ``` +The following chunk marks highly correlated predictors, but does not work anymore. ```{r remove_highly_correlated_predictors} # WIP: This is an extra step for non-pls methods to remove highly correlated predictors cor_matrix <- cor(seqs[,-1]) @@ -98,3 +121,217 @@ #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` + + +`split_data()` is a function to split the cleaned up dataset into training and test. If `.balance` parameter is set to `TRUE` then the training set is oversampled with regard to the formula. + +```{r split_data} + +split_data <- function(f, data, .balance=F, p = 0.8) { + train.indices <- createDataPartition(data[[toString(f[[2]])]], + p = p, + list =F) + + .train.imbalanced <- data[train.indices,] + + .train <- ifelse(.balance, + ROSE(f, data = .train.imbalanced)$data, + seqs.train.imbalanced) + + .train.x <- model.matrix(f, seqs.train)[,-1] + + .train.y <- seqs.train[[toString(f[[2]])]] + .test <- seqs[-train.indices,] + .test.x <- model.matrix(f, seqs.test)[,-1] + .test.observed_y <- seqs.test[[toString(f[[2]])]] + return(list( + "train" = .train, + "train.x" = .train.x, + "train.y" = .train.y, + "test" = .test, + "test.x" = .test.x, + "test.observed_y" = .test.observed_y + )) +} + +``` + + +## Feature Selection + +the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe `rt_cat`. To change parameters, modify the `f` formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor. + +```{r rfe_feature_selection} +#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll +f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll + +splt <- split_data(f, seqs) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) +rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl) + +message("Optimal Variables: ") +rfeProfile$optVariables + +message("Model-Free Variable Importance") +# ROC for each categories +filterVarImp(as.data.frame(splt$train.x), splt$train.y) + +``` + + +## Analysis of Correctness + +Correctness refers to the `correct` column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence). + +The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice. + +In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models. + +```{r comapre_correct_models} + +old.f <- rt_cat ~ n + t + v +new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + +old.f <- correct ~ n + t + v + rt_cat +new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat + +ctrl <- trainControl(method="cv", + number=5, + classProbs=T, + verbose = T, + sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +train_model <- function(f, splt, ctrl) { + model <- train(splt$train.x, + splt$train.y, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("nzv","center", "scale"), + verboseIter = TRUE, + tuneLength = 5, + #tuneGrid = tune, + trControl = ctrl) + list(model = model, + test.y = model %>% predict(splt$test.x), + test.y_prob = model %>% predict(splt$test.x, type="prob"), + test.observed_y = splt$test.observed_y + ) +} + +splt <- split_data(f, seqs) + + +old.model <- train_model(old.f, splt, ctrl) +new.model <- train_model(new.f, splt, ctrl) + +bwplot(resamples(list(old=old.model$model, new=new.model$model))) +densityplot(resamples(list(old=old.model$model, new=new.model$model))) + +confusionMatrix(old.model$test.y, old.model$test.observed_y) +plot(varImp(old.model$model, scale = F, useModel = F)) + +confusionMatrix(new.model$test.y, new.model$test.observed_y) +plot(varImp(new.model$model, scale = F, useModel = F)) + + +roc(old.model$test.observed_y, + old.model$test.y_prob$mid, + legacy.axes=T, + plot = T, + lwd=2, + col="gray", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 100, + ci = T) + +roc(new.model$test.observed_y, + new.model$test.y_prob$mid, + legacy.axes=T, + add = T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 50, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 100, + ci = T) + +``` + + +## Compare RT models (high vs average RT) + +```{r comapre_rt_models} +f <- rt_cat ~ n + t + v +f <- rt_cat ~ n + v + s + tl + +splt <- split_data(f, seqs) + +ctrl <- trainControl(method="cv", + number=5, + classProbs=T, + verbose = T, + #sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +max_components <- n_distinct(attr(terms(f),"term.labels")) - 2 + +# grid tune +tune <- expand.grid(ncomp=2:max_components) + +model1 <- train(splt$train.x, + splt$train.y, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("nzv","center", "scale"), + verboseIter = TRUE, + #tuneLength = 7, + tuneGrid = tune, + trControl = ctrl) + +# validate model1 +splt$test.y <- model1 %>% predict(splt$test.x) +splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob") + +confusionMatrix(splt$test.y, splt$test.observed_y) +plot(varImp(model1, scale = F, useModel = F)) + +roc(splt$test.observed_y, + splt$test.y_prob$mid, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 0, + print.auc.x = 80, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + +``` + + + +```{r predict_rt} +``` \ No newline at end of file diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R index 421a43f..51bdc8c 100644 --- a/ccn2019/ccn2019-feature-selection.R +++ b/ccn2019/ccn2019-feature-selection.R @@ -9,15 +9,15 @@ load(here("notebooks/data/nback_seqs.Rd")) seqs <- seqs %>% drop_na(rt, correct, tl,sl) -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + participant #f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type set.seed(654321) train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced +seqs.train.imbalanced <- seqs[train.indices,] +seqs.train <- seqs.train.imbalanced #if (toString(f[[2]]) == "correct") # seqs.train <- ROSE(f, data = seqs.train.balanced)$data @@ -40,7 +40,7 @@ number = 3, verbose = T) -model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) #model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) ctrl <- rfeControl(functions = rfFuncs, diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd index ba1b312..61a178e 100644 --- a/ccn2019/ccn2019.rev3.Rmd +++ b/ccn2019/ccn2019.rev3.Rmd @@ -1,3 +1,8 @@ +--- +output: html_document +editor_options: + chunk_output_type: inline +--- $P=\langle V,D,C,W \rangle$ @@ -5,6 +10,7 @@ #! =============================================== #! load required packages +# install.packages(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE")) library(ggplot2) library(tidyverse) library(stringi) @@ -17,19 +23,21 @@ library(inspectdf) library(caTools) library(pROC) +library(glmnet) +library(ROSE) #! =============================================== #! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) +load(here('data/CL2015.RData')) window_size <- 8 ``` +# Preprocessing -```{r preprocessing} +The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before. -#! =============================================== -#! A function to mark lures in a sequence +```{r} with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) @@ -39,11 +47,25 @@ as.character(stimulus_type[i])) }) } +``` + +After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. +The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1. + +The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times. + +```{r preprocessing} #! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! A function to mark lures in a sequence + +#! =============================================== +#! Extract t,tl,l,ll,u,ul,s,sl,a,al #! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% +seqs.raw <- NB %>% + group_by(participant) %>% + mutate(rt = scale(rt)) %>% + ungroup() %>% group_by(participant, block, condition) %>% mutate(n = ifelse(condition=='2-back',2,3)) %>% mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% @@ -60,36 +82,37 @@ ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) ) %>% nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + mutate(local_stats = map(local_stats, ~.x %>% dplyr::select(-trial,-choice))) %>% ungroup() %>% - select(-participant,-block,-condition) %>% + #dplyr::select(-block,-condition) %>% unnest(local_stats) %>% mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) + mutate(stimulus_type = factor(stimulus_type)) %>% + mutate(participant = factor(participant)) %>% + ungroup() -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) +save(seqs.raw,file=here("data/nback_seqs.Rd")) +#load(here("notebooks/data/nback_seqs.Rd")) -#! =============================================== -#! visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - +## classify RTs (mid and high), low RTs will be removed for now +seqs <- seqs.raw %>% + mutate( + rt_cat=cut(rt,breaks=c(-Inf,-1,+1,+Inf), labels=c("low","mid","high")), + ) %>% + filter(rt_cat!="low") %>% + mutate(rt_cat = droplevels(rt_cat)) %>% + drop_na(rt, rt_cat, correct, tl,sl) %>% + filter(participant != "P25", participant != "P40") # participants with low accuracy are removed (P25, P40) ``` +The following chunk marks highly correlated predictors, but does not work anymore. ```{r remove_highly_correlated_predictors} # WIP: This is an extra step for non-pls methods to remove highly correlated predictors cor_matrix <- cor(seqs[,-1]) @@ -98,3 +121,217 @@ #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` + + +`split_data()` is a function to split the cleaned up dataset into training and test. If `.balance` parameter is set to `TRUE` then the training set is oversampled with regard to the formula. + +```{r split_data} + +split_data <- function(f, data, .balance=F, p = 0.8) { + train.indices <- createDataPartition(data[[toString(f[[2]])]], + p = p, + list =F) + + .train.imbalanced <- data[train.indices,] + + .train <- ifelse(.balance, + ROSE(f, data = .train.imbalanced)$data, + seqs.train.imbalanced) + + .train.x <- model.matrix(f, seqs.train)[,-1] + + .train.y <- seqs.train[[toString(f[[2]])]] + .test <- seqs[-train.indices,] + .test.x <- model.matrix(f, seqs.test)[,-1] + .test.observed_y <- seqs.test[[toString(f[[2]])]] + return(list( + "train" = .train, + "train.x" = .train.x, + "train.y" = .train.y, + "test" = .test, + "test.x" = .test.x, + "test.observed_y" = .test.observed_y + )) +} + +``` + + +## Feature Selection + +the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe `rt_cat`. To change parameters, modify the `f` formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor. + +```{r rfe_feature_selection} +#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll +f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll + +splt <- split_data(f, seqs) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) +rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl) + +message("Optimal Variables: ") +rfeProfile$optVariables + +message("Model-Free Variable Importance") +# ROC for each categories +filterVarImp(as.data.frame(splt$train.x), splt$train.y) + +``` + + +## Analysis of Correctness + +Correctness refers to the `correct` column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence). + +The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice. + +In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models. + +```{r comapre_correct_models} + +old.f <- rt_cat ~ n + t + v +new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + +old.f <- correct ~ n + t + v + rt_cat +new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat + +ctrl <- trainControl(method="cv", + number=5, + classProbs=T, + verbose = T, + sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +train_model <- function(f, splt, ctrl) { + model <- train(splt$train.x, + splt$train.y, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("nzv","center", "scale"), + verboseIter = TRUE, + tuneLength = 5, + #tuneGrid = tune, + trControl = ctrl) + list(model = model, + test.y = model %>% predict(splt$test.x), + test.y_prob = model %>% predict(splt$test.x, type="prob"), + test.observed_y = splt$test.observed_y + ) +} + +splt <- split_data(f, seqs) + + +old.model <- train_model(old.f, splt, ctrl) +new.model <- train_model(new.f, splt, ctrl) + +bwplot(resamples(list(old=old.model$model, new=new.model$model))) +densityplot(resamples(list(old=old.model$model, new=new.model$model))) + +confusionMatrix(old.model$test.y, old.model$test.observed_y) +plot(varImp(old.model$model, scale = F, useModel = F)) + +confusionMatrix(new.model$test.y, new.model$test.observed_y) +plot(varImp(new.model$model, scale = F, useModel = F)) + + +roc(old.model$test.observed_y, + old.model$test.y_prob$mid, + legacy.axes=T, + plot = T, + lwd=2, + col="gray", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 100, + ci = T) + +roc(new.model$test.observed_y, + new.model$test.y_prob$mid, + legacy.axes=T, + add = T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 50, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 100, + ci = T) + +``` + + +## Compare RT models (high vs average RT) + +```{r comapre_rt_models} +f <- rt_cat ~ n + t + v +f <- rt_cat ~ n + v + s + tl + +splt <- split_data(f, seqs) + +ctrl <- trainControl(method="cv", + number=5, + classProbs=T, + verbose = T, + #sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +max_components <- n_distinct(attr(terms(f),"term.labels")) - 2 + +# grid tune +tune <- expand.grid(ncomp=2:max_components) + +model1 <- train(splt$train.x, + splt$train.y, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("nzv","center", "scale"), + verboseIter = TRUE, + #tuneLength = 7, + tuneGrid = tune, + trControl = ctrl) + +# validate model1 +splt$test.y <- model1 %>% predict(splt$test.x) +splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob") + +confusionMatrix(splt$test.y, splt$test.observed_y) +plot(varImp(model1, scale = F, useModel = F)) + +roc(splt$test.observed_y, + splt$test.y_prob$mid, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 0, + print.auc.x = 80, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + +``` + + + +```{r predict_rt} +``` \ No newline at end of file diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R index 9bbe23a..14b4ce3 100644 --- a/ccn2019/dummy-vars-playground.R +++ b/ccn2019/dummy-vars-playground.R @@ -7,31 +7,28 @@ rm(seqs) load(here("notebooks/data/nback_seqs.Rd")) +seqs.raw <- seqs -# seqs %>% -# ggplot(aes(x=v,y=a,col=correct)) + -# geom_jitter() + -# geom_point(alpha=0.1) + -# geom_smooth() -f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus -f <- correct ~ n + t + v + stimulus +seqs <- seqs.raw %>% + mutate( + rt_cat=cut(rt,breaks=c(-Inf,-1000,+500,+Inf), labels=c("low","mid","high")), + ) %>% + filter(rt_cat!="low") %>% + mutate( + rt_cat = droplevels(rt_cat) + ) %>% + drop_na(rt, rt_cat, correct, tl,sl) -set.seed(654321) - -# 1. dummy vars -# INPUTS : seqs -# OUTPUTS: seqs.dmy - -seqs <- seqs %>% - drop_na(rt, correct, tl,sl) - +f <- rt_cat ~ n + t + v +#f <- rt_cat ~ n + t + v + s + l + vl + sl + tl + ul + ll +#f <- correct ~ n + participant train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced -# seqs.train <- ROSE(f, data = seqs.train.balanced)$data +seqs.train.imbalanced <- seqs[train.indices,] +seqs.train <- seqs.train.imbalanced +seqs.train <- ROSE(f, data = seqs.train.balanced)$data seqs.train.x <- model.matrix(f, seqs.train)[,-1] seqs.train.y <- seqs.train[[toString(f[[2]])]] @@ -42,7 +39,7 @@ # ROC for each var -filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) +#filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) # model <- cv.glmnet(seqs.train.x, # seqs.train.y, @@ -57,26 +54,23 @@ number=1, classProbs=T, verbose = T, -# sampling = "up", + sampling = "up", savePredictions = T, summaryFunction=twoClassSummary) -# glmnet tune -tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) - -max_components <- n_distinct(attr(terms(f),"term.labels")) +max_components <- n_distinct(attr(terms(f),"term.labels")) -1 # pls tune -tune <- expand.grid(ncomp=1:max_components) +tune <- expand.grid(ncomp=2:max_components) model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - #family = "binomial", - #metric = "ROC", - preProc = c("nzv","center", "scale"), - #verboseIter = TRUE, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("center", "scale"), + verboseIter = TRUE, tuneLength = 2, - #tuneGrid = tune, + tuneGrid = tune, trControl = ctrl) model$bestTune @@ -89,11 +83,8 @@ plot(varImp(model, scale = F, useModel = F)) -library(pROC) - - roc(seqs.test.observed_y, - seqs.test.y_prob$YES, + seqs.test.y_prob$mid, legacy.axes=T, plot = T, lwd=2, diff --git a/ccn2019/ccn2019-feature-selection.R b/ccn2019/ccn2019-feature-selection.R index 421a43f..51bdc8c 100644 --- a/ccn2019/ccn2019-feature-selection.R +++ b/ccn2019/ccn2019-feature-selection.R @@ -9,15 +9,15 @@ load(here("notebooks/data/nback_seqs.Rd")) seqs <- seqs %>% drop_na(rt, correct, tl,sl) -f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type +f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type + participant #f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll + stimulus_type set.seed(654321) train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced +seqs.train.imbalanced <- seqs[train.indices,] +seqs.train <- seqs.train.imbalanced #if (toString(f[[2]]) == "correct") # seqs.train <- ROSE(f, data = seqs.train.balanced)$data @@ -40,7 +40,7 @@ number = 3, verbose = T) -model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) +#model <- train(seqs.train.x, seqs.train.y, method = "glmStepAIC", trControl = ctrl) #model <- train(seqs.train.x, seqs.train.y, method = "ORFpls", trControl = ctrl) ctrl <- rfeControl(functions = rfFuncs, diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd index ba1b312..61a178e 100644 --- a/ccn2019/ccn2019.rev3.Rmd +++ b/ccn2019/ccn2019.rev3.Rmd @@ -1,3 +1,8 @@ +--- +output: html_document +editor_options: + chunk_output_type: inline +--- $P=\langle V,D,C,W \rangle$ @@ -5,6 +10,7 @@ #! =============================================== #! load required packages +# install.packages(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE")) library(ggplot2) library(tidyverse) library(stringi) @@ -17,19 +23,21 @@ library(inspectdf) library(caTools) library(pROC) +library(glmnet) +library(ROSE) #! =============================================== #! load data set and set running window size -load(here('notebooks/data/CL2015.RData')) +load(here('data/CL2015.RData')) window_size <- 8 ``` +# Preprocessing -```{r preprocessing} +The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before. -#! =============================================== -#! A function to mark lures in a sequence +```{r} with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) @@ -39,11 +47,25 @@ as.character(stimulus_type[i])) }) } +``` + +After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. +The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1. + +The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times. + +```{r preprocessing} #! =============================================== -#! Preprocess data set to add t,tl,l,ll,u,ul,s,sl,a,al +#! A function to mark lures in a sequence + +#! =============================================== +#! Extract t,tl,l,ll,u,ul,s,sl,a,al #! a and al are respectively accuracy and recent accuracy -seqs <- NB %>% +seqs.raw <- NB %>% + group_by(participant) %>% + mutate(rt = scale(rt)) %>% + ungroup() %>% group_by(participant, block, condition) %>% mutate(n = ifelse(condition=='2-back',2,3)) %>% mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>% @@ -60,36 +82,37 @@ ul = ifelse(is.na(ul) | is.infinite(ul), NA, ul) ) %>% nest(.key='local_stats') %>% - #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>% mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>% mutate(t = map_dbl(local_stats, ~length(which(.x$stimulus_type=='target')))) %>% mutate(l = map_dbl(local_stats, ~length(which(.x$stimulus_type=='lure')))) %>% mutate(s = map_dbl(local_stats, ~sum(sort(table(.x$stimulus), decreasing = T)[1:2]) - 1)) %>% mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>% - # mutate(dp = map_dbl(local_stats, - # ~length(which(.x$stimulus_type=="target" & .x$correct)) - - # length(which(.x$stimulus_type!="target" & !.x$correct)))) %>% - # mutate(cr = map_dbl(local_stats, - # ~-(length(which(.x$stimulus_type=="target" & .x$correct==T)) + - # length(which(.x$stimulus_type!="target" & .x$correct==F)))/2)) %>% - mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-choice))) %>% - #mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-choice))) %>% + mutate(local_stats = map(local_stats, ~.x %>% dplyr::select(-trial,-choice))) %>% ungroup() %>% - select(-participant,-block,-condition) %>% + #dplyr::select(-block,-condition) %>% unnest(local_stats) %>% mutate(correct = factor(as.numeric(correct), labels=c("NO","YES"))) %>% mutate(stimulus = factor(stimulus)) %>% - mutate(stimulus_type = factor(stimulus_type)) + mutate(stimulus_type = factor(stimulus_type)) %>% + mutate(participant = factor(participant)) %>% + ungroup() -save(seqs,file=here("notebooks/data/nback_seqs.Rd")) +save(seqs.raw,file=here("data/nback_seqs.Rd")) +#load(here("notebooks/data/nback_seqs.Rd")) -#! =============================================== -#! visualize correlations -#DEBUG inspect_cor(seqs, show_plot = T) - +## classify RTs (mid and high), low RTs will be removed for now +seqs <- seqs.raw %>% + mutate( + rt_cat=cut(rt,breaks=c(-Inf,-1,+1,+Inf), labels=c("low","mid","high")), + ) %>% + filter(rt_cat!="low") %>% + mutate(rt_cat = droplevels(rt_cat)) %>% + drop_na(rt, rt_cat, correct, tl,sl) %>% + filter(participant != "P25", participant != "P40") # participants with low accuracy are removed (P25, P40) ``` +The following chunk marks highly correlated predictors, but does not work anymore. ```{r remove_highly_correlated_predictors} # WIP: This is an extra step for non-pls methods to remove highly correlated predictors cor_matrix <- cor(seqs[,-1]) @@ -98,3 +121,217 @@ #FIXME remove by column name seqs.uncorr <- seqs %>% select(-high_cor_remove) ``` + + +`split_data()` is a function to split the cleaned up dataset into training and test. If `.balance` parameter is set to `TRUE` then the training set is oversampled with regard to the formula. + +```{r split_data} + +split_data <- function(f, data, .balance=F, p = 0.8) { + train.indices <- createDataPartition(data[[toString(f[[2]])]], + p = p, + list =F) + + .train.imbalanced <- data[train.indices,] + + .train <- ifelse(.balance, + ROSE(f, data = .train.imbalanced)$data, + seqs.train.imbalanced) + + .train.x <- model.matrix(f, seqs.train)[,-1] + + .train.y <- seqs.train[[toString(f[[2]])]] + .test <- seqs[-train.indices,] + .test.x <- model.matrix(f, seqs.test)[,-1] + .test.observed_y <- seqs.test[[toString(f[[2]])]] + return(list( + "train" = .train, + "train.x" = .train.x, + "train.y" = .train.y, + "test" = .test, + "test.x" = .test.x, + "test.observed_y" = .test.observed_y + )) +} + +``` + + +## Feature Selection + +the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe `rt_cat`. To change parameters, modify the `f` formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor. + +```{r rfe_feature_selection} +#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll +f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll + +splt <- split_data(f, seqs) + +ctrl <- rfeControl(functions = rfFuncs, + method = "cv", + number = 3, + verbose = T) +rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl) + +message("Optimal Variables: ") +rfeProfile$optVariables + +message("Model-Free Variable Importance") +# ROC for each categories +filterVarImp(as.data.frame(splt$train.x), splt$train.y) + +``` + + +## Analysis of Correctness + +Correctness refers to the `correct` column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence). + +The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice. + +In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models. + +```{r comapre_correct_models} + +old.f <- rt_cat ~ n + t + v +new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + +old.f <- correct ~ n + t + v + rt_cat +new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat + +ctrl <- trainControl(method="cv", + number=5, + classProbs=T, + verbose = T, + sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +train_model <- function(f, splt, ctrl) { + model <- train(splt$train.x, + splt$train.y, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("nzv","center", "scale"), + verboseIter = TRUE, + tuneLength = 5, + #tuneGrid = tune, + trControl = ctrl) + list(model = model, + test.y = model %>% predict(splt$test.x), + test.y_prob = model %>% predict(splt$test.x, type="prob"), + test.observed_y = splt$test.observed_y + ) +} + +splt <- split_data(f, seqs) + + +old.model <- train_model(old.f, splt, ctrl) +new.model <- train_model(new.f, splt, ctrl) + +bwplot(resamples(list(old=old.model$model, new=new.model$model))) +densityplot(resamples(list(old=old.model$model, new=new.model$model))) + +confusionMatrix(old.model$test.y, old.model$test.observed_y) +plot(varImp(old.model$model, scale = F, useModel = F)) + +confusionMatrix(new.model$test.y, new.model$test.observed_y) +plot(varImp(new.model$model, scale = F, useModel = F)) + + +roc(old.model$test.observed_y, + old.model$test.y_prob$mid, + legacy.axes=T, + plot = T, + lwd=2, + col="gray", + print.auc=T, + percent = T, + print.auc.y = 40, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 100, + ci = T) + +roc(new.model$test.observed_y, + new.model$test.y_prob$mid, + legacy.axes=T, + add = T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 50, + print.auc.x = 55, + lty = 1, + of = "se", + boot.n = 100, + ci = T) + +``` + + +## Compare RT models (high vs average RT) + +```{r comapre_rt_models} +f <- rt_cat ~ n + t + v +f <- rt_cat ~ n + v + s + tl + +splt <- split_data(f, seqs) + +ctrl <- trainControl(method="cv", + number=5, + classProbs=T, + verbose = T, + #sampling = "up", + savePredictions = T, + summaryFunction=twoClassSummary) + +max_components <- n_distinct(attr(terms(f),"term.labels")) - 2 + +# grid tune +tune <- expand.grid(ncomp=2:max_components) + +model1 <- train(splt$train.x, + splt$train.y, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("nzv","center", "scale"), + verboseIter = TRUE, + #tuneLength = 7, + tuneGrid = tune, + trControl = ctrl) + +# validate model1 +splt$test.y <- model1 %>% predict(splt$test.x) +splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob") + +confusionMatrix(splt$test.y, splt$test.observed_y) +plot(varImp(model1, scale = F, useModel = F)) + +roc(splt$test.observed_y, + splt$test.y_prob$mid, + legacy.axes=T, + plot = T, + lwd=2, + col="black", + print.auc=T, + percent = T, + print.auc.y = 0, + print.auc.x = 80, + lty = 1, + of = "se", + boot.n = 200, + ci = T) + +``` + + + +```{r predict_rt} +``` \ No newline at end of file diff --git a/ccn2019/dummy-vars-playground.R b/ccn2019/dummy-vars-playground.R index 9bbe23a..14b4ce3 100644 --- a/ccn2019/dummy-vars-playground.R +++ b/ccn2019/dummy-vars-playground.R @@ -7,31 +7,28 @@ rm(seqs) load(here("notebooks/data/nback_seqs.Rd")) +seqs.raw <- seqs -# seqs %>% -# ggplot(aes(x=v,y=a,col=correct)) + -# geom_jitter() + -# geom_point(alpha=0.1) + -# geom_smooth() -f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll + stimulus -f <- correct ~ n + t + v + stimulus +seqs <- seqs.raw %>% + mutate( + rt_cat=cut(rt,breaks=c(-Inf,-1000,+500,+Inf), labels=c("low","mid","high")), + ) %>% + filter(rt_cat!="low") %>% + mutate( + rt_cat = droplevels(rt_cat) + ) %>% + drop_na(rt, rt_cat, correct, tl,sl) -set.seed(654321) - -# 1. dummy vars -# INPUTS : seqs -# OUTPUTS: seqs.dmy - -seqs <- seqs %>% - drop_na(rt, correct, tl,sl) - +f <- rt_cat ~ n + t + v +#f <- rt_cat ~ n + t + v + s + l + vl + sl + tl + ul + ll +#f <- correct ~ n + participant train.indices <- createDataPartition(seqs[[toString(f[[2]])]], p = .8, list =FALSE) -seqs.train.balanced <- seqs[train.indices,] -seqs.train <- seqs.train.balanced -# seqs.train <- ROSE(f, data = seqs.train.balanced)$data +seqs.train.imbalanced <- seqs[train.indices,] +seqs.train <- seqs.train.imbalanced +seqs.train <- ROSE(f, data = seqs.train.balanced)$data seqs.train.x <- model.matrix(f, seqs.train)[,-1] seqs.train.y <- seqs.train[[toString(f[[2]])]] @@ -42,7 +39,7 @@ # ROC for each var -filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) +#filterVarImp(as.data.frame(seqs.train.x), seqs.train.y) # model <- cv.glmnet(seqs.train.x, # seqs.train.y, @@ -57,26 +54,23 @@ number=1, classProbs=T, verbose = T, -# sampling = "up", + sampling = "up", savePredictions = T, summaryFunction=twoClassSummary) -# glmnet tune -tune <- expand.grid(alpha = 0:1, lambda = seq(0, 0.01, length = 100)) - -max_components <- n_distinct(attr(terms(f),"term.labels")) +max_components <- n_distinct(attr(terms(f),"term.labels")) -1 # pls tune -tune <- expand.grid(ncomp=1:max_components) +tune <- expand.grid(ncomp=2:max_components) model <- train(seqs.train.x, seqs.train.y, - method = "glmnet", - #family = "binomial", - #metric = "ROC", - preProc = c("nzv","center", "scale"), - #verboseIter = TRUE, + method = "pls", + family = "binomial", + metric = "ROC", + preProc = c("center", "scale"), + verboseIter = TRUE, tuneLength = 2, - #tuneGrid = tune, + tuneGrid = tune, trControl = ctrl) model$bestTune @@ -89,11 +83,8 @@ plot(varImp(model, scale = F, useModel = F)) -library(pROC) - - roc(seqs.test.observed_y, - seqs.test.y_prob$YES, + seqs.test.y_prob$mid, legacy.axes=T, plot = T, lwd=2, diff --git a/data/nback_seqs.Rd b/data/nback_seqs.Rd index 50e4b86..6ca2c1b 100644 --- a/data/nback_seqs.Rd +++ b/data/nback_seqs.Rd Binary files differ