diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd
index 0db19ef..e62332e 100644
--- a/ccn2019/ccn2019.rev3.Rmd
+++ b/ccn2019/ccn2019.rev3.Rmd
@@ -1,14 +1,32 @@
 ---
 output: html_document
 editor_options:
-  chunk_output_type: inline
+  chunk_output_type: console
 ---
 
 ```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+options(repos=c(CRAN="http://cran.cnr.berkeley.edu/"))
+
 #! ===============================================
 #! load required packages
-# install.packages(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE"))
+# NOTE: require() takes a single package name, not a vector, so the guard
+# below cannot run as written; loop over the names with requireNamespace()
+# instead, or install the missing packages manually.
+# if(!require(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE"))){
+#   install.packages(c("ggplot2", "tidyverse", "stringi", "pls", "caret",
+#                      "here", "tsibble", "broom", "rsample", "inspectdf",
+#                      "caTools", "pROC", "glmnet", "ROSE"))
+# }
+
 library(ggplot2)
 library(tidyverse)
 library(stringi)
@@ -33,9 +51,17 @@
 # Preprocessing
 
+
+After loading the dataset and marking lure trials, a sliding window of size `window_size` is used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, and `al` for each sequence. What we actually do here is convert a dataset of participants into a dataset of sequences.
+The following code also centers and scales RT within each participant, so every participant's average RT is 0.0 with SD = 1.
+
+The new dataset is then saved to a file for further analysis. One final chunk of code cleans up the dataset by removing outlier participants and adding RT_CAT, which marks high or low response times.
+
+
+### Extract Lures
 The following function receives a block of trials and marks lure trials, i.e., stimuli similar to the items presented N-1 and N+1 trials before.
-```{r}
+```{r lures_function, include=FALSE}
 with_lures <- function(stimulus, stimulus_type, n) {
   sapply(1:length(stimulus), function(i) {
     lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
@@ -47,10 +73,6 @@
 }
 ```
 
-After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences.
-The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.
-
-The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.
 ```{r preprocessing, message=FALSE, warning=FALSE, include=FALSE}
@@ -127,22 +149,24 @@
 ```{r split_data}
 split_data <- function(f, data, .balance=F, p = 0.8) {
-  train.indices <- createDataPartition(data[[toString(f[[2]])]],
+  .train.indices <- createDataPartition(data[[toString(f[[2]])]],
                                         p = p,
                                         list =F)
-  .train.imbalanced <- data[train.indices,]
+  .train.imbalanced <- data[.train.indices,]
-  .train <- ifelse(.balance,
-                   ROSE(f, data = .train.imbalanced)$data,
-                   seqs.train.imbalanced)
-
-  .train.x <- model.matrix(f, seqs.train)[,-1]
+  # Disabled: ifelse() is vectorized and cannot return a data frame, so this
+  # branch never worked; a scalar if/else would re-enable balancing.
+  # .train <- ifelse(.balance==T,
+  #                  ROSE(f, data = .train.imbalanced)$data,
+  #                  .train.imbalanced)
+  .train <- .train.imbalanced
-  .train.y <- seqs.train[[toString(f[[2]])]]
-  .test <- seqs[-train.indices,]
-  .test.x <- model.matrix(f, seqs.test)[,-1]
-  .test.observed_y <- seqs.test[[toString(f[[2]])]]
+  .train.x <- model.matrix(f, .train)[,-1]
+  .train.y <- .train[[toString(f[[2]])]]
+
+  .test <- data[-.train.indices,]
+  .test.x <- model.matrix(f, .test)[,-1]
+  .test.observed_y <- .test[[toString(f[[2]])]]
+
   return(list(
     "train" = .train,
     "train.x" = .train.x,
@@ -162,20 +186,35 @@
 ```{r rfe_feature_selection}
 #f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
-#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
-f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll# + rt_cat
+f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
 #f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
 
 splt <- split_data(f, seqs)
 
 ctrl <- rfeControl(functions = rfFuncs,
                    method = "cv",
-                   number = 3,
+                   number = 10,
+                   rerank = T,
                    verbose = T)
-rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+
+#rfeProfile <- rfe(splt$train.x, splt$train.y,
+rfeProfile <- rfe(f, seqs,
+                  rfeControl = ctrl,
+                  sizes = 2:10)
+
+# Plot accuracy by subset size; rfe()'s results table has a Variables column,
+# which is what the x axis should use (reorder(2:10, ...) does not match it).
+rfeProfile$results %>%
+  as.data.frame() %>%
+  ggplot(aes(x = reorder(Variables, Accuracy), y = Accuracy)) +
+  geom_bar(stat = "identity", fill = "#1F77B4", alpha = 0.8) +
+  coord_flip()
 
 message("Optimal Variables: ")
-rfeProfile$optVariables
+rfeProfile$optVariables  # (not $metric, which only names the tuning metric)
 
 # ROC for each categories
 filterVarImp(as.data.frame(splt$train.x), splt$train.y)
message("Optimal Variables: ") -rfeProfile$optVariables + +rfeProfile$metric # ROC for each categories filterVarImp(as.data.frame(splt$train.x), splt$train.y) diff --git a/ccn2019/ccn2019.rev3.html b/ccn2019/ccn2019.rev3.html new file mode 100644 index 0000000..fac97cb --- /dev/null +++ b/ccn2019/ccn2019.rev3.html @@ -0,0 +1,855 @@ + + + + +
+After loading the dataset and marking lure trials, a sliding window of size window_size is used to extract statistics including t, l, sl, s, u, ul, tl, a, and al for each sequence. What we actually do here is convert a dataset of participants into a dataset of sequences. The following code also centers and scales RT within each participant, so every participant's average RT is 0.0 with SD = 1.
+The new dataset is then saved to a file for further analysis. One final chunk of code cleans up the dataset by removing outlier participants and adding RT_CAT, which marks high or low response times.
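+The preprocessing chunk itself is hidden in this render (include=FALSE). A minimal sketch of the per-participant RT scaling step, assuming hypothetical column names participant and rt in the trial data:
+# Sketch, not the author's exact code: z-score RT within each participant,
+# so every participant ends up with mean RT = 0 and SD = 1.
+seqs <- seqs %>%
+  group_by(participant) %>%
+  mutate(rt = as.numeric(scale(rt))) %>%
+  ungroup()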
+The following function receives a block of trials and marks lure trials, i.e., stimuli similar to the items presented N-1 and N+1 trials before.
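+The function body is hidden in this render and partially elided in the .Rmd diff above, so the following is a hedged reconstruction rather than the author's exact code. It assumes stimulus holds the presented items and stimulus_type labels targets:
+# Reconstruction (assumption): a non-target trial is marked a lure when its
+# stimulus matches the item shown n-1 or n+1 trials back.
+with_lures <- function(stimulus, stimulus_type, n) {
+  sapply(seq_along(stimulus), function(i) {
+    back <- c(i - n - 1, i - n + 1)      # candidate lure positions
+    back <- back[back >= 1 & back != i]  # keep valid, non-current trials
+    lures <- as.character(stimulus[back])
+    is_lure <- length(lures) > 0 &&
+      as.character(stimulus[i]) %in% lures &&
+      stimulus_type[i] != "target"
+    if (is_lure) "lure" else as.character(stimulus_type[i])
+  })
+}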
+The following chunk was meant to flag highly correlated predictors, but it no longer works; a working variant follows it.
+# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+#cor_matrix <- cor(seqs[,-1])
+#cor_high <- findCorrelation(cor_matrix, 0.8)
+#high_cor_remove <- row.names(cor_matrix)[cor_high]
+#FIXME remove by column name
+#seqs.uncorr <- seqs %>% select(-high_cor_remove)
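+A working variant of the same idea, assuming the predictors are the numeric columns of seqs after the first; caret's findCorrelation(..., names = TRUE) returns column names directly, which resolves the FIXME:
+# Drop one predictor from every pair with |r| > 0.8, selecting by name.
+cor_matrix <- cor(seqs[, -1], use = "pairwise.complete.obs")
+high_cor <- findCorrelation(cor_matrix, cutoff = 0.8, names = TRUE)
+seqs.uncorr <- seqs %>% select(-all_of(high_cor))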
+split_data() splits the cleaned-up dataset into training and test sets. If the .balance parameter is set to TRUE, the training set is oversampled with respect to the formula (although the balancing branch is currently commented out in the body below).
split_data <- function(f, data, .balance=F, p = 0.8) {
+ .train.indices <- createDataPartition(data[[toString(f[[2]])]],
+ p = p,
+ list =F)
+
+ .train.imbalanced <- data[.train.indices,]
+
+  # Disabled: ifelse() is vectorized and cannot return a data frame, so this
+  # branch never worked; see the scalar if/else fix after the function.
+  # .train <- ifelse(.balance==T,
+  #                  ROSE(f, data = .train.imbalanced)$data,
+  #                  .train.imbalanced)
+ .train <- .train.imbalanced
+
+ .train.x <- model.matrix(f, .train)[,-1]
+ .train.y <- .train[[toString(f[[2]])]]
+
+ .test <- data[-.train.indices,]
+ .test.x <- model.matrix(f, .test)[,-1]
+ .test.observed_y <- .test[[toString(f[[2]])]]
+
+ return(list(
+ "train" = .train,
+ "train.x" = .train.x,
+ "train.y" = .train.y,
+ "test" = .test,
+ "test.x" = .test.x,
+ "test.observed_y" = .test.observed_y
+ ))
+}
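+To re-enable balancing, replacing that assignment inside split_data() with a scalar if/else works where the vectorized ifelse() could not; a sketch, assuming the ROSE package is loaded (its resampled data lives in $data):
+.train <- if (.balance) ROSE(f, data = .train.imbalanced)$data else .train.imbalanced
+Example usage, an 80/20 split on a correctness formula:
+splt <- split_data(correct ~ n + t + v, seqs, p = 0.8)
+str(splt$train.x)    # model matrix of predictors
+table(splt$train.y)  # outcome distribution in the training set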
+The following chunk uses cross-validated recursive feature elimination (RFE) to extract the features that best describe the outcome (rt_cat, or whichever formula f is active). To change the target or the predictors, modify the f formula. In addition to the optimal set of variables, the chunk reports a model-free variable importance score for each predictor.
#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
+#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
+
+splt <- split_data(f, seqs)
+
+ctrl <- rfeControl(functions = rfFuncs,
+ method = "cv",
+ number = 3,
+ rerank = T,
+ verbose = T)
+rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+## +(rfe) fit Fold1 size: 10
+## -(rfe) fit Fold1 size: 10
+## +(rfe) imp Fold1
+## -(rfe) imp Fold1
+## +(rfe) fit Fold1 size: 8
+## -(rfe) fit Fold1 size: 8
+## +(rfe) imp Fold1 size: 8
+## -(rfe) imp Fold1 size: 8
+## +(rfe) fit Fold1 size: 4
+## -(rfe) fit Fold1 size: 4
+## +(rfe) imp Fold1 size: 4
+## -(rfe) imp Fold1 size: 4
+## +(rfe) fit Fold2 size: 10
+## -(rfe) fit Fold2 size: 10
+## +(rfe) imp Fold2
+## -(rfe) imp Fold2
+## +(rfe) fit Fold2 size: 8
+## -(rfe) fit Fold2 size: 8
+## +(rfe) imp Fold2 size: 8
+## -(rfe) imp Fold2 size: 8
+## +(rfe) fit Fold2 size: 4
+## -(rfe) fit Fold2 size: 4
+## +(rfe) imp Fold2 size: 4
+## -(rfe) imp Fold2 size: 4
+## +(rfe) fit Fold3 size: 10
+## -(rfe) fit Fold3 size: 10
+## +(rfe) imp Fold3
+## -(rfe) imp Fold3
+## +(rfe) fit Fold3 size: 8
+## -(rfe) fit Fold3 size: 8
+## +(rfe) imp Fold3 size: 8
+## -(rfe) imp Fold3 size: 8
+## +(rfe) fit Fold3 size: 4
+## -(rfe) fit Fold3 size: 4
+## +(rfe) imp Fold3 size: 4
+## -(rfe) imp Fold3 size: 4
+message("Optimal Variables: ")
+## Optimal Variables:
+rfeProfile$optVariables
+## [1] "n" "tl" "vl" "sl"
+# ROC for each categories
+filterVarImp(as.data.frame(splt$train.x), splt$train.y)
+## Overall
+## n 15.6157510
+## t 0.4033237
+## v 3.2527774
+## s 3.6791786
+## l 2.9241600
+## vl 0.1725283
+## sl 2.7719808
+## tl 4.8728706
+## ul 2.9101769
+## ll 3.1461070
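+Note that the table above has a single Overall column rather than a per-class ROC: the active outcome here (rt) is numeric, so filterVarImp falls back to regression-style importance, the absolute t-statistic of a per-predictor linear fit. Roughly equivalent by hand for one predictor:
+# Sketch of the regression fallback, shown for predictor n only.
+fit <- lm(splt$train.y ~ splt$train.x[, "n"])
+abs(coef(summary(fit))[2, "t value"])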
+Correctness refers to the correct column of the dataset, which shows whether the response was correct, for both targets and non-targets. The following code trains two models to predict the correctness of a trial without using the current stimulus or its type; it only uses predictors that describe the sliding window plus the experimental condition (N, and V, the number of unique items presented in the sequence).
The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice.
+In addition to the model comparison plots, the final output of this section is a single ROC plot that compares the new and old models. The same chunk can be used for other categorical outcomes, including RT_CAT, and for penalized models.
+Note: uncomment the desired old.f and new.f formulas to run the respective analysis.
#1. atypical local features lead to longer response time.
+#old.f <- rt_cat ~ n + t + v + rt_cat
+#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+old.f <- correct ~ n + t
+new.f <- correct ~ n + t + rt_cat
+
+#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+#2. local features provide a better prediction for the correctness of choice
+#old.f <- correct ~ n + t + v
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+
+#3. Longer
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+train_model <- function(f, data, ctrl) {
+ splt <- split_data(f, data, .balance = T)
+ model <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ tuneLength = 5,
+ #tuneGrid = tune,
+ trControl = ctrl)
+ list(model = model,
+ test.y = model %>% predict(splt$test.x),
+ test.y_prob = model %>% predict(splt$test.x, type="prob"),
+ test.observed_y = splt$test.observed_y
+ )
+}
+
+old.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(old.f, ., ctrl)
+## + Fold1: ncomp=1
+## - Fold1: ncomp=1
+## + Fold2: ncomp=1
+## - Fold2: ncomp=1
+## + Fold3: ncomp=1
+## - Fold3: ncomp=1
+## + Fold4: ncomp=1
+## - Fold4: ncomp=1
+## + Fold5: ncomp=1
+## - Fold5: ncomp=1
+## Aggregating results
+## Fitting final model on full training set
+new.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(new.f, ., ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Selecting tuning parameters
+## Fitting ncomp = 1 on full training set
+bwplot(resamples(list(old=old.model$model, new=new.model$model)))
+densityplot(resamples(list(old=old.model$model, new=new.model$model)))
+confusionMatrix(old.model$test.y, old.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 93 771
+## YES 39 771
+##
+## Accuracy : 0.5161
+## 95% CI : (0.4919, 0.5403)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.0579
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.70455
+## Specificity : 0.50000
+## Pos Pred Value : 0.10764
+## Neg Pred Value : 0.95185
+## Prevalence : 0.07885
+## Detection Rate : 0.05556
+## Detection Prevalence : 0.51613
+## Balanced Accuracy : 0.60227
+##
+## 'Positive' Class : NO
+##
+plot(varImp(old.model$model, scale = F, useModel = F))
+confusionMatrix(new.model$test.y, new.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 103 835
+## YES 29 707
+##
+## Accuracy : 0.4839
+## 95% CI : (0.4597, 0.5081)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.063
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.78030
+## Specificity : 0.45850
+## Pos Pred Value : 0.10981
+## Neg Pred Value : 0.96060
+## Prevalence : 0.07885
+## Detection Rate : 0.06153
+## Detection Prevalence : 0.56033
+## Balanced Accuracy : 0.61940
+##
+## 'Positive' Class : NO
+##
+plot(varImp(new.model$model, scale = F, useModel = F))
+roc(old.model$test.observed_y,
+ old.model$test.y_prob$YES,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="gray",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 40,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = old.model$test.observed_y, predictor = old.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "gray", print.auc = T, print.auc.y = 40, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: old.model$test.y_prob$YES in 132 controls (old.model$test.observed_y NO) < 1542 cases (old.model$test.observed_y YES).
+## Area under the curve: 60.23%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.95 92.93 93.60
+## 20 83.90 85.86 87.19
+## 30 75.86 78.79 80.79
+## 40 67.81 71.72 74.39
+## 50 59.76 64.66 67.99
+## 60 51.31 57.59 61.58
+## 70 39.66 50.52 55.18
+## 80 26.44 34.42 44.81
+## 90 13.22 17.21 22.41
+## 100 0.00 0.00 0.00
+roc(new.model$test.observed_y,
+ new.model$test.y_prob$YES,
+ legacy.axes=T,
+ add = T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 50,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = new.model$test.observed_y, predictor = new.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, add = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 50, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: new.model$test.y_prob$YES in 132 controls (new.model$test.observed_y NO) < 1542 cases (new.model$test.observed_y YES).
+## Area under the curve: 65.52%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 95.16 96.29 97.19
+## 20 89.97 92.58 94.37
+## 30 82.85 87.98 91.53
+## 40 74.06 79.70 85.61
+## 50 65.48 70.92 76.26
+## 60 56.18 62.02 67.40
+## 70 45.65 53.38 58.83
+## 80 30.57 42.77 50.22
+## 90 15.29 21.39 30.26
+## 100 0.00 0.00 0.00
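+The last analysis fits a single PLS model that predicts rt_cat from a reduced formula and evaluates it with a confusion matrix and an ROC curve.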
+f <- rt_cat ~ n + t + v
+f <- rt_cat ~ n + v + s + tl
+
+splt <- split_data(f, seqs)
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ #sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+max_components <- n_distinct(attr(terms(f),"term.labels")) - 2
+
+# grid tune
+tune <- expand.grid(ncomp=2:max_components)
+
+model1 <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ #tuneLength = 7,
+ tuneGrid = tune,
+ trControl = ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Fitting final model on full training set
+# validate model1
+splt$test.y <- model1 %>% predict(splt$test.x)
+splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")
+
+confusionMatrix(splt$test.y, splt$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction mid high
+## mid 1435 240
+## high 0 0
+##
+## Accuracy : 0.8567
+## 95% CI : (0.839, 0.8732)
+## No Information Rate : 0.8567
+## P-Value [Acc > NIR] : 0.5172
+##
+## Kappa : 0
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 1.0000
+## Specificity : 0.0000
+## Pos Pred Value : 0.8567
+## Neg Pred Value : NaN
+## Prevalence : 0.8567
+## Detection Rate : 0.8567
+## Detection Prevalence : 1.0000
+## Balanced Accuracy : 0.5000
+##
+## 'Positive' Class : mid
+##
+plot(varImp(model1, scale = F, useModel = F))
+roc(splt$test.observed_y,
+ splt$test.y_prob$mid,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 0,
+ print.auc.x = 80,
+ lty = 1,
+ of = "se",
+ boot.n = 200,
+ ci = T)
+## Setting levels: control = mid, case = high
+## Setting direction: controls > cases
+##
+## Call:
+## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80, lty = 1, of = "se", boot.n = 200)
+##
+## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
+## Area under the curve: 64.24%
+## 95% CI (200 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.25 94.21 96.68
+## 20 86.25 90.00 93.34
+## 30 82.08 86.04 90.48
+## 40 72.68 79.58 85.83
+## 50 67.08 72.92 78.76
+## 60 58.75 66.46 74.60
+## 70 42.91 51.25 58.94
+## 80 27.07 34.17 40.43
+## 90 12.17 16.71 21.67
+## 100 0.00 0.00 0.00
+After loading data set and marking lure trials, a sliding window of the size window_size
will be used to extract statistics including t
, l
, sl
, s
, u
, ul
, tl
, a
, al
for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.
The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.
+The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before.
+The following chunk marks highly correlated predictors, but does not work anymore.
+# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+#cor_matrix <- cor(seqs[,-1])
+#cor_high <- findCorrelation(cor_matrix, 0.8)
+#high_cor_remove <- row.names(cor_matrix)[cor_high]
+#FIXME remove by column name
+#seqs.uncorr <- seqs %>% select(-high_cor_remove)
+split_data()
is a function to split the cleaned up dataset into training and test. If .balance
parameter is set to TRUE
then the training set is oversampled with regard to the formula.
split_data <- function(f, data, .balance=F, p = 0.8) {
+ .train.indices <- createDataPartition(data[[toString(f[[2]])]],
+ p = p,
+ list =F)
+
+ .train.imbalanced <- data[.train.indices,]
+
+ # .train <- ifelse(.balance==T,
+ # ROSE(f, data = .train.imbalanced)$data,
+ # .train.imbalanced)
+ .train <- .train.imbalanced
+
+ .train.x <- model.matrix(f, .train)[,-1]
+ .train.y <- .train[[toString(f[[2]])]]
+
+ .test <- data[-.train.indices,]
+ .test.x <- model.matrix(f, .test)[,-1]
+ .test.observed_y <- .test[[toString(f[[2]])]]
+
+ return(list(
+ "train" = .train,
+ "train.x" = .train.x,
+ "train.y" = .train.y,
+ "test" = .test,
+ "test.x" = .test.x,
+ "test.observed_y" = .test.observed_y
+ ))
+}
+the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe rt_cat
. To change parameters, modify the f
formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor.
#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
+#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
+
+splt <- split_data(f, seqs)
+
+ctrl <- rfeControl(functions = rfFuncs,
+ method = "cv",
+ number = 3,
+ rerank = T,
+ verbose = T)
+rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+## +(rfe) fit Fold1 size: 10
+## -(rfe) fit Fold1 size: 10
+## +(rfe) imp Fold1
+## -(rfe) imp Fold1
+## +(rfe) fit Fold1 size: 8
+## -(rfe) fit Fold1 size: 8
+## +(rfe) imp Fold1 size: 8
+## -(rfe) imp Fold1 size: 8
+## +(rfe) fit Fold1 size: 4
+## -(rfe) fit Fold1 size: 4
+## +(rfe) imp Fold1 size: 4
+## -(rfe) imp Fold1 size: 4
+## +(rfe) fit Fold2 size: 10
+## -(rfe) fit Fold2 size: 10
+## +(rfe) imp Fold2
+## -(rfe) imp Fold2
+## +(rfe) fit Fold2 size: 8
+## -(rfe) fit Fold2 size: 8
+## +(rfe) imp Fold2 size: 8
+## -(rfe) imp Fold2 size: 8
+## +(rfe) fit Fold2 size: 4
+## -(rfe) fit Fold2 size: 4
+## +(rfe) imp Fold2 size: 4
+## -(rfe) imp Fold2 size: 4
+## +(rfe) fit Fold3 size: 10
+## -(rfe) fit Fold3 size: 10
+## +(rfe) imp Fold3
+## -(rfe) imp Fold3
+## +(rfe) fit Fold3 size: 8
+## -(rfe) fit Fold3 size: 8
+## +(rfe) imp Fold3 size: 8
+## -(rfe) imp Fold3 size: 8
+## +(rfe) fit Fold3 size: 4
+## -(rfe) fit Fold3 size: 4
+## +(rfe) imp Fold3 size: 4
+## -(rfe) imp Fold3 size: 4
+message("Optimal Variables: ")
+## Optimal Variables:
+rfeProfile$optVariables
+## [1] "n" "tl" "vl" "sl"
+# ROC for each categories
+filterVarImp(as.data.frame(splt$train.x), splt$train.y)
+## Overall
+## n 15.6157510
+## t 0.4033237
+## v 3.2527774
+## s 3.6791786
+## l 2.9241600
+## vl 0.1725283
+## sl 2.7719808
+## tl 4.8728706
+## ul 2.9101769
+## ll 3.1461070
+Correctness refers to the correct
column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence).
The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice.
+In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models.
+Note: uncomment desired old.f
and new.f
formulas to run respective analysis.
#1. atypical local features lead to longer response time.
+#old.f <- rt_cat ~ n + t + v + rt_cat
+#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+old.f <- correct ~ n + t
+new.f <- correct ~ n + t + rt_cat
+
+#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+#2. local features provide a better prediction for the correctness of choice
+#old.f <- correct ~ n + t + v
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+
+#3. Longer
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+train_model <- function(f, data, ctrl) {
+ splt <- split_data(f, data, .balance = T)
+ model <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ tuneLength = 5,
+ #tuneGrid = tune,
+ trControl = ctrl)
+ list(model = model,
+ test.y = model %>% predict(splt$test.x),
+ test.y_prob = model %>% predict(splt$test.x, type="prob"),
+ test.observed_y = splt$test.observed_y
+ )
+}
+
+old.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(old.f, ., ctrl)
+## + Fold1: ncomp=1
+## - Fold1: ncomp=1
+## + Fold2: ncomp=1
+## - Fold2: ncomp=1
+## + Fold3: ncomp=1
+## - Fold3: ncomp=1
+## + Fold4: ncomp=1
+## - Fold4: ncomp=1
+## + Fold5: ncomp=1
+## - Fold5: ncomp=1
+## Aggregating results
+## Fitting final model on full training set
+new.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(new.f, ., ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Selecting tuning parameters
+## Fitting ncomp = 1 on full training set
+bwplot(resamples(list(old=old.model$model, new=new.model$model)))
+densityplot(resamples(list(old=old.model$model, new=new.model$model)))
+confusionMatrix(old.model$test.y, old.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 93 771
+## YES 39 771
+##
+## Accuracy : 0.5161
+## 95% CI : (0.4919, 0.5403)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.0579
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.70455
+## Specificity : 0.50000
+## Pos Pred Value : 0.10764
+## Neg Pred Value : 0.95185
+## Prevalence : 0.07885
+## Detection Rate : 0.05556
+## Detection Prevalence : 0.51613
+## Balanced Accuracy : 0.60227
+##
+## 'Positive' Class : NO
+##
+plot(varImp(old.model$model, scale = F, useModel = F))
+confusionMatrix(new.model$test.y, new.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 103 835
+## YES 29 707
+##
+## Accuracy : 0.4839
+## 95% CI : (0.4597, 0.5081)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.063
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.78030
+## Specificity : 0.45850
+## Pos Pred Value : 0.10981
+## Neg Pred Value : 0.96060
+## Prevalence : 0.07885
+## Detection Rate : 0.06153
+## Detection Prevalence : 0.56033
+## Balanced Accuracy : 0.61940
+##
+## 'Positive' Class : NO
+##
+plot(varImp(new.model$model, scale = F, useModel = F))
+roc(old.model$test.observed_y,
+ old.model$test.y_prob$YES,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="gray",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 40,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = old.model$test.observed_y, predictor = old.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "gray", print.auc = T, print.auc.y = 40, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: old.model$test.y_prob$YES in 132 controls (old.model$test.observed_y NO) < 1542 cases (old.model$test.observed_y YES).
+## Area under the curve: 60.23%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.95 92.93 93.60
+## 20 83.90 85.86 87.19
+## 30 75.86 78.79 80.79
+## 40 67.81 71.72 74.39
+## 50 59.76 64.66 67.99
+## 60 51.31 57.59 61.58
+## 70 39.66 50.52 55.18
+## 80 26.44 34.42 44.81
+## 90 13.22 17.21 22.41
+## 100 0.00 0.00 0.00
+roc(new.model$test.observed_y,
+ new.model$test.y_prob$YES,
+ legacy.axes=T,
+ add = T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 50,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = new.model$test.observed_y, predictor = new.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, add = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 50, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: new.model$test.y_prob$YES in 132 controls (new.model$test.observed_y NO) < 1542 cases (new.model$test.observed_y YES).
+## Area under the curve: 65.52%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 95.16 96.29 97.19
+## 20 89.97 92.58 94.37
+## 30 82.85 87.98 91.53
+## 40 74.06 79.70 85.61
+## 50 65.48 70.92 76.26
+## 60 56.18 62.02 67.40
+## 70 45.65 53.38 58.83
+## 80 30.57 42.77 50.22
+## 90 15.29 21.39 30.26
+## 100 0.00 0.00 0.00
+f <- rt_cat ~ n + t + v
+f <- rt_cat ~ n + v + s + tl
+
+splt <- split_data(f, seqs)
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ #sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+max_components <- n_distinct(attr(terms(f),"term.labels")) - 2
+
+# grid tune
+tune <- expand.grid(ncomp=2:max_components)
+
+model1 <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ #tuneLength = 7,
+ tuneGrid = tune,
+ trControl = ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Fitting final model on full training set
+# validate model1
+splt$test.y <- model1 %>% predict(splt$test.x)
+splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")
+
+confusionMatrix(splt$test.y, splt$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction mid high
+## mid 1435 240
+## high 0 0
+##
+## Accuracy : 0.8567
+## 95% CI : (0.839, 0.8732)
+## No Information Rate : 0.8567
+## P-Value [Acc > NIR] : 0.5172
+##
+## Kappa : 0
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 1.0000
+## Specificity : 0.0000
+## Pos Pred Value : 0.8567
+## Neg Pred Value : NaN
+## Prevalence : 0.8567
+## Detection Rate : 0.8567
+## Detection Prevalence : 1.0000
+## Balanced Accuracy : 0.5000
+##
+## 'Positive' Class : mid
+##
+plot(varImp(model1, scale = F, useModel = F))
+roc(splt$test.observed_y,
+ splt$test.y_prob$mid,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 0,
+ print.auc.x = 80,
+ lty = 1,
+ of = "se",
+ boot.n = 200,
+ ci = T)
+## Setting levels: control = mid, case = high
+## Setting direction: controls > cases
+##
+## Call:
+## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80, lty = 1, of = "se", boot.n = 200)
+##
+## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
+## Area under the curve: 64.24%
+## 95% CI (200 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.25 94.21 96.68
+## 20 86.25 90.00 93.34
+## 30 82.08 86.04 90.48
+## 40 72.68 79.58 85.83
+## 50 67.08 72.92 78.76
+## 60 58.75 66.46 74.60
+## 70 42.91 51.25 58.94
+## 80 27.07 34.17 40.43
+## 90 12.17 16.71 21.67
+## 100 0.00 0.00 0.00
+After loading data set and marking lure trials, a sliding window of the size window_size
will be used to extract statistics including t
, l
, sl
, s
, u
, ul
, tl
, a
, al
for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.
The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.
+The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before.
+The following chunk marks highly correlated predictors, but does not work anymore.
+# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+#cor_matrix <- cor(seqs[,-1])
+#cor_high <- findCorrelation(cor_matrix, 0.8)
+#high_cor_remove <- row.names(cor_matrix)[cor_high]
+#FIXME remove by column name
+#seqs.uncorr <- seqs %>% select(-high_cor_remove)
+split_data()
is a function to split the cleaned up dataset into training and test. If .balance
parameter is set to TRUE
then the training set is oversampled with regard to the formula.
split_data <- function(f, data, .balance=F, p = 0.8) {
+ .train.indices <- createDataPartition(data[[toString(f[[2]])]],
+ p = p,
+ list =F)
+
+ .train.imbalanced <- data[.train.indices,]
+
+ # .train <- ifelse(.balance==T,
+ # ROSE(f, data = .train.imbalanced)$data,
+ # .train.imbalanced)
+ .train <- .train.imbalanced
+
+ .train.x <- model.matrix(f, .train)[,-1]
+ .train.y <- .train[[toString(f[[2]])]]
+
+ .test <- data[-.train.indices,]
+ .test.x <- model.matrix(f, .test)[,-1]
+ .test.observed_y <- .test[[toString(f[[2]])]]
+
+ return(list(
+ "train" = .train,
+ "train.x" = .train.x,
+ "train.y" = .train.y,
+ "test" = .test,
+ "test.x" = .test.x,
+ "test.observed_y" = .test.observed_y
+ ))
+}
+the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe rt_cat
. To change parameters, modify the f
formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor.
#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
+#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
+
+splt <- split_data(f, seqs)
+
+ctrl <- rfeControl(functions = rfFuncs,
+ method = "cv",
+ number = 3,
+ rerank = T,
+ verbose = T)
+rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+## +(rfe) fit Fold1 size: 10
+## -(rfe) fit Fold1 size: 10
+## +(rfe) imp Fold1
+## -(rfe) imp Fold1
+## +(rfe) fit Fold1 size: 8
+## -(rfe) fit Fold1 size: 8
+## +(rfe) imp Fold1 size: 8
+## -(rfe) imp Fold1 size: 8
+## +(rfe) fit Fold1 size: 4
+## -(rfe) fit Fold1 size: 4
+## +(rfe) imp Fold1 size: 4
+## -(rfe) imp Fold1 size: 4
+## +(rfe) fit Fold2 size: 10
+## -(rfe) fit Fold2 size: 10
+## +(rfe) imp Fold2
+## -(rfe) imp Fold2
+## +(rfe) fit Fold2 size: 8
+## -(rfe) fit Fold2 size: 8
+## +(rfe) imp Fold2 size: 8
+## -(rfe) imp Fold2 size: 8
+## +(rfe) fit Fold2 size: 4
+## -(rfe) fit Fold2 size: 4
+## +(rfe) imp Fold2 size: 4
+## -(rfe) imp Fold2 size: 4
+## +(rfe) fit Fold3 size: 10
+## -(rfe) fit Fold3 size: 10
+## +(rfe) imp Fold3
+## -(rfe) imp Fold3
+## +(rfe) fit Fold3 size: 8
+## -(rfe) fit Fold3 size: 8
+## +(rfe) imp Fold3 size: 8
+## -(rfe) imp Fold3 size: 8
+## +(rfe) fit Fold3 size: 4
+## -(rfe) fit Fold3 size: 4
+## +(rfe) imp Fold3 size: 4
+## -(rfe) imp Fold3 size: 4
+message("Optimal Variables: ")
+## Optimal Variables:
+rfeProfile$optVariables
+## [1] "n" "tl" "vl" "sl"
+# ROC for each categories
+filterVarImp(as.data.frame(splt$train.x), splt$train.y)
+## Overall
+## n 15.6157510
+## t 0.4033237
+## v 3.2527774
+## s 3.6791786
+## l 2.9241600
+## vl 0.1725283
+## sl 2.7719808
+## tl 4.8728706
+## ul 2.9101769
+## ll 3.1461070
+Correctness refers to the correct
column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence).
The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice.
+In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models.
+Note: uncomment desired old.f
and new.f
formulas to run respective analysis.
#1. atypical local features lead to longer response time.
+#old.f <- rt_cat ~ n + t + v + rt_cat
+#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+old.f <- correct ~ n + t
+new.f <- correct ~ n + t + rt_cat
+
+#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+#2. local features provide a better prediction for the correctness of choice
+#old.f <- correct ~ n + t + v
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+
+#3. Longer
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+train_model <- function(f, data, ctrl) {
+ splt <- split_data(f, data, .balance = T)
+ model <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ tuneLength = 5,
+ #tuneGrid = tune,
+ trControl = ctrl)
+ list(model = model,
+ test.y = model %>% predict(splt$test.x),
+ test.y_prob = model %>% predict(splt$test.x, type="prob"),
+ test.observed_y = splt$test.observed_y
+ )
+}
+
+old.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(old.f, ., ctrl)
+## + Fold1: ncomp=1
+## - Fold1: ncomp=1
+## + Fold2: ncomp=1
+## - Fold2: ncomp=1
+## + Fold3: ncomp=1
+## - Fold3: ncomp=1
+## + Fold4: ncomp=1
+## - Fold4: ncomp=1
+## + Fold5: ncomp=1
+## - Fold5: ncomp=1
+## Aggregating results
+## Fitting final model on full training set
+new.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(new.f, ., ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Selecting tuning parameters
+## Fitting ncomp = 1 on full training set
+bwplot(resamples(list(old=old.model$model, new=new.model$model)))
+densityplot(resamples(list(old=old.model$model, new=new.model$model)))
+confusionMatrix(old.model$test.y, old.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 93 771
+## YES 39 771
+##
+## Accuracy : 0.5161
+## 95% CI : (0.4919, 0.5403)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.0579
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.70455
+## Specificity : 0.50000
+## Pos Pred Value : 0.10764
+## Neg Pred Value : 0.95185
+## Prevalence : 0.07885
+## Detection Rate : 0.05556
+## Detection Prevalence : 0.51613
+## Balanced Accuracy : 0.60227
+##
+## 'Positive' Class : NO
+##
+plot(varImp(old.model$model, scale = F, useModel = F))
+confusionMatrix(new.model$test.y, new.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 103 835
+## YES 29 707
+##
+## Accuracy : 0.4839
+## 95% CI : (0.4597, 0.5081)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.063
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.78030
+## Specificity : 0.45850
+## Pos Pred Value : 0.10981
+## Neg Pred Value : 0.96060
+## Prevalence : 0.07885
+## Detection Rate : 0.06153
+## Detection Prevalence : 0.56033
+## Balanced Accuracy : 0.61940
+##
+## 'Positive' Class : NO
+##
+plot(varImp(new.model$model, scale = F, useModel = F))
+roc(old.model$test.observed_y,
+ old.model$test.y_prob$YES,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="gray",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 40,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = old.model$test.observed_y, predictor = old.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "gray", print.auc = T, print.auc.y = 40, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: old.model$test.y_prob$YES in 132 controls (old.model$test.observed_y NO) < 1542 cases (old.model$test.observed_y YES).
+## Area under the curve: 60.23%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.95 92.93 93.60
+## 20 83.90 85.86 87.19
+## 30 75.86 78.79 80.79
+## 40 67.81 71.72 74.39
+## 50 59.76 64.66 67.99
+## 60 51.31 57.59 61.58
+## 70 39.66 50.52 55.18
+## 80 26.44 34.42 44.81
+## 90 13.22 17.21 22.41
+## 100 0.00 0.00 0.00
+roc(new.model$test.observed_y,
+ new.model$test.y_prob$YES,
+ legacy.axes=T,
+ add = T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 50,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = new.model$test.observed_y, predictor = new.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, add = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 50, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: new.model$test.y_prob$YES in 132 controls (new.model$test.observed_y NO) < 1542 cases (new.model$test.observed_y YES).
+## Area under the curve: 65.52%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 95.16 96.29 97.19
+## 20 89.97 92.58 94.37
+## 30 82.85 87.98 91.53
+## 40 74.06 79.70 85.61
+## 50 65.48 70.92 76.26
+## 60 56.18 62.02 67.40
+## 70 45.65 53.38 58.83
+## 80 30.57 42.77 50.22
+## 90 15.29 21.39 30.26
+## 100 0.00 0.00 0.00
+f <- rt_cat ~ n + t + v
+f <- rt_cat ~ n + v + s + tl
+
+splt <- split_data(f, seqs)
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ #sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+max_components <- n_distinct(attr(terms(f),"term.labels")) - 2
+
+# grid tune
+tune <- expand.grid(ncomp=2:max_components)
+
+model1 <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ #tuneLength = 7,
+ tuneGrid = tune,
+ trControl = ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Fitting final model on full training set
+# validate model1
+splt$test.y <- model1 %>% predict(splt$test.x)
+splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")
+
+confusionMatrix(splt$test.y, splt$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction mid high
+## mid 1435 240
+## high 0 0
+##
+## Accuracy : 0.8567
+## 95% CI : (0.839, 0.8732)
+## No Information Rate : 0.8567
+## P-Value [Acc > NIR] : 0.5172
+##
+## Kappa : 0
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 1.0000
+## Specificity : 0.0000
+## Pos Pred Value : 0.8567
+## Neg Pred Value : NaN
+## Prevalence : 0.8567
+## Detection Rate : 0.8567
+## Detection Prevalence : 1.0000
+## Balanced Accuracy : 0.5000
+##
+## 'Positive' Class : mid
+##
+plot(varImp(model1, scale = F, useModel = F))
+roc(splt$test.observed_y,
+ splt$test.y_prob$mid,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 0,
+ print.auc.x = 80,
+ lty = 1,
+ of = "se",
+ boot.n = 200,
+ ci = T)
+## Setting levels: control = mid, case = high
+## Setting direction: controls > cases
+##
+## Call:
+## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80, lty = 1, of = "se", boot.n = 200)
+##
+## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
+## Area under the curve: 64.24%
+## 95% CI (200 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.25 94.21 96.68
+## 20 86.25 90.00 93.34
+## 30 82.08 86.04 90.48
+## 40 72.68 79.58 85.83
+## 50 67.08 72.92 78.76
+## 60 58.75 66.46 74.60
+## 70 42.91 51.25 58.94
+## 80 27.07 34.17 40.43
+## 90 12.17 16.71 21.67
+## 100 0.00 0.00 0.00
+After loading data set and marking lure trials, a sliding window of the size window_size
will be used to extract statistics including t
, l
, sl
, s
, u
, ul
, tl
, a
, al
for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.
The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.
+The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before.
+The following chunk marks highly correlated predictors, but does not work anymore.
+# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+#cor_matrix <- cor(seqs[,-1])
+#cor_high <- findCorrelation(cor_matrix, 0.8)
+#high_cor_remove <- row.names(cor_matrix)[cor_high]
+#FIXME remove by column name
+#seqs.uncorr <- seqs %>% select(-high_cor_remove)
+split_data()
is a function to split the cleaned up dataset into training and test. If .balance
parameter is set to TRUE
then the training set is oversampled with regard to the formula.
split_data <- function(f, data, .balance=F, p = 0.8) {
+ .train.indices <- createDataPartition(data[[toString(f[[2]])]],
+ p = p,
+ list =F)
+
+ .train.imbalanced <- data[.train.indices,]
+
+ # .train <- ifelse(.balance==T,
+ # ROSE(f, data = .train.imbalanced)$data,
+ # .train.imbalanced)
+ .train <- .train.imbalanced
+
+ .train.x <- model.matrix(f, .train)[,-1]
+ .train.y <- .train[[toString(f[[2]])]]
+
+ .test <- data[-.train.indices,]
+ .test.x <- model.matrix(f, .test)[,-1]
+ .test.observed_y <- .test[[toString(f[[2]])]]
+
+ return(list(
+ "train" = .train,
+ "train.x" = .train.x,
+ "train.y" = .train.y,
+ "test" = .test,
+ "test.x" = .test.x,
+ "test.observed_y" = .test.observed_y
+ ))
+}
+the following chunk uses cross-validate recursive feature elimination (RFE) to extract best features to descrtibe rt_cat
. To change parameters, modify the f
formula. In addition to optimal set of variables, this chunk spits out model-free variable importance analysis for each predictor.
#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
+#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
+
+splt <- split_data(f, seqs)
+
+ctrl <- rfeControl(functions = rfFuncs,
+ method = "cv",
+ number = 3,
+ rerank = T,
+ verbose = T)
+rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+## +(rfe) fit Fold1 size: 10
+## -(rfe) fit Fold1 size: 10
+## +(rfe) imp Fold1
+## -(rfe) imp Fold1
+## +(rfe) fit Fold1 size: 8
+## -(rfe) fit Fold1 size: 8
+## +(rfe) imp Fold1 size: 8
+## -(rfe) imp Fold1 size: 8
+## +(rfe) fit Fold1 size: 4
+## -(rfe) fit Fold1 size: 4
+## +(rfe) imp Fold1 size: 4
+## -(rfe) imp Fold1 size: 4
+## +(rfe) fit Fold2 size: 10
+## -(rfe) fit Fold2 size: 10
+## +(rfe) imp Fold2
+## -(rfe) imp Fold2
+## +(rfe) fit Fold2 size: 8
+## -(rfe) fit Fold2 size: 8
+## +(rfe) imp Fold2 size: 8
+## -(rfe) imp Fold2 size: 8
+## +(rfe) fit Fold2 size: 4
+## -(rfe) fit Fold2 size: 4
+## +(rfe) imp Fold2 size: 4
+## -(rfe) imp Fold2 size: 4
+## +(rfe) fit Fold3 size: 10
+## -(rfe) fit Fold3 size: 10
+## +(rfe) imp Fold3
+## -(rfe) imp Fold3
+## +(rfe) fit Fold3 size: 8
+## -(rfe) fit Fold3 size: 8
+## +(rfe) imp Fold3 size: 8
+## -(rfe) imp Fold3 size: 8
+## +(rfe) fit Fold3 size: 4
+## -(rfe) fit Fold3 size: 4
+## +(rfe) imp Fold3 size: 4
+## -(rfe) imp Fold3 size: 4
+message("Optimal Variables: ")
+## Optimal Variables:
+rfeProfile$optVariables
+## [1] "n" "tl" "vl" "sl"
+# ROC for each categories
+filterVarImp(as.data.frame(splt$train.x), splt$train.y)
+## Overall
+## n 15.6157510
+## t 0.4033237
+## v 3.2527774
+## s 3.6791786
+## l 2.9241600
+## vl 0.1725283
+## sl 2.7719808
+## tl 4.8728706
+## ul 2.9101769
+## ll 3.1461070
+Correctness refers to the correct
column of the dataset, which shows if the response was correct or not, for both targets and non-targets. The following code trains two models to predict correctness of a trial without using current stimulus and its type. It only uses predictors that describes the sliding window, and experimental condition (N, and V, which is the number of unique items presented in the sequence).
The goal of this analysis is to find out if statistics that are extracted from previous trials via the sliding window can describe the correctness of choice.
+In addition to the model comparasion plotd, he final output of this section is a single ROC plot that compares new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models.
+Note: uncomment desired old.f
and new.f
formulas to run respective analysis.
#1. atypical local features lead to longer response time.
+#old.f <- rt_cat ~ n + t + v + rt_cat
+#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+old.f <- correct ~ n + t
+new.f <- correct ~ n + t + rt_cat
+
+#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+#2. local features provide a better prediction for the correctness of choice
+#old.f <- correct ~ n + t + v
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+
+#3. Longer
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
+train_model <- function(f, data, ctrl) {
+ splt <- split_data(f, data, .balance = T)
+ model <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ tuneLength = 5,
+ #tuneGrid = tune,
+ trControl = ctrl)
+ list(model = model,
+ test.y = model %>% predict(splt$test.x),
+ test.y_prob = model %>% predict(splt$test.x, type="prob"),
+ test.observed_y = splt$test.observed_y
+ )
+}
+
+old.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(old.f, ., ctrl)
+## + Fold1: ncomp=1
+## - Fold1: ncomp=1
+## + Fold2: ncomp=1
+## - Fold2: ncomp=1
+## + Fold3: ncomp=1
+## - Fold3: ncomp=1
+## + Fold4: ncomp=1
+## - Fold4: ncomp=1
+## + Fold5: ncomp=1
+## - Fold5: ncomp=1
+## Aggregating results
+## Fitting final model on full training set
+new.model <- seqs %>%
+ #filter(rt_cat == "high") %>%
+ train_model(new.f, ., ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Selecting tuning parameters
+## Fitting ncomp = 1 on full training set
+bwplot(resamples(list(old=old.model$model, new=new.model$model)))
+densityplot(resamples(list(old=old.model$model, new=new.model$model)))
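+The resampling plots above only give a visual comparison; caret can also test the old-vs-new difference on the resampled metrics. A minimal sketch, reusing the two fitted models (no new data assumed):
+# paired tests on the resampled ROC/Sens/Spec of the two models
+model.diffs <- diff(resamples(list(old = old.model$model, new = new.model$model)))
+summary(model.diffs)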
+confusionMatrix(old.model$test.y, old.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 93 771
+## YES 39 771
+##
+## Accuracy : 0.5161
+## 95% CI : (0.4919, 0.5403)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.0579
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.70455
+## Specificity : 0.50000
+## Pos Pred Value : 0.10764
+## Neg Pred Value : 0.95185
+## Prevalence : 0.07885
+## Detection Rate : 0.05556
+## Detection Prevalence : 0.51613
+## Balanced Accuracy : 0.60227
+##
+## 'Positive' Class : NO
+##
+plot(varImp(old.model$model, scale = F, useModel = F))
+confusionMatrix(new.model$test.y, new.model$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction NO YES
+## NO 103 835
+## YES 29 707
+##
+## Accuracy : 0.4839
+## 95% CI : (0.4597, 0.5081)
+## No Information Rate : 0.9211
+## P-Value [Acc > NIR] : 1
+##
+## Kappa : 0.063
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 0.78030
+## Specificity : 0.45850
+## Pos Pred Value : 0.10981
+## Neg Pred Value : 0.96060
+## Prevalence : 0.07885
+## Detection Rate : 0.06153
+## Detection Prevalence : 0.56033
+## Balanced Accuracy : 0.61940
+##
+## 'Positive' Class : NO
+##
+plot(varImp(new.model$model, scale = F, useModel = F))
+roc(old.model$test.observed_y,
+ old.model$test.y_prob$YES,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="gray",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 40,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = old.model$test.observed_y, predictor = old.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "gray", print.auc = T, print.auc.y = 40, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: old.model$test.y_prob$YES in 132 controls (old.model$test.observed_y NO) < 1542 cases (old.model$test.observed_y YES).
+## Area under the curve: 60.23%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.95 92.93 93.60
+## 20 83.90 85.86 87.19
+## 30 75.86 78.79 80.79
+## 40 67.81 71.72 74.39
+## 50 59.76 64.66 67.99
+## 60 51.31 57.59 61.58
+## 70 39.66 50.52 55.18
+## 80 26.44 34.42 44.81
+## 90 13.22 17.21 22.41
+## 100 0.00 0.00 0.00
+roc(new.model$test.observed_y,
+ new.model$test.y_prob$YES,
+ legacy.axes=T,
+ add = T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 50,
+ print.auc.x = 55,
+ lty = 1,
+ of = "se",
+ boot.n = 100,
+ ci = T)
+## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
+##
+## Call:
+## roc.default(response = new.model$test.observed_y, predictor = new.model$test.y_prob$YES, percent = T, ci = T, plot = T, legacy.axes = T, add = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 50, print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+##
+## Data: new.model$test.y_prob$YES in 132 controls (new.model$test.observed_y NO) < 1542 cases (new.model$test.observed_y YES).
+## Area under the curve: 65.52%
+## 95% CI (100 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 95.16 96.29 97.19
+## 20 89.97 92.58 94.37
+## 30 82.85 87.98 91.53
+## 40 74.06 79.70 85.61
+## 50 65.48 70.92 76.26
+## 60 56.18 62.02 67.40
+## 70 45.65 53.38 58.83
+## 80 30.57 42.77 50.22
+## 90 15.29 21.39 30.26
+## 100 0.00 0.00 0.00
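+The printed AUCs (60.23% vs. 65.52%) favor the new model, and the gap can be tested formally with pROC's roc.test(). A hedged sketch: since train_model() calls split_data() separately for each model, the two test sets are not guaranteed to be identical, so an unpaired comparison is the safe default here:
+old.roc <- roc(old.model$test.observed_y, old.model$test.y_prob$YES)
+new.roc <- roc(new.model$test.observed_y, new.model$test.y_prob$YES)
+roc.test(old.roc, new.roc, method = "delong", paired = FALSE)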
+#f <- rt_cat ~ n + t + v
+f <- rt_cat ~ n + v + s + tl
+
+splt <- split_data(f, seqs)
+
+ctrl <- trainControl(method="cv",
+ number=5,
+ classProbs=T,
+ verbose = T,
+ #sampling = "up",
+ savePredictions = T,
+ summaryFunction=twoClassSummary)
+
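+# upper bound for the ncomp tuning grid; PLS cannot use more components than predictors in f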
+max_components <- n_distinct(attr(terms(f),"term.labels")) - 2
+
+# tuning grid for the number of PLS components
+tune <- expand.grid(ncomp=2:max_components)
+
+model1 <- train(splt$train.x,
+ splt$train.y,
+ method = "pls",
+ family = "binomial",
+ metric = "ROC",
+ preProc = c("nzv","center", "scale"),
+ verboseIter = TRUE,
+ #tuneLength = 7,
+ tuneGrid = tune,
+ trControl = ctrl)
+## + Fold1: ncomp=2
+## - Fold1: ncomp=2
+## + Fold2: ncomp=2
+## - Fold2: ncomp=2
+## + Fold3: ncomp=2
+## - Fold3: ncomp=2
+## + Fold4: ncomp=2
+## - Fold4: ncomp=2
+## + Fold5: ncomp=2
+## - Fold5: ncomp=2
+## Aggregating results
+## Fitting final model on full training set
+# validate model1
+splt$test.y <- model1 %>% predict(splt$test.x)
+splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")
+
+confusionMatrix(splt$test.y, splt$test.observed_y)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction mid high
+## mid 1435 240
+## high 0 0
+##
+## Accuracy : 0.8567
+## 95% CI : (0.839, 0.8732)
+## No Information Rate : 0.8567
+## P-Value [Acc > NIR] : 0.5172
+##
+## Kappa : 0
+##
+## Mcnemar's Test P-Value : <2e-16
+##
+## Sensitivity : 1.0000
+## Specificity : 0.0000
+## Pos Pred Value : 0.8567
+## Neg Pred Value : NaN
+## Prevalence : 0.8567
+## Detection Rate : 0.8567
+## Detection Prevalence : 1.0000
+## Balanced Accuracy : 0.5000
+##
+## 'Positive' Class : mid
+##
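+The model above never predicts the `high` class (specificity = 0), which is what one would expect given the class imbalance (mid:high is roughly 6:1) and the fact that `sampling = "up"` is commented out in this trainControl. A quick check is to re-fit with up-sampling enabled and compare; a minimal sketch reusing `splt` and `tune` from above:
+ctrl.up <- trainControl(method = "cv",
+                        number = 5,
+                        classProbs = T,
+                        sampling = "up",
+                        savePredictions = T,
+                        summaryFunction = twoClassSummary)
+model1.up <- train(splt$train.x,
+                   splt$train.y,
+                   method = "pls",
+                   metric = "ROC",
+                   preProc = c("nzv", "center", "scale"),
+                   tuneGrid = tune,
+                   trControl = ctrl.up)
+# did up-sampling recover any 'high' predictions?
+confusionMatrix(model1.up %>% predict(splt$test.x), splt$test.observed_y)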
+plot(varImp(model1, scale = F, useModel = F))
+roc(splt$test.observed_y,
+ splt$test.y_prob$mid,
+ legacy.axes=T,
+ plot = T,
+ lwd=2,
+ col="black",
+ print.auc=T,
+ percent = T,
+ print.auc.y = 0,
+ print.auc.x = 80,
+ lty = 1,
+ of = "se",
+ boot.n = 200,
+ ci = T)
+## Setting levels: control = mid, case = high
+## Setting direction: controls > cases
+##
+## Call:
+## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid, percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2, col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80, lty = 1, of = "se", boot.n = 200)
+##
+## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
+## Area under the curve: 64.24%
+## 95% CI (200 stratified bootstrap replicates):
+## sp se.low se.median se.high
+## 0 100.00 100.00 100.00
+## 10 91.25 94.21 96.68
+## 20 86.25 90.00 93.34
+## 30 82.08 86.04 90.48
+## 40 72.68 79.58 85.83
+## 50 67.08 72.92 78.76
+## 60 58.75 66.46 74.60
+## 70 42.91 51.25 58.94
+## 80 27.07 34.17 40.43
+## 90 12.17 16.71 21.67
+## 100 0.00 0.00 0.00