diff --git a/ccn2019/ccn2019.rev3.Rmd b/ccn2019/ccn2019.rev3.Rmd
index 0db19ef..e62332e 100644
--- a/ccn2019/ccn2019.rev3.Rmd
+++ b/ccn2019/ccn2019.rev3.Rmd
@@ -1,14 +1,32 @@
 ---
 output: html_document
 editor_options: 
-  chunk_output_type: inline
+  chunk_output_type: console
 ---
 
 ```{r setup, message=FALSE, include=FALSE, paged.print=FALSE}
+options(repos=c(CRAN="http://cran.cnr.berkeley.edu/"))
+
 #! ===============================================
 #! load required packages
-# install.packages(c("ggplot2", "tidyverse","stringi", "pls", "caret", "here", "tsibble", "broom", "rsample", "inspectdf", "caTools", "pROC", "glmnet", "ROSE"))
+# NOTE: require() takes a single package name, not a vector, so check and
+# install the packages one at a time:
+# for (pkg in c("ggplot2", "tidyverse", "stringi", "pls", "caret", "here",
+#               "tsibble", "broom", "rsample", "inspectdf", "caTools",
+#               "pROC", "glmnet", "ROSE")) {
+#   if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
+# }
+
 library(ggplot2)
 library(tidyverse)
 library(stringi)
@@ -33,9 +51,17 @@
 
 # Preprocessing
 
+After loading the dataset and marking lure trials, a sliding window of size `window_size` is used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. In effect, this converts a dataset of participants into a dataset of sequences.
+The following code also centers and scales RT for each participant, so the average RT for each participant is 0.0 with SD = 1.
+
+The new dataset is then saved to a file for further analysis. One final chunk of code cleans up the dataset by removing outlier participants and adding `rt_cat`, which marks high or low response times.
+
+
+### Extract Lures
 The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before.
-```{r}
+```{r lures_function, include=FALSE}
 with_lures <- function(stimulus, stimulus_type, n) {
   sapply(1:length(stimulus), function(i) {
     lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
@@ -47,10 +73,6 @@
 }
 ```
 
-After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences.
-The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.
-
-The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.
 
 ```{r preprocessing, message=FALSE, warning=FALSE, include=FALSE}
@@ -127,22 +149,24 @@
 ```{r split_data}
 split_data <- function(f, data, .balance=F, p = 0.8) {
-  train.indices <- createDataPartition(data[[toString(f[[2]])]], 
+  .train.indices <- createDataPartition(data[[toString(f[[2]])]], 
                                        p = p, 
                                        list =F)
 
-  .train.imbalanced <- data[train.indices,]
+  .train.imbalanced <- data[.train.indices,]
 
-  .train <- ifelse(.balance, 
-                   ROSE(f, data = .train.imbalanced)$data,
-                   seqs.train.imbalanced)
-
-  .train.x <- model.matrix(f, seqs.train)[,-1]
+  # .train <- ifelse(.balance==T, 
+  #                  ROSE(f, data = .train.imbalanced)$data,
+  #                  .train.imbalanced)
+  .train <- .train.imbalanced
 
-  .train.y <- seqs.train[[toString(f[[2]])]]
-  .test <- seqs[-train.indices,]
-  .test.x <- model.matrix(f, seqs.test)[,-1]
-  .test.observed_y <- seqs.test[[toString(f[[2]])]]
+  .train.x <- model.matrix(f, .train)[,-1]
+  .train.y <- .train[[toString(f[[2]])]]
+
+  .test <- data[-.train.indices,]
+  .test.x <- model.matrix(f, .test)[,-1]
+  .test.observed_y <- .test[[toString(f[[2]])]]
+
   return(list(
     "train" = .train,
     "train.x" = .train.x,
@@ -162,20 +186,35 @@
 ```{r rfe_feature_selection}
 #f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
-#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
-f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll# + rt_cat
+f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
 #f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
 
 splt <- split_data(f, seqs)
 
 ctrl <- rfeControl(functions = rfFuncs,
                    method = "cv",
-                   number = 3,
+                   number = 10,
+                   rerank = T,
                    verbose = T)
-rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+
+#rfeProfile <- rfe(splt$train.x, splt$train.y,
+rfeProfile <- rfe(f, seqs,
+                  rfeControl = ctrl,
+                  sizes = 2:10
+                  )
+
+
+rfeProfile$results %>%
+  as.data.frame() %>%
+  rownames_to_column() %>%
+  ggplot(aes(x = reorder(2:10, Accuracy), y = Accuracy)) +
+  geom_bar(stat = "identity", fill = "#1F77B4", alpha = 0.8) +
+  coord_flip()
 
 message("Optimal Variables: ")
-rfeProfile$optVariables
+
+rfeProfile$metric
 
 # ROC for each categories
 filterVarImp(as.data.frame(splt$train.x), splt$train.y)
diff --git a/ccn2019/ccn2019.rev3.html b/ccn2019/ccn2019.rev3.html
new file mode 100644
index 0000000..fac97cb
--- /dev/null
+++ b/ccn2019/ccn2019.rev3.html
@@ -0,0 +1,855 @@
ccn2019.rev3.utf8.md (rendered report; HTML markup stripped)
# Preprocessing
After loading the dataset and marking lure trials, a sliding window of size `window_size` is used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. In effect, this converts a dataset of participants into a dataset of sequences. The following code also centers and scales RT for each participant, so the average RT for each participant is 0.0 with SD = 1.
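For illustration only, here is a minimal sketch of that conversion. The real feature definitions live in the (hidden) preprocessing chunk; the column names (`stimulus`, `stimulus_type`, `rt`, `correct`), the meanings given for `t`, `l`, `v`, and the default `window_size` are assumptions.

```r
# Minimal sketch of the sliding-window conversion (assumed column names):
# each window of `window_size` trials becomes one row of sequence statistics.
sequence_stats <- function(block, window_size = 8) {
  purrr::map_dfr(window_size:nrow(block), function(i) {
    w <- block[(i - window_size + 1):i, ]
    tibble::tibble(
      t       = sum(w$stimulus_type == "target"), # targets in window (assumed definition)
      l       = sum(w$stimulus_type == "lure"),   # lures in window (assumed definition)
      v       = dplyr::n_distinct(w$stimulus),    # unique items in window
      rt      = block$rt[i],                      # RT of the trial that closes the window
      correct = block$correct[i]                  # correctness of that trial
    )
  })
}
```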

The new dataset is then saved to a file for further analysis. One final chunk of code cleans up the dataset by removing outlier participants and adding `rt_cat`, which marks high or low response times.
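A minimal sketch of the per-participant scaling and the RT category mentioned above; the grouping column `participant` and the cut point for `rt_cat` are assumptions (the real ones live in the preprocessing chunk).

```r
# z-score RT within each participant, then derive a coarse RT category.
seqs <- seqs %>%
  group_by(participant) %>%
  mutate(rt = as.numeric(scale(rt))) %>%   # mean 0, SD 1 per participant
  ungroup() %>%
  mutate(rt_cat = factor(ifelse(rt > 1, "high", "mid"),  # cut point assumed
                         levels = c("mid", "high")))
```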

### Extract Lures
The following function receives a block of trials and marks lure trials, i.e., stimuli similar to the items presented N-1 and N+1 trials before; a sketch of a typical call follows.
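Applied per participant and block, the call would look roughly like this; the grouping columns and the use of `first(n)` (n is constant within a block) are assumptions.

```r
# Mark lures within each participant x block before windowing.
seqs <- seqs %>%
  group_by(participant, block) %>%
  mutate(stimulus_type = with_lures(stimulus, stimulus_type, first(n))) %>%
  ungroup()
```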

The following chunk marks highly correlated predictors, but does not work anymore; a possible repair is sketched after it.
# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
#cor_matrix <- cor(seqs[,-1])
#cor_high <- findCorrelation(cor_matrix, 0.8)
#high_cor_remove <- row.names(cor_matrix)[cor_high]
#FIXME remove by column name
#seqs.uncorr <- seqs %>% select(-high_cor_remove)
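A possible repair for the FIXME above, assuming the commented objects are computed as shown and the remaining columns are numeric: `all_of()` lets `select()` drop columns whose names are held in a character vector.

```r
# Drop the highly correlated predictors by column name.
cor_matrix <- cor(seqs[, -1])
high_cor_remove <- row.names(cor_matrix)[findCorrelation(cor_matrix, 0.8)]
seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))
```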

split_data() is a function that splits the cleaned-up dataset into training and test sets. If the .balance parameter is set to TRUE, the training set is oversampled with respect to the formula (this path is currently commented out; a possible fix is sketched after the function).
split_data <- function(f, data, .balance=F, p = 0.8) {
  .train.indices <- createDataPartition(data[[toString(f[[2]])]], 
                                       p = p, 
                                       list =F)
  
  .train.imbalanced <- data[.train.indices,]
  
  # .train <- ifelse(.balance==T, 
  #                  ROSE(f, data = .train.imbalanced)$data,
  #                  .train.imbalanced)
  .train <- .train.imbalanced

  .train.x <- model.matrix(f, .train)[,-1]
  .train.y <- .train[[toString(f[[2]])]]
  
  .test  <- data[-.train.indices,]
  .test.x <-  model.matrix(f, .test)[,-1]
  .test.observed_y <- .test[[toString(f[[2]])]]
  
  return(list(
    "train" = .train,
    "train.x" = .train.x,
    "train.y" = .train.y,
    "test" = .test,
    "test.x" = .test.x,
    "test.observed_y" = .test.observed_y
  ))
}
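The `.balance` branch was likely disabled because `ifelse()` is vectorized and cannot return a data frame. A plain conditional is a minimal sketch of what the commented lines inside `split_data()` could become:

```r
# Inside split_data(): oversample the training set with ROSE only on request.
if (isTRUE(.balance)) {
  .train <- ROSE(f, data = .train.imbalanced)$data
} else {
  .train <- .train.imbalanced
}
```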

# Feature Selection
The following chunk uses cross-validated recursive feature elimination (RFE) to extract the best features for describing the outcome of the `f` formula (here `rt`, with `rt_cat` and other outcomes available as comments). To change parameters, modify `f`. In addition to the optimal set of variables, the chunk also reports a model-free variable importance analysis for each predictor.
#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat

splt <- split_data(f, seqs)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "cv",
                   number = 3,
                   rerank = T,
                   verbose = T)
rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)

## +(rfe) fit Fold1 size: 10 
## -(rfe) fit Fold1 size: 10 
## +(rfe) imp Fold1 
## -(rfe) imp Fold1 
## +(rfe) fit Fold1 size:  8 
## -(rfe) fit Fold1 size:  8 
## +(rfe) imp Fold1 size:  8 
## -(rfe) imp Fold1 size:  8 
## +(rfe) fit Fold1 size:  4 
## -(rfe) fit Fold1 size:  4 
## +(rfe) imp Fold1 size:  4 
## -(rfe) imp Fold1 size:  4 
## +(rfe) fit Fold2 size: 10 
## -(rfe) fit Fold2 size: 10 
## +(rfe) imp Fold2 
## -(rfe) imp Fold2 
## +(rfe) fit Fold2 size:  8 
## -(rfe) fit Fold2 size:  8 
## +(rfe) imp Fold2 size:  8 
## -(rfe) imp Fold2 size:  8 
## +(rfe) fit Fold2 size:  4 
## -(rfe) fit Fold2 size:  4 
## +(rfe) imp Fold2 size:  4 
## -(rfe) imp Fold2 size:  4 
## +(rfe) fit Fold3 size: 10 
## -(rfe) fit Fold3 size: 10 
## +(rfe) imp Fold3 
## -(rfe) imp Fold3 
## +(rfe) fit Fold3 size:  8 
## -(rfe) fit Fold3 size:  8 
## +(rfe) imp Fold3 size:  8 
## -(rfe) imp Fold3 size:  8 
## +(rfe) fit Fold3 size:  4 
## -(rfe) fit Fold3 size:  4 
## +(rfe) imp Fold3 size:  4 
## -(rfe) imp Fold3 size:  4

message("Optimal Variables: ")

## Optimal Variables:

rfeProfile$optVariables

## [1] "n"  "tl" "vl" "sl"

# ROC for each categories
filterVarImp(as.data.frame(splt$train.x), splt$train.y)

##       Overall
## n  15.6157510
## t   0.4033237
## v   3.2527774
## s   3.6791786
## l   2.9241600
## vl  0.1725283
## sl  2.7719808
## tl  4.8728706
## ul  2.9101769
## ll  3.1461070
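caret also ships a plot method for `rfe` profiles; a one-liner that could be appended to this chunk to visualize performance across subset sizes:

```r
# Resampled performance across subset sizes for the RFE profile.
plot(rfeProfile, type = c("g", "o"))
```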
# Analysis of Correctness
Correctness refers to the `correct` column of the dataset, which shows whether the response was correct, for both targets and non-targets. The following code trains two models to predict the correctness of a trial without using the current stimulus and its type; it only uses predictors that describe the sliding window and the experimental condition (N, and V, the number of unique items presented in the sequence).
The goal of this analysis is to find out whether statistics extracted from previous trials via the sliding window can describe the correctness of the choice.
In addition to the model comparison plots, the final output of this section is a single ROC plot that compares the new and old models. The same chunk can be used for other categorical models, including `rt_cat` and penalized models (a penalized sketch appears at the end of this section).
Note: uncomment the desired `old.f` and `new.f` formulas to run the respective analysis.
#1. atypical local features lead to longer response time.
#old.f <- rt_cat ~ n + t + v + rt_cat
#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat

old.f <- correct ~ n + t
new.f <- correct ~ n + t + rt_cat

#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat

#2. local features provide a better prediction for the correctness of choice
#old.f <- correct ~ n + t + v
#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll

#3. Longer 

ctrl <- trainControl(method="cv",
                     number=5, 
                     classProbs=T,
                     verbose = T,
                     sampling = "up",
                     savePredictions = T,
                     summaryFunction=twoClassSummary)

train_model <- function(f, data, ctrl) {
  splt <- split_data(f, data, .balance = T)
  model <- train(splt$train.x,
               splt$train.y, 
               method = "pls",
               family = "binomial",
               metric = "ROC",
               preProc = c("nzv","center", "scale"),
               verboseIter = TRUE,
               tuneLength = 5,
               #tuneGrid = tune,
               trControl = ctrl)
  list(model  = model,
       test.y = model %>% predict(splt$test.x),
       test.y_prob = model %>% predict(splt$test.x, type="prob"),
       test.observed_y = splt$test.observed_y
  )
}
old.model <- seqs %>%
  #filter(rt_cat == "high") %>%
  train_model(old.f, ., ctrl)

## + Fold1: ncomp=1 
## - Fold1: ncomp=1 
## + Fold2: ncomp=1 
## - Fold2: ncomp=1 
## + Fold3: ncomp=1 
## - Fold3: ncomp=1 
## + Fold4: ncomp=1 
## - Fold4: ncomp=1 
## + Fold5: ncomp=1 
## - Fold5: ncomp=1 
## Aggregating results
## Fitting final model on full training set
new.model <- seqs %>%
  #filter(rt_cat == "high") %>%
  train_model(new.f, ., ctrl)

## + Fold1: ncomp=2 
## - Fold1: ncomp=2 
## + Fold2: ncomp=2 
## - Fold2: ncomp=2 
## + Fold3: ncomp=2 
## - Fold3: ncomp=2 
## + Fold4: ncomp=2 
## - Fold4: ncomp=2 
## + Fold5: ncomp=2 
## - Fold5: ncomp=2 
## Aggregating results
## Selecting tuning parameters
## Fitting ncomp = 1 on full training set
bwplot(resamples(list(old=old.model$model, new=new.model$model)))

densityplot(resamples(list(old=old.model$model, new=new.model$model)))
confusionMatrix(old.model$test.y, old.model$test.observed_y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   93 771
##        YES  39 771
##                                           
##                Accuracy : 0.5161          
##                  95% CI : (0.4919, 0.5403)
##     No Information Rate : 0.9211          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0579          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.70455         
##             Specificity : 0.50000         
##          Pos Pred Value : 0.10764         
##          Neg Pred Value : 0.95185         
##              Prevalence : 0.07885         
##          Detection Rate : 0.05556         
##    Detection Prevalence : 0.51613         
##       Balanced Accuracy : 0.60227         
##                                           
##        'Positive' Class : NO              
## 

plot(varImp(old.model$model, scale = F, useModel = F))
confusionMatrix(new.model$test.y, new.model$test.observed_y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  103 835
##        YES  29 707
##                                           
##                Accuracy : 0.4839          
##                  95% CI : (0.4597, 0.5081)
##     No Information Rate : 0.9211          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.063           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.78030         
##             Specificity : 0.45850         
##          Pos Pred Value : 0.10981         
##          Neg Pred Value : 0.96060         
##              Prevalence : 0.07885         
##          Detection Rate : 0.06153         
##    Detection Prevalence : 0.56033         
##       Balanced Accuracy : 0.61940         
##                                           
##        'Positive' Class : NO              
## 

plot(varImp(new.model$model, scale = F, useModel = F))
roc(old.model$test.observed_y,
    old.model$test.y_prob$YES,
    legacy.axes=T,
    plot = T,
    lwd=2,
    col="gray",
    print.auc=T,
    percent = T,
    print.auc.y = 40,
    print.auc.x = 55,
    lty = 1,
    of = "se",
    boot.n = 100,
    ci = T)

## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = old.model$test.observed_y, predictor = old.model$test.y_prob$YES,     percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2,     col = "gray", print.auc = T, print.auc.y = 40, print.auc.x = 55,     lty = 1, of = "se", boot.n = 100)
## 
## Data: old.model$test.y_prob$YES in 132 controls (old.model$test.observed_y NO) < 1542 cases (old.model$test.observed_y YES).
## Area under the curve: 60.23%
## 95% CI (100 stratified bootstrap replicates):
##   sp se.low se.median se.high
##    0 100.00    100.00  100.00
##   10  91.95     92.93   93.60
##   20  83.90     85.86   87.19
##   30  75.86     78.79   80.79
##   40  67.81     71.72   74.39
##   50  59.76     64.66   67.99
##   60  51.31     57.59   61.58
##   70  39.66     50.52   55.18
##   80  26.44     34.42   44.81
##   90  13.22     17.21   22.41
##  100   0.00      0.00    0.00
roc(new.model$test.observed_y,
    new.model$test.y_prob$YES,
    legacy.axes=T,
    add = T,
    plot = T,
    lwd=2,
    col="black",
    print.auc=T,
    percent = T,
    print.auc.y = 50,
    print.auc.x = 55,
    lty = 1,
    of = "se",
    boot.n = 100,
    ci = T)

## Setting levels: control = NO, case = YES
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = new.model$test.observed_y, predictor = new.model$test.y_prob$YES,     percent = T, ci = T, plot = T, legacy.axes = T, add = T,     lwd = 2, col = "black", print.auc = T, print.auc.y = 50,     print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
## 
## Data: new.model$test.y_prob$YES in 132 controls (new.model$test.observed_y NO) < 1542 cases (new.model$test.observed_y YES).
## Area under the curve: 65.52%
## 95% CI (100 stratified bootstrap replicates):
##   sp se.low se.median se.high
##    0 100.00    100.00  100.00
##   10  95.16     96.29   97.19
##   20  89.97     92.58   94.37
##   30  82.85     87.98   91.53
##   40  74.06     79.70   85.61
##   50  65.48     70.92   76.26
##   60  56.18     62.02   67.40
##   70  45.65     53.38   58.83
##   80  30.57     42.77   50.22
##   90  15.29     21.39   30.26
##  100   0.00      0.00    0.00
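As noted above, the same chunk can host penalized models. A minimal sketch, assuming caret's built-in `"glmnet"` method and reusing `ctrl` from this section; the split object name and the tuning grid are assumptions, not the authors' settings:

```r
# Elastic-net logistic regression as a drop-in replacement for PLS.
splt.c <- split_data(new.f, seqs)        # categorical-outcome split (hypothetical name)
enet.tune <- expand.grid(alpha  = c(0, 0.5, 1),                    # ridge ... lasso
                         lambda = 10^seq(-4, -1, length.out = 10))
enet.model <- train(splt.c$train.x,
                    splt.c$train.y,
                    method = "glmnet",
                    metric = "ROC",
                    preProc = c("nzv", "center", "scale"),
                    tuneGrid = enet.tune,
                    trControl = ctrl)
```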
# RT Analysis
f <- rt_cat ~ n + t + v
f <- rt_cat ~ n + v + s + tl

splt <- split_data(f, seqs)

ctrl <- trainControl(method="cv",
                     number=5, 
                     classProbs=T,
                     verbose = T,
                     #sampling = "up",
                     savePredictions = T,
                     summaryFunction=twoClassSummary)

max_components <- n_distinct(attr(terms(f),"term.labels")) - 2

# grid tune
tune <- expand.grid(ncomp=2:max_components)

model1 <- train(splt$train.x,
               splt$train.y, 
               method = "pls",
               family = "binomial",
               metric = "ROC",
               preProc = c("nzv","center", "scale"),
               verboseIter = TRUE,
               #tuneLength = 7,
               tuneGrid = tune,
               trControl = ctrl)

## + Fold1: ncomp=2 
## - Fold1: ncomp=2 
## + Fold2: ncomp=2 
## - Fold2: ncomp=2 
## + Fold3: ncomp=2 
## - Fold3: ncomp=2 
## + Fold4: ncomp=2 
## - Fold4: ncomp=2 
## + Fold5: ncomp=2 
## - Fold5: ncomp=2 
## Aggregating results
## Fitting final model on full training set
# validate model1
splt$test.y <- model1 %>% predict(splt$test.x)
splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")

confusionMatrix(splt$test.y, splt$test.observed_y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  mid high
##       mid  1435  240
##       high    0    0
##                                          
##                Accuracy : 0.8567         
##                  95% CI : (0.839, 0.8732)
##     No Information Rate : 0.8567         
##     P-Value [Acc > NIR] : 0.5172         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.8567         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.8567         
##          Detection Rate : 0.8567         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : mid            
## 
plot(varImp(model1, scale = F, useModel = F))
roc(splt$test.observed_y,
    splt$test.y_prob$mid,
    legacy.axes=T,
    plot = T,
    lwd=2,
    col="black",
    print.auc=T,
    percent = T,
    print.auc.y = 0,
    print.auc.x = 80,
    lty = 1,
    of = "se",
    boot.n = 200,
    ci = T)

## Setting levels: control = mid, case = high
## Setting direction: controls > cases
## 
## Call:
## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid,     percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2,     col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80,     lty = 1, of = "se", boot.n = 200)
## 
## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
## Area under the curve: 64.24%
## 95% CI (200 stratified bootstrap replicates):
##   sp se.low se.median se.high
##    0 100.00    100.00  100.00
##   10  91.25     94.21   96.68
##   20  86.25     90.00   93.34
##   30  82.08     86.04   90.48
##   40  72.68     79.58   85.83
##   50  67.08     72.92   78.76
##   60  58.75     66.46   74.60
##   70  42.91     51.25   58.94
##   80  27.07     34.17   40.43
##   90  12.17     16.71   21.67
##  100   0.00      0.00    0.00
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
index 7da6569..5ae7299 100644
--- a/ccn2019/ccn2019_diagrams.R
+++ b/ccn2019/ccn2019_diagrams.R
@@ -49,3 +49,70 @@
   mutate(condition=ifelse(condition=='2-back',2,3)) %>%
   inspect_cor(show_plot = T)
+
+
+base.df <- data.frame(x=100-base.roc$specificities,
+                      y=base.roc$sensitivities,
+                      auc = base.roc$auc[1],
+                      model="base")
+
+extd.df <- data.frame(x=100-extd.roc$specificities,
+                      y=extd.roc$sensitivities,
+                      auc = extd.roc$auc[1],
+                      model="extended")
+
+chance.df <- data.frame(x=1:100, y=1:100, model=" ", auc=50)
+
+library(ggrepel)
+dats <- rbind(extd.df, base.df, chance.df)
+
+to_auc_label <- function(model, auc) {
+  paste(model,
+        "\nAUC=",
+        format(auc, digits=4),
+        sep = ""
+  )
+}
+
+dats$label = NA
+dats[174,]$label = to_auc_label("Extended Model", dats[174,]$auc)
+dats[647,]$label = to_auc_label("Base Model", dats[647,]$auc)
+
+
+dats %>%
+  ggplot(aes(x=x, y=y,
+             group=model,
+             color = model,
+             linetype = factor(model))) +
+  geom_line() +
+  labs(title="AUCs for the base and extended models") +
+  geom_label_repel(aes(label=label), na.rm = TRUE, box.padding = 2) +
+  xlab("100% - Specificity") +
+  ylab("Sensitivity") +
+  theme_linedraw() +
+  #scale_x_continuous(labels = scales::percent) +
+  #scale_y_continuous(labels = scales::percent) +
+  scale_fill_brewer(palette = "Greens") +
+  scale_color_manual(values=c("black", "#808080", "gray")) +
+  theme(legend.position = "none")
+
+ggsave("fig1.png", plot = last_plot(), width = 4, height = 4)
+
+boruta_scores %>%
+  mutate(feature = row.names(.)) %>%
+  arrange(meanImp) %>%
+  ggplot(aes(x=reorder(feature,-meanImp), y=meanImp, fill=decision)) +
+  geom_bar(stat = "identity") +
+  ylab("Relative Importance Score") +
+  xlab("Feature") +
+  theme_linedraw() +
+  scale_fill_grey() +
+  labs(fill = "Selection Decision") +
+  theme(
+    legend.position = c(.95, .95),
+    legend.justification = c("right", "top"),
+    legend.box.just = "right",
+    legend.margin = margin(6, 6, 6, 6)
+  )
+ +The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times. + + +### Extract Lures The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before. -```{r} +```{r lures_function, include=FALSE} with_lures <- function(stimulus, stimulus_type, n) { sapply(1:length(stimulus), function(i) { lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1])) @@ -47,10 +73,6 @@ } ``` -After loading data set and marking lure trials, a sliding window of the size `window_size` will be used to extract statistics including `t`, `l`, `sl`, `s`, `u`, `ul`, `tl`, `a`, `al` for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. -The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1. - -The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times. ```{r preprocessing, message=FALSE, warning=FALSE, include=FALSE} @@ -127,22 +149,24 @@ ```{r split_data} split_data <- function(f, data, .balance=F, p = 0.8) { - train.indices <- createDataPartition(data[[toString(f[[2]])]], + .train.indices <- createDataPartition(data[[toString(f[[2]])]], p = p, list =F) - .train.imbalanced <- data[train.indices,] + .train.imbalanced <- data[.train.indices,] - .train <- ifelse(.balance, - ROSE(f, data = .train.imbalanced)$data, - seqs.train.imbalanced) - - .train.x <- model.matrix(f, seqs.train)[,-1] + # .train <- ifelse(.balance==T, + # ROSE(f, data = .train.imbalanced)$data, + # .train.imbalanced) + .train <- .train.imbalanced - .train.y <- seqs.train[[toString(f[[2]])]] - .test <- seqs[-train.indices,] - .test.x <- model.matrix(f, seqs.test)[,-1] - .test.observed_y <- seqs.test[[toString(f[[2]])]] + .train.x <- model.matrix(f, .train)[,-1] + .train.y <- .train[[toString(f[[2]])]] + + .test <- data[-.train.indices,] + .test.x <- model.matrix(f, .test)[,-1] + .test.observed_y <- .test[[toString(f[[2]])]] + return(list( "train" = .train, "train.x" = .train.x, @@ -162,20 +186,35 @@ ```{r rfe_feature_selection} #f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll -#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat -f <- rt ~ n + t + s + v + l + vl + sl + tl + ul + ll# + rt_cat +f <- correct ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat +f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat #f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat splt <- split_data(f, seqs) ctrl <- rfeControl(functions = rfFuncs, method = "cv", - number = 3, + number = 10, + rerank = T, verbose = T) -rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl) + +#rfeProfile <- rfe(splt$train.x, splt$train.y, +rfeProfile <- rfe(f, seqs, + rfeControl = ctrl, + sizes = 2:10 + ) + + +rfeProfile$results %>% + as.data.frame() %>% + rownames_to_column() %>% + ggplot(aes(x = reorder(2:10, Accuracy), y = Accuracy)) + + geom_bar(stat = "identity", fill = "#1F77B4", alpha = 0.8) + + coord_flip() message("Optimal Variables: ") -rfeProfile$optVariables + +rfeProfile$metric # ROC for each categories filterVarImp(as.data.frame(splt$train.x), splt$train.y) diff --git a/ccn2019/ccn2019.rev3.html 
b/ccn2019/ccn2019.rev3.html new file mode 100644 index 0000000..fac97cb --- /dev/null +++ b/ccn2019/ccn2019.rev3.html @@ -0,0 +1,855 @@ + + + + + + + + + + + + + +ccn2019.rev3.utf8.md + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Preprocessing

+

After loading data set and marking lure trials, a sliding window of the size window_size will be used to extract statistics including t, l, sl, s, u, ul, tl, a, al for each sequence. Here what we acutually do is to convert a dataset of participants to a dataset of sequences. The following code also centers and scales RT for each participant, so average RT for each participant will be 0.0 with SD=1.

+

The new data set is then saved in a file for further analysis. One final chuck of codes clean up the dataset by removing outlier participants and adding RT_CAT which shows high or low response times.

+
+

Extract Lures

+

The following function receives a block of trials, and marks lure trials, which are those stimuli similar to the items presented in N-1 and N+1 trials before.

The following chunk was meant to flag highly correlated predictors as an extra step for non-PLS methods, but it no longer works and is kept commented out.
# WIP: This is an extra step for non-pls methods to remove highly correlated predictors
+#cor_matrix <- cor(seqs[,-1])
+#cor_high <- findCorrelation(cor_matrix, 0.8)
+#high_cor_remove <- row.names(cor_matrix)[cor_high]
+# dropping by column name requires all_of() inside select():
+#seqs.uncorr <- seqs %>% select(-all_of(high_cor_remove))

split_data() splits the cleaned-up dataset into training and test sets. If the .balance parameter is set to TRUE, the training set is meant to be oversampled with ROSE with respect to the outcome in the formula; note that the ROSE call is currently commented out, so balancing is effectively disabled.

split_data <- function(f, data, .balance=F, p = 0.8) {
+  .train.indices <- createDataPartition(data[[toString(f[[2]])]], 
+                                       p = p, 
+                                       list =F)
+  
+  .train.imbalanced <- data[.train.indices,]
+  
+  # .train <- ifelse(.balance==T, 
+  #                  ROSE(f, data = .train.imbalanced)$data,
+  #                  .train.imbalanced)
+  .train <- .train.imbalanced
+
+  .train.x <- model.matrix(f, .train)[,-1]
+  .train.y <- .train[[toString(f[[2]])]]
+  
+  .test  <- data[-.train.indices,]
+  .test.x <-  model.matrix(f, .test)[,-1]
+  .test.observed_y <- .test[[toString(f[[2]])]]
+  
+  return(list(
+    "train" = .train,
+    "train.x" = .train.x,
+    "train.y" = .train.y,
+    "test" = .test,
+    "test.x" = .test.x,
+    "test.observed_y" = .test.observed_y
+  ))
+}
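
A quick usage sketch of the returned list (a hypothetical call; seqs and the correct column come from the hidden preprocessing chunk):

```r
f_demo <- correct ~ n + t + v
splt_demo <- split_data(f_demo, seqs, p = 0.8)

dim(splt_demo$train.x)             # design matrix of the training partition
table(splt_demo$train.y)           # outcome distribution in the training set
length(splt_demo$test.observed_y)  # held-out outcomes used for evaluation
```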
Feature Selection

The following chunk uses cross-validated recursive feature elimination (RFE) to extract the best features for describing rt_cat. To change the target or predictors, modify the formula f. In addition to the optimal set of variables, the chunk reports a model-free variable importance score for each predictor.

#f <- rt_cat ~ n + t + s + v + l + vl + sl + tl + ul + ll
+#f <- correct ~ n + t + s + v + l + vl + sl + tl + ul + ll + rt_cat
+f <- rt ~ n + t + v + s + l + vl + sl + tl + ul + ll# + rt_cat
+#f <- a ~ n + t + v + l + vl + sl + tl + ul + ll# + rt_cat
+
+splt <- split_data(f, seqs)
+
+ctrl <- rfeControl(functions = rfFuncs,
+                   method = "cv",
+                   number = 3,
+                   rerank = T,
+                   verbose = T)
+rfeProfile <- rfe(splt$train.x, splt$train.y, rfeControl = ctrl)
+
## +(rfe) fit Fold1 size: 10 
+## -(rfe) fit Fold1 size: 10 
+## +(rfe) imp Fold1 
+## -(rfe) imp Fold1 
+## +(rfe) fit Fold1 size:  8 
+## -(rfe) fit Fold1 size:  8 
+## +(rfe) imp Fold1 size:  8 
+## -(rfe) imp Fold1 size:  8 
+## +(rfe) fit Fold1 size:  4 
+## -(rfe) fit Fold1 size:  4 
+## +(rfe) imp Fold1 size:  4 
+## -(rfe) imp Fold1 size:  4 
+## +(rfe) fit Fold2 size: 10 
+## -(rfe) fit Fold2 size: 10 
+## +(rfe) imp Fold2 
+## -(rfe) imp Fold2 
+## +(rfe) fit Fold2 size:  8 
+## -(rfe) fit Fold2 size:  8 
+## +(rfe) imp Fold2 size:  8 
+## -(rfe) imp Fold2 size:  8 
+## +(rfe) fit Fold2 size:  4 
+## -(rfe) fit Fold2 size:  4 
+## +(rfe) imp Fold2 size:  4 
+## -(rfe) imp Fold2 size:  4 
+## +(rfe) fit Fold3 size: 10 
+## -(rfe) fit Fold3 size: 10 
+## +(rfe) imp Fold3 
+## -(rfe) imp Fold3 
+## +(rfe) fit Fold3 size:  8 
+## -(rfe) fit Fold3 size:  8 
+## +(rfe) imp Fold3 size:  8 
+## -(rfe) imp Fold3 size:  8 
+## +(rfe) fit Fold3 size:  4 
+## -(rfe) fit Fold3 size:  4 
+## +(rfe) imp Fold3 size:  4 
+## -(rfe) imp Fold3 size:  4
+
message("Optimal Variables: ")
+
## Optimal Variables:
+
rfeProfile$optVariables
+
## [1] "n"  "tl" "vl" "sl"
+
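For reference, caret also provides accessors for inspecting an rfe profile beyond optVariables (a sketch; predictors() and the lattice plot() method are part of caret's rfe interface):

```r
predictors(rfeProfile)                # the selected subset, same as rfeProfile$optVariables
plot(rfeProfile, type = c("g", "o"))  # cross-validated performance across subset sizes
```
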
# model-free (filter) variable importance for each predictor
+filterVarImp(as.data.frame(splt$train.x), splt$train.y)
+
##       Overall
+## n  15.6157510
+## t   0.4033237
+## v   3.2527774
+## s   3.6791786
+## l   2.9241600
+## vl  0.1725283
+## sl  2.7719808
+## tl  4.8728706
+## ul  2.9101769
+## ll  3.1461070
+
Analysis of Correctness

Correctness refers to the correct column of the dataset, which indicates whether the response was correct, for both targets and non-targets. The following code trains two models to predict the correctness of a trial without using the current stimulus or its type; it only uses predictors that describe the sliding window and the experimental condition (N, and V, the number of unique items presented in the sequence).

The goal of this analysis is to find out whether statistics extracted from previous trials via the sliding window can describe the correctness of the choice.

In addition to the model comparison plots, the final output of this section is a single ROC plot that compares the new and old models. The same chunk can be used for other categorical models, including RT_CAT and penalized models.

Note: uncomment the desired old.f and new.f formulas to run the respective analysis.

#1. atypical local features lead to longer response time.
+#old.f <- rt_cat ~ n + t + v + rt_cat
+#new.f <- rt_cat ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+old.f <- correct ~ n + t
+new.f <- correct ~ n + t + rt_cat
+
+#old.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll + rt_cat
+
+#2. local features provide a better prediction for the correctness of choice
+#old.f <- correct ~ n + t + v
+#new.f <- correct ~ n + t + tl + l + sl + vl + sl + tl + ul + ll
+
+#3. Longer 
+
+ctrl <- trainControl(method="cv",
+                     number=5, 
+                     classProbs=T,
+                     verbose = T,
+                     sampling = "up",
+                     savePredictions = T,
+                     summaryFunction=twoClassSummary)
+  
+train_model <- function(f, data, ctrl) {
+  splt <- split_data(f, data, .balance = T)
+  model <- train(splt$train.x,
+               splt$train.y, 
+               method = "pls",
+               family = "binomial",
+               metric = "ROC",
+               preProc = c("nzv","center", "scale"),
+               verboseIter = TRUE,
+               tuneLength = 5,
+               #tuneGrid = tune,
+               trControl = ctrl)
+  list(model  = model,
+       test.y = model %>% predict(splt$test.x),
+       test.y_prob = model %>% predict(splt$test.x, type="prob"),
+       test.observed_y = splt$test.observed_y
+  )
+}
+
+old.model <- seqs %>%
+  #filter(rt_cat == "high") %>%
+  train_model(old.f, ., ctrl)
+
## + Fold1: ncomp=1 
+## - Fold1: ncomp=1 
+## + Fold2: ncomp=1 
+## - Fold2: ncomp=1 
+## + Fold3: ncomp=1 
+## - Fold3: ncomp=1 
+## + Fold4: ncomp=1 
+## - Fold4: ncomp=1 
+## + Fold5: ncomp=1 
+## - Fold5: ncomp=1 
+## Aggregating results
+## Fitting final model on full training set
+
new.model <- seqs %>%
+  #filter(rt_cat == "high") %>%
+  train_model(new.f, ., ctrl)
+
## + Fold1: ncomp=2 
+## - Fold1: ncomp=2 
+## + Fold2: ncomp=2 
+## - Fold2: ncomp=2 
+## + Fold3: ncomp=2 
+## - Fold3: ncomp=2 
+## + Fold4: ncomp=2 
+## - Fold4: ncomp=2 
+## + Fold5: ncomp=2 
+## - Fold5: ncomp=2 
+## Aggregating results
+## Selecting tuning parameters
+## Fitting ncomp = 1 on full training set
+
bwplot(resamples(list(old=old.model$model, new=new.model$model)))
[figure: box-and-whisker plot of resampled metrics for the old and new models]
densityplot(resamples(list(old=old.model$model, new=new.model$model)))
[figure: density plot of resampled metrics for the old and new models]
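
A paired comparison of the resampled metrics can back up these plots (a sketch using caret's diff() method for resamples objects):

```r
rs <- resamples(list(old = old.model$model, new = new.model$model))
summary(diff(rs))  # paired t-tests on resampled ROC/Sens/Spec differences
```
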
confusionMatrix(old.model$test.y, old.model$test.observed_y)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction  NO YES
+##        NO   93 771
+##        YES  39 771
+##                                           
+##                Accuracy : 0.5161          
+##                  95% CI : (0.4919, 0.5403)
+##     No Information Rate : 0.9211          
+##     P-Value [Acc > NIR] : 1               
+##                                           
+##                   Kappa : 0.0579          
+##                                           
+##  Mcnemar's Test P-Value : <2e-16          
+##                                           
+##             Sensitivity : 0.70455         
+##             Specificity : 0.50000         
+##          Pos Pred Value : 0.10764         
+##          Neg Pred Value : 0.95185         
+##              Prevalence : 0.07885         
+##          Detection Rate : 0.05556         
+##    Detection Prevalence : 0.51613         
+##       Balanced Accuracy : 0.60227         
+##                                           
+##        'Positive' Class : NO              
+## 
+
plot(varImp(old.model$model, scale = F, useModel = F))
[figure: filter-based variable importance for the old model]
confusionMatrix(new.model$test.y, new.model$test.observed_y)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction  NO YES
+##        NO  103 835
+##        YES  29 707
+##                                           
+##                Accuracy : 0.4839          
+##                  95% CI : (0.4597, 0.5081)
+##     No Information Rate : 0.9211          
+##     P-Value [Acc > NIR] : 1               
+##                                           
+##                   Kappa : 0.063           
+##                                           
+##  Mcnemar's Test P-Value : <2e-16          
+##                                           
+##             Sensitivity : 0.78030         
+##             Specificity : 0.45850         
+##          Pos Pred Value : 0.10981         
+##          Neg Pred Value : 0.96060         
+##              Prevalence : 0.07885         
+##          Detection Rate : 0.06153         
+##    Detection Prevalence : 0.56033         
+##       Balanced Accuracy : 0.61940         
+##                                           
+##        'Positive' Class : NO              
+## 
+
plot(varImp(new.model$model, scale = F, useModel = F))
[figure: filter-based variable importance for the new model]
roc(old.model$test.observed_y,
+    old.model$test.y_prob$YES,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="gray",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 40,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
## Setting levels: control = NO, case = YES
+
## Setting direction: controls < cases
+
## 
+## Call:
+## roc.default(response = old.model$test.observed_y, predictor = old.model$test.y_prob$YES,     percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2,     col = "gray", print.auc = T, print.auc.y = 40, print.auc.x = 55,     lty = 1, of = "se", boot.n = 100)
+## 
+## Data: old.model$test.y_prob$YES in 132 controls (old.model$test.observed_y NO) < 1542 cases (old.model$test.observed_y YES).
+## Area under the curve: 60.23%
+## 95% CI (100 stratified bootstrap replicates):
+##   sp se.low se.median se.high
+##    0 100.00    100.00  100.00
+##   10  91.95     92.93   93.60
+##   20  83.90     85.86   87.19
+##   30  75.86     78.79   80.79
+##   40  67.81     71.72   74.39
+##   50  59.76     64.66   67.99
+##   60  51.31     57.59   61.58
+##   70  39.66     50.52   55.18
+##   80  26.44     34.42   44.81
+##   90  13.22     17.21   22.41
+##  100   0.00      0.00    0.00
+
roc(new.model$test.observed_y,
+    new.model$test.y_prob$YES,
+    legacy.axes=T,
+    add = T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 50,
+    print.auc.x = 55,
+    lty = 1,
+    of = "se",
+    boot.n = 100,
+    ci = T)
+
## Setting levels: control = NO, case = YES
+## Setting direction: controls < cases
[figure: ROC curves for the old (gray) and new (black) models]
## 
+## Call:
+## roc.default(response = new.model$test.observed_y, predictor = new.model$test.y_prob$YES,     percent = T, ci = T, plot = T, legacy.axes = T, add = T,     lwd = 2, col = "black", print.auc = T, print.auc.y = 50,     print.auc.x = 55, lty = 1, of = "se", boot.n = 100)
+## 
+## Data: new.model$test.y_prob$YES in 132 controls (new.model$test.observed_y NO) < 1542 cases (new.model$test.observed_y YES).
+## Area under the curve: 65.52%
+## 95% CI (100 stratified bootstrap replicates):
+##   sp se.low se.median se.high
+##    0 100.00    100.00  100.00
+##   10  95.16     96.29   97.19
+##   20  89.97     92.58   94.37
+##   30  82.85     87.98   91.53
+##   40  74.06     79.70   85.61
+##   50  65.48     70.92   76.26
+##   60  56.18     62.02   67.40
+##   70  45.65     53.38   58.83
+##   80  30.57     42.77   50.22
+##   90  15.29     21.39   30.26
+##  100   0.00      0.00    0.00
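
The AUC difference between the two models can also be tested formally with pROC's roc.test() (a sketch; it assumes the two roc objects above are captured in old.roc and new.roc instead of being printed):

```r
old.roc <- roc(old.model$test.observed_y, old.model$test.y_prob$YES)
new.roc <- roc(new.model$test.observed_y, new.model$test.y_prob$YES)
roc.test(old.roc, new.roc, method = "delong", paired = TRUE)  # DeLong's test for paired ROC curves
```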
RT Analysis

The same pipeline is then applied to the response-time category (rt_cat) as the outcome.

f <- rt_cat ~ n + t + v
+f <- rt_cat ~ n + v + s + tl
+
+splt <- split_data(f, seqs)
+
+ctrl <- trainControl(method="cv",
+                     number=5, 
+                     classProbs=T,
+                     verbose = T,
+                     #sampling = "up",
+                     savePredictions = T,
+                     summaryFunction=twoClassSummary)
+
+max_components <- n_distinct(attr(terms(f),"term.labels")) - 2  # upper bound for ncomp: two fewer than the number of predictors
+
+# grid tune
+tune <- expand.grid(ncomp=2:max_components)
+
+model1 <- train(splt$train.x,
+               splt$train.y, 
+               method = "pls",
+               family = "binomial",
+               metric = "ROC",
+               preProc = c("nzv","center", "scale"),
+               verboseIter = TRUE,
+               #tuneLength = 7,
+               tuneGrid = tune,
+               trControl = ctrl)
+
## + Fold1: ncomp=2 
+## - Fold1: ncomp=2 
+## + Fold2: ncomp=2 
+## - Fold2: ncomp=2 
+## + Fold3: ncomp=2 
+## - Fold3: ncomp=2 
+## + Fold4: ncomp=2 
+## - Fold4: ncomp=2 
+## + Fold5: ncomp=2 
+## - Fold5: ncomp=2 
+## Aggregating results
+## Fitting final model on full training set
+
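Before validating on the held-out split, the cross-validated selection can be inspected directly (a sketch using standard caret accessors):

```r
model1$bestTune       # the ncomp value selected by cross-validation
getTrainPerf(model1)  # resampled ROC, sensitivity, and specificity
```
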
# validate model1
+splt$test.y <- model1 %>% predict(splt$test.x)
+splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")
+
+confusionMatrix(splt$test.y, splt$test.observed_y)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction  mid high
+##       mid  1435  240
+##       high    0    0
+##                                          
+##                Accuracy : 0.8567         
+##                  95% CI : (0.839, 0.8732)
+##     No Information Rate : 0.8567         
+##     P-Value [Acc > NIR] : 0.5172         
+##                                          
+##                   Kappa : 0              
+##                                          
+##  Mcnemar's Test P-Value : <2e-16         
+##                                          
+##             Sensitivity : 1.0000         
+##             Specificity : 0.0000         
+##          Pos Pred Value : 0.8567         
+##          Neg Pred Value :    NaN         
+##              Prevalence : 0.8567         
+##          Detection Rate : 0.8567         
+##    Detection Prevalence : 1.0000         
+##       Balanced Accuracy : 0.5000         
+##                                          
+##        'Positive' Class : mid            
+## 
+
plot(varImp(model1, scale = F, useModel = F))
[figure: filter-based variable importance for model1]
roc(splt$test.observed_y,
+    splt$test.y_prob$mid,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 0,
+    print.auc.x = 80,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
## Setting levels: control = mid, case = high
+
## Setting direction: controls > cases
[figure: ROC curve for the rt_cat model]
## 
+## Call:
+## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid,     percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2,     col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80,     lty = 1, of = "se", boot.n = 200)
+## 
+## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
+## Area under the curve: 64.24%
+## 95% CI (200 stratified bootstrap replicates):
+##   sp se.low se.median se.high
+##    0 100.00    100.00  100.00
+##   10  91.25     94.21   96.68
+##   20  86.25     90.00   93.34
+##   30  82.08     86.04   90.48
+##   40  72.68     79.58   85.83
+##   50  67.08     72.92   78.76
+##   60  58.75     66.46   74.60
+##   70  42.91     51.25   58.94
+##   80  27.07     34.17   40.43
+##   90  12.17     16.71   21.67
+##  100   0.00      0.00    0.00
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
index 7da6569..5ae7299 100644
--- a/ccn2019/ccn2019_diagrams.R
+++ b/ccn2019/ccn2019_diagrams.R
@@ -49,3 +49,70 @@
   mutate(condition=ifelse(condition=='2-back',2,3)) %>%
   inspect_cor(show_plot = T)
+
+
+base.df <- data.frame(x=100-base.roc$specificities,
+                      y=base.roc$sensitivities,
+                      auc = base.roc$auc[1],
+                      model="base")
+
+extd.df <- data.frame(x=100-extd.roc$specificities,
+                      y=extd.roc$sensitivities,
+                      auc = extd.roc$auc[1],
+                      model="extended")
+
+chance.df <- data.frame(x=1:100, y=1:100, model=" ", auc=50)
+
+library(ggrepel)
+dats <- rbind(extd.df, base.df, chance.df)
+
+to_auc_label <- function(model, auc) {
+  paste(model,
+        "\nAUC=",
+        format(auc, digits=4),
+        sep = ""
+  )
+}
+
+dats$label = NA
+dats[174,]$label = to_auc_label("Extended Model", dats[174,]$auc)
+dats[647,]$label = to_auc_label("Base Model", dats[647,]$auc)
+
+dats %>%
+  ggplot(aes(x=x, y=y,
+             group=model,
+             color = model,
+             linetype = factor(model))) +
+  geom_line() +
+  labs(title="AUCs for the base and extended models") +
+  geom_label_repel(aes(label=label), na.rm = TRUE, box.padding = 2) +
+  xlab("100% - Specificity") +
+  ylab("Sensitivity") +
+  theme_linedraw() +
+  #scale_x_continuous(labels = scales::percent) +
+  #scale_y_continuous(labels = scales::percent) +
+  scale_fill_brewer(palette = "Greens") +
+  scale_color_manual(values=c("black", "#808080", "gray")) +
+  theme(legend.position = "none")
+
+ggsave("fig1.png", plot = last_plot(), width = 4, height = 4)
+
+boruta_scores %>%
+  mutate(feature = row.names(.)) %>%
+  arrange(meanImp) %>%
+  ggplot(aes(x=reorder(feature,-meanImp), y=meanImp, fill=decision)) +
+  geom_bar(stat = "identity") +
+  ylab("Relative Importance Score") +
+  xlab("Feature") +
+  theme_linedraw() +
+  scale_fill_grey() +
+  labs(fill = "Selection Decision") +
+  theme(
+    legend.position = c(.95, .95),
+    legend.justification = c("right", "top"),
+    legend.box.just = "right",
+    legend.margin = margin(6, 6, 6, 6)
+  )
diff --git a/data/ccn2019_boruta_scores.rda b/data/ccn2019_boruta_scores.rda
new file mode 100644
index 0000000..47aa1a6
--- /dev/null
+++ b/data/ccn2019_boruta_scores.rda
Binary files differ
diff --git a/data/nback_raw_seqs.Rda b/data/nback_raw_seqs.Rda
new file mode 100644
index 0000000..6ca2c1b
--- /dev/null
+++ b/data/nback_raw_seqs.Rda
Binary files differ
+
+
+
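+# A paired DeLong test would show whether the two AUCs differ reliably rather
+# than by resampling noise. Sketch only: `old.roc` and `new.roc` are
+# hypothetical names for the two roc objects computed above, captured as
+# e.g. `old.roc <- roc(...)`.
+# roc.test(old.roc, new.roc, method = "delong", paired = TRUE)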

# RT Analysis

+
+#f <- rt_cat ~ n + t + v
+f <- rt_cat ~ n + v + s + tl
+
+splt <- split_data(f, seqs)
+
+ctrl <- trainControl(method="cv",
+                     number=5, 
+                     classProbs=T,
+                     verboseIter = T,
+                     #sampling = "up",
+                     savePredictions = T,
+                     summaryFunction=twoClassSummary)
+
+# Upper bound on the number of PLS components: `f` has 4 unique terms, so
+# n_distinct(...) - 2 = 2 and the grid below reduces to ncomp = 2 only
+# (matching the fold log that follows).
+max_components <- n_distinct(attr(terms(f),"term.labels")) - 2
+
+# grid tune
+tune <- expand.grid(ncomp=2:max_components)
+
+model1 <- train(splt$train.x,
+                splt$train.y,
+                method = "pls",
+                metric = "ROC",
+                preProc = c("nzv", "center", "scale"),
+                #tuneLength = 7,
+                tuneGrid = tune,
+                trControl = ctrl)
+
## + Fold1: ncomp=2 
+## - Fold1: ncomp=2 
+## + Fold2: ncomp=2 
+## - Fold2: ncomp=2 
+## + Fold3: ncomp=2 
+## - Fold3: ncomp=2 
+## + Fold4: ncomp=2 
+## - Fold4: ncomp=2 
+## + Fold5: ncomp=2 
+## - Fold5: ncomp=2 
+## Aggregating results
+## Fitting final model on full training set
+
# validate model1
+splt$test.y <- model1 %>% predict(splt$test.x)
+splt$test.y_prob <- model1 %>% predict(splt$test.x, type="prob")
+
+confusionMatrix(splt$test.y, splt$test.observed_y)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction  mid high
+##       mid  1435  240
+##       high    0    0
+##                                          
+##                Accuracy : 0.8567         
+##                  95% CI : (0.839, 0.8732)
+##     No Information Rate : 0.8567         
+##     P-Value [Acc > NIR] : 0.5172         
+##                                          
+##                   Kappa : 0              
+##                                          
+##  Mcnemar's Test P-Value : <2e-16         
+##                                          
+##             Sensitivity : 1.0000         
+##             Specificity : 0.0000         
+##          Pos Pred Value : 0.8567         
+##          Neg Pred Value :    NaN         
+##              Prevalence : 0.8567         
+##          Detection Rate : 0.8567         
+##    Detection Prevalence : 1.0000         
+##       Balanced Accuracy : 0.5000         
+##                                          
+##        'Positive' Class : mid            
+## 
+
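+# At the default 0.5 cutoff the model never predicts "high" (specificity = 0),
+# as expected given the ~86/14 class split and that `sampling = "up"` is
+# commented out in `ctrl` for this analysis. One option (sketch, not run) is
+# to re-enable up-sampling and refit:
+# ctrl <- trainControl(method = "cv", number = 5, classProbs = T,
+#                      verboseIter = T, sampling = "up", savePredictions = T,
+#                      summaryFunction = twoClassSummary)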
plot(varImp(model1, scale = F, useModel = F))
+

+
roc(splt$test.observed_y,
+    splt$test.y_prob$mid,
+    legacy.axes=T,
+    plot = T,
+    lwd=2,
+    col="black",
+    print.auc=T,
+    percent = T,
+    print.auc.y = 0,
+    print.auc.x = 80,
+    lty = 1,
+    of = "se",
+    boot.n = 200,
+    ci = T)
+
## Setting levels: control = mid, case = high
+
## Setting direction: controls > cases
+

+
## 
+## Call:
+## roc.default(response = splt$test.observed_y, predictor = splt$test.y_prob$mid,     percent = T, ci = T, plot = T, legacy.axes = T, lwd = 2,     col = "black", print.auc = T, print.auc.y = 0, print.auc.x = 80,     lty = 1, of = "se", boot.n = 200)
+## 
+## Data: splt$test.y_prob$mid in 1435 controls (splt$test.observed_y mid) > 240 cases (splt$test.observed_y high).
+## Area under the curve: 64.24%
+## 95% CI (200 stratified bootstrap replicates):
+##   sp se.low se.median se.high
+##    0 100.00    100.00  100.00
+##   10  91.25     94.21   96.68
+##   20  86.25     90.00   93.34
+##   30  82.08     86.04   90.48
+##   40  72.68     79.58   85.83
+##   50  67.08     72.92   78.76
+##   60  58.75     66.46   74.60
+##   70  42.91     51.25   58.94
+##   80  27.07     34.17   40.43
+##   90  12.17     16.71   21.67
+##  100   0.00      0.00    0.00
+
+
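+# Alternatively, a decision threshold can be chosen from the ROC curve itself.
+# Sketch only: `rt.roc` is a hypothetical name for the roc object returned by
+# the call above (i.e., rt.roc <- roc(...)).
+# coords(rt.roc, "best", best.method = "youden",
+#        ret = c("threshold", "specificity", "sensitivity"))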
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ccn2019/ccn2019_diagrams.R b/ccn2019/ccn2019_diagrams.R
index 7da6569..5ae7299 100644
--- a/ccn2019/ccn2019_diagrams.R
+++ b/ccn2019/ccn2019_diagrams.R
@@ -49,3 +49,70 @@
  mutate(condition=ifelse(condition=='2-back',2,3)) %>%
  inspect_cor(show_plot = T)
+
+
+
+base.df <- data.frame(x=100-base.roc$specificities,
+                      y=base.roc$sensitivities,
+                      auc = base.roc$auc[1],
+                      model="base")
+
+extd.df <- data.frame(x=100-extd.roc$specificities,
+                      y=extd.roc$sensitivities,
+                      auc = extd.roc$auc[1],
+                      model="extended")
+
+chance.df <- data.frame(x=1:100, y=1:100, model=" ", auc=50)
+
+library(ggrepel)
+dats <- rbind(extd.df, base.df, chance.df)
+
+to_auc_label <- function(model, auc) {
+  paste(model,
+        "\nAUC=",
+        format(auc, digits=4),
+        sep = ""
+  )
+}
+
+dats$label = NA
+dats[174,]$label = to_auc_label("Extended Model", dats[174,]$auc)
+dats[647,]$label = to_auc_label("Base Model", dats[647,]$auc)
+
+
+dats %>%
+  ggplot(aes(x=x, y=y,
+             group=model,
+             color = model,
+             linetype = factor(model))) +
+  geom_line() +
+  labs(title="AUCs for the base and extended models") +
+  geom_label_repel(aes(label=label), na.rm = TRUE, box.padding = 2) +
+  xlab("100% - Specificity") +
+  ylab("Sensitivity") +
+  theme_linedraw() +
+  #scale_x_continuous(labels = scales::percent) +
+  #scale_y_continuous(labels = scales::percent) +
+  scale_fill_brewer(palette = "Greens") +
+  scale_color_manual(values=c("black", "#808080", "gray")) +
+  theme(legend.position = "none")
+
+ggsave("fig1.png", plot = last_plot(), width = 4, height = 4)
+
+boruta_scores %>%
+  mutate(feature = row.names(.)) %>%
+  arrange(meanImp) %>%
+  ggplot(aes(x=reorder(feature,-meanImp), y=meanImp, fill=decision)) +
+  geom_bar(stat = "identity") +
+  ylab("Relative Importance Score") +
+  xlab("Feature") +
+  theme_linedraw() +
+  scale_fill_grey() +
+  labs(fill = "Selection Decision") +
+  theme(
+    legend.position = c(.95, .95),
+    legend.justification = c("right", "top"),
+    legend.box.just = "right",
+    legend.margin = margin(6, 6, 6, 6)
+  )
+
diff --git a/data/ccn2019_boruta_scores.rda b/data/ccn2019_boruta_scores.rda
new file mode 100644
index 0000000..47aa1a6
--- /dev/null
+++ b/data/ccn2019_boruta_scores.rda
Binary files differ
diff --git a/data/nback_raw_seqs.Rda b/data/nback_raw_seqs.Rda
new file mode 100644
index 0000000..6ca2c1b
--- /dev/null
+++ b/data/nback_raw_seqs.Rda
Binary files differ
diff --git a/data/nback_seqs.Rd b/data/nback_seqs.Rd
deleted file mode 100644
index 6ca2c1b..0000000
--- a/data/nback_seqs.Rd
+++ /dev/null
Binary files differ