diff --git a/ccn2019.rev3.Rmd b/ccn2019.rev3.Rmd
index 9f38d9b..7a879be 100644
--- a/ccn2019.rev3.Rmd
+++ b/ccn2019.rev3.Rmd
@@ -13,6 +13,7 @@
 library(rsample)
 library(inspectdf)
 library(caTools)
+library(pROC)
 ```

@@ -45,12 +46,14 @@
 mutate(tl = slide2_dbl(stimulus_type, rt, ~length(which(.x=='target'))/length(which(!is.na(.y))), .partial=T,.size=window_size),
        ll = slide2_dbl(stimulus_type, rt, ~length(which(.x=='lure'))/length(which(!is.na(.y))), .partial=T, .size=window_size),
        sl = slide_dbl(stimulus_type, ~sum(sort(table(.x), decreasing = T)[1:2]) - 1, .partial=T, .size=window_size),
+       ul = slide_dbl(stimulus, ~max(table(.))-1, .partial=T, .size=window_size),
+       vl = slide_dbl(stimulus, ~length(unique(.)), .partial=T, .size=window_size),
+       al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size),
        sl = ifelse(is.na(sl), 0, sl),
        tl = ifelse(is.na(tl), NA, tl),
        ll = ifelse(is.na(ll), NA, ll),
-       ul = slide_dbl(stimulus, ~max(table(.))-1, .partial=T, .size=window_size),
-       vl = slide_dbl(stimulus, ~length(unique(.)), .partial=T, .size=window_size),
-       al = slide2_dbl(correct, rt, ~length(which(.x))/length(which(!is.na(.y))), .partial=T, .size=window_size)) %>%
+       al = ifelse(is.na(al), NA, al)
+) %>%
   nest(.key='local_stats') %>%
   #mutate(stimuli = map(local_stats, ~paste0(.x$stimulus,collapse = ''))) %>%
   mutate(a = map_dbl(local_stats, ~length(which(.x$correct)))) %>%
@@ -60,29 +63,41 @@
   mutate(v = map_dbl(local_stats, ~length(unique(.x$stimulus)))) %>%
   mutate(local_stats = map(local_stats, ~.x %>% select(-trial,-stimulus,-stimulus_type,-choice))) %>%
   ungroup() %>%
-  select(-participant,-block,-condition)
+  select(-participant,-block,-condition) %>%
+  unnest(local_stats)

 #! ===============================================
 #! visualize correlations
-inspect_cor(seqs %>% unnest(local_stats), show_plot = T)
+#DEBUG inspect_cor(seqs, show_plot = T)
 ```

 ```{r models}
 #! ===============================================
 #! prepare data for modeling (remove na, etc)
-#! it also restructures "correct" column to avoid caret errors. C stands for "CORRECT", and I is "INCORRECT"
-data <- seqs %>%
-  unnest(local_stats) %>%
-  mutate(correct=factor(as.numeric(correct),labels=c("C","I"))) %>%
-  filter(!is.na(correct), !is.na(rt))
+seqs <- seqs %>%
+  filter(!is.na(correct), !is.na(rt)) %>%
+  #mutate(correct=factor(as.numeric(correct),labels=c("I","C")))
+  mutate(correct=as.numeric(correct))
+
+#FIXME remove the outcome columns before dummying out the data and imputing missing values
+
+# replace factors with dummy variables
+#seqs.dummy <- predict(dummyVars(~.,data=seqs[,-1]),seqs[,-1])
+
+# impute missing values
+#seqs.imputed <- predict(preProcess(seqs.dummy, "bagImpute"), seqs.dummy)
+#DEBUG View(seqs.imputed)

 #! ===============================================
-#! Prepare train and test partials
-shuff <- sample(nrow(data))
-split <- nrow(data) * 0.8
+#! split into train/test sets (consider class imbalance in "correct")
+set.seed(42)
+train_indexes <- createDataPartition(seqs$correct,
+                                     times = 1,
+                                     p = 0.7,
+                                     list = F)

-train_data <- data[1:split,]
-test_data <- data[(split+1):nrow(data),]
+seqs.train <- seqs[train_indexes,]
+seqs.test <- seqs[-train_indexes,]

 #! ===============================================
 #! training parameters for the PLS models
@@ -92,28 +107,28 @@
 )

 #==================================================#
-# Train PLS model (accuracy)
-model_pls_accuracy <- train(
+# Train PLS model (block-level accuracy)
+pls.fit.accuracy <- train(
   a ~ .-rt-al-correct,
   data = train_data,
   method = "pls",
   tuneLength = 20,
   trControl = plsTrControl,
-  preProc = c("zv","center","scale"))
+  preProc = c("center","scale"))

 # Check CV profile
-plot(model_pls_accuracy)
+plot(pls.fit.accuracy)

 # PLS variable importance
-plot(varImp(model_pls_accuracy), main="Accuracy - Variable Importance")
+plot(varImp(pls.fit.accuracy), main="Accuracy - Variable Importance")

 #==================================================#
 # Train PLS model (rt)
-train_data_x <- data %>% select(-rt,-a,-correct)
-train_data_y <- (data %>% select(rt))$rt
+train_data_x <- train_data %>% select(-rt,-a,-correct) # predictors only (drop the outcomes rt, a, and correct)
+train_data_y <- (train_data %>% select(rt))$rt         # outcome: rt only

-model_pls_rt <- train(
+pls.fit.rt <- train(
   train_data_x,
   train_data_y,
   method = "pls",
@@ -122,47 +137,47 @@
   preProc = c("center","scale"))

 # Check CV profile
-plot(model_pls_rt)
+plot(pls.fit.rt)

 # PLS variable importance
-plot(varImp(model_pls_rt), main="RT - Variable Importance")
+plot(varImp(pls.fit.rt), main="RT - Variable Importance")

-predicted_rt_data <- predict(model_pls_rt, test_data)
+pls.predicted.rt <- predict(pls.fit.rt, test_data)

 #FIXME
-confusionMatrix(predicted_rt_data,test_data$rt)
-colAUC(predicted_rt_data,test_data$rt, plotROC=T)
+confusionMatrix(pls.predicted.rt,test_data$rt)
+colAUC(pls.predicted.rt,test_data$rt, plotROC=T)

 #==================================================#
 # training control params for "correct" column
-trControl <- trainControl(
+glmTrControl <- trainControl(
   method = "cv",
   number = 5,
   classProbs = T,
   summaryFunction = twoClassSummary
 )

-model_glm_correct <- train(
+glm.fit.correct <- train(
   correct ~ .-rt-a-al,
   data = train_data,
   method = "glm",
   family = "binomial",
-  trControl = trControl
+  trControl = glmTrControl
 )

-model_glm_correct
-varImp(model_glm_correct)
+glm.fit.correct
+varImp(glm.fit.correct)

-predicted_correct_data <- predict(model_glm_correct, test_data, type="prob")
+glm.predicted.correct <- predict(glm.fit.correct, test_data, type="prob")

 #FIXME
-confusionMatrix(predicted_correct_data, test_data$correct)
-colAUC(predicted_correct_data, test_data$correct, plotROC=T)
+confusionMatrix(glm.predicted.correct, test_data$correct)
+colAUC(glm.predicted.correct, test_data$correct, plotROC=T)

 #==================================================#
 ## OLD MODEL (only global features)
-model_glm_correct_old <- train(
+glm.fit.correct.old <- train(
   correct ~ n+t+v,
   data = train_data,
   method = "glm",
@@ -170,11 +185,12 @@
   trControl = trControl
 )

-model_glm_correct_old
+glm.fit.correct.old

-predicted_old_correct_data <- predict(model_glm_correct_old, test_data, type="prob")
+glm.fit.predicted.old <- predict(glm.fit.correct.old, test_data, type="prob")

 #FIXME
-confusionMatrix(test_data$correct, predicted_old_correct_data)
-colAUC(predicted_old_correct_data,test_data$correct, plotROC=T)
+confusionMatrix(glm.fit.predicted.old, test_data$correct)
+colAUC(glm.fit.predicted.old, test_data$correct, plotROC=T)
+#TODO use pROC to visualize the AUC: roc(test_data$correct, glm.fit.predicted.old)
 ```
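For the #FIXME in the commented-out preprocessing block ("remove the outcome columns before dummying out the data and imputing"), one possible approach is sketched below. It assumes `seqs` at that point still carries the outcome columns `correct`, `rt`, and `a` alongside the predictors; `seqs.model` is a hypothetical name for the recombined frame, and whether `al` should also be held out of the imputation is left open.

```r
# Sketch only: exclude outcomes, dummy-code and impute the predictors, then
# reattach the untouched outcomes. Column names follow the diff; adjust as needed.
library(caret)
library(dplyr)

outcomes   <- seqs %>% select(correct, rt, a)
predictors <- seqs %>% select(-correct, -rt, -a)

# factors -> dummy variables (predictors only)
seqs.dummy <- predict(dummyVars(~ ., data = predictors), newdata = predictors)

# impute missing predictor values with bagged trees
seqs.imputed <- predict(preProcess(seqs.dummy, method = "bagImpute"), seqs.dummy)

# hypothetical recombined data frame for modeling
seqs.model <- bind_cols(as.data.frame(seqs.imputed), outcomes)
```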
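The #FIXME after the RT model flags a real issue: `confusionMatrix()` and `colAUC()` expect a categorical outcome, so they cannot score continuous `rt` predictions. A minimal regression-style evaluation is sketched below, assuming the held-out set created earlier is used (the diff names it `seqs.test` while the later calls still reference `test_data`, so the name may need reconciling).

```r
# Sketch only: score the continuous RT predictions with regression metrics
# rather than a confusion matrix / ROC.
library(caret)

pls.predicted.rt <- predict(pls.fit.rt, seqs.test)

# RMSE, R-squared, and MAE on the held-out data
postResample(pred = pls.predicted.rt, obs = seqs.test$rt)

# quick visual check: observed vs. predicted RT
plot(seqs.test$rt, pls.predicted.rt,
     xlab = "Observed RT", ylab = "Predicted RT")
abline(0, 1, lty = 2)  # identity line for reference
```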
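For the closing #TODO, a sketch of how `pROC` (now loaded at the top of the diff) could visualize the AUC of the old model. It assumes `correct` is kept as a two-level factor, e.g. via the commented-out `mutate(... labels=c("I","C"))`, since `predict(..., type = "prob")` only applies to a classification fit and returns one probability column per class; the `"C"` column name and the `levels` argument depend entirely on that encoding.

```r
# Sketch only: ROC curve and AUC for the old (global-features) model.
library(pROC)

glm.fit.predicted.old <- predict(glm.fit.correct.old, seqs.test, type = "prob")

roc.old <- roc(response  = seqs.test$correct,
               predictor = glm.fit.predicted.old[["C"]],  # P(correct), given labels c("I","C")
               levels    = c("I", "C"))                   # controls first, cases second

plot(roc.old, print.auc = TRUE)  # ROC curve with the AUC annotated
auc(roc.old)
```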