---
title: "Statistical Properties of the N-Back Sequences"
output:
  html_notebook: default
  pdf_document: default
editor_options: 
  chunk_output_type: inline
---

# Problems

Statistical properties of n-back sequences bias behavior. Under a specified structure, these biases allow multiple cognitive strategies, producing heterogeneous behavior in a "gold standard" cognitive task.

# Gaps

- Unclear how to parameterize interesting variations for sequence generation
- How do we model these multiple strategies? (This requires identifying which sequence variations matter.)
    - Local vs. global properties: which matters most? (See the sketch after this list.)
    - Local: lumpiness, short sequence patterns -> could be exploited by “reactive”/automatic strategies
    - Global: no lures, large vocabulary -> a repeated pattern implies a target
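
To make the local vs. global distinction concrete, here is a minimal base-R sketch on a toy sequence (the sequence, window size, and variable names are illustrative, not taken from the experiment): a local property is computed inside a sliding window, a global property over the whole sequence.

```{r local-vs-global-sketch, eval=FALSE}
stims <- c("A","B","A","A","C","B","A","D")  # toy stimulus sequence
w     <- 4                                   # illustrative window size

# Global properties: summaries of the whole sequence
vocabulary_size <- length(unique(stims))
n_2back_repeats <- sum(stims[-(1:2)] == head(stims, -2))  # lag-2 matches

# Local property: lumpiness = count of the most frequent item inside each window
lumpiness <- sapply(w:length(stims), function(i) max(table(stims[(i - w + 1):i])))

vocabulary_size; n_2back_repeats; lumpiness
```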


## Formulating N-Back Sequence Generation as a CSP Instance

$P=\langle V,D,C,W\rangle$

$V=\{x_N,x_{T},x_{T,local},x_L,x_{L,local},x_V,x_U,x_S,x_{S,local},x_G\}$

$D=\{\}$


Constraints:

$$
\begin{aligned}
x_n &= N, & W_n &= 1 - |10 \times \mathrm{dnorm}(x_n - N, sd=4)| \\
x_t &= T \times trials, & W_t &= 1 - |10 \times \mathrm{dnorm}(T \times trials - x_t, sd=4)| \\
x_{tl} &= \frac{T \times w}{trials}, & W_{tl} &= 1 - |10 \times \mathrm{dnorm}(x_{tl} - \frac{T}{trials} \times w, sd=4)| \\
x_{l} &= L \times trials \\
x_{ll} &= L \times w \\
x_{v} &= |V| \\
x_{ul} &= w \\
x_{s} &= \frac{trials}{|V|} \\
x_{sl} &= \max\left(1, \frac{w}{|V|}\right) \\
x_{g} &= \frac{trials}{w} \\
x_{vl} &= \min(|V|, w)
\end{aligned}
$$
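
As a sanity check on the $W$ terms above, here is a minimal R sketch of the Gaussian soft-constraint term (the helper name `constraint_weight` and the example numbers are mine; the target value and `sd = 4` follow the formulas). The value is close to 0 when the observed quantity equals its target and approaches 1 as the deviation grows, so it behaves as a cost to be minimized.

```{r constraint-weight-sketch, eval=FALSE}
# Soft-constraint term from the formulas above (illustrative helper).
constraint_weight <- function(observed, target, sd = 4) {
  1 - abs(10 * dnorm(observed - target, sd = sd))
}

constraint_weight(8, 8)    # ~0.003 (at the target)
constraint_weight(16, 8)   # ~0.865 (far from the target)
```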

```{r libraries, message=FALSE, include=FALSE, paged.print=FALSE}
library(ggplot2)
library(tidyverse)
library(stringi)
library(plsRglm)
library(plsdof)
library(caret)
```

```{r params}
load('./data/CL2015.RData')

window_size <- 8
```



```{r history}

with_lures <- function(stimulus, stimulus_type, n) {
  # relabel as "lure" any trial whose stimulus matches the item n-1 or n+1 positions back
  sapply(1:length(stimulus), function(i) {
    lures <- c(as.character(stimulus[i-n-1]), as.character(stimulus[i-n+1]))
    are_valid_trials <- i>n && all(!is.na(c(lures,stimulus[i])))
    ifelse(are_valid_trials && stimulus[i] %in% lures,
           "lure", 
           as.character(stimulus_type[i]))
  })
}

with_history <- function(stimuli, length=16, fixed=F) {
  seq <- paste(stimuli, collapse = '')
  
  sapply(1:length(stimuli), function(i) {
    stri_reverse(str_sub(seq, max(1,i-length+1), i))
  })
  #ifelse(fixed, h[str_length(h)==size], h)
}

# $x_{s,local}$
with_skewness <- function(history) {
  sapply(history, function(h) {
    freqs <- table(unlist(str_split(h,"")))
    sum(sort(freqs, decreasing = T)[1:2]) - 1
  })
}

# $x_{u,local}$
with_lumpiness <- function(history) {
  sapply(history, function(h) {
    freqs <- table(unlist(str_split(h,"")))
    max(freqs) - 1
  })
}


with_targets_ratio <- function(stimulus_type, length=16) {
  sapply(1:length(stimulus_type), function(i) {
    # proportion of targets within the last `length` trials (same window as with_history)
    trials <- stimulus_type[max(1, i-length+1):i]
    length(trials[trials=="target"]) / length(trials)
  })
}

with_lures_ratio <- function(stimulus_type, length=16) {
  sapply(1:length(stimulus_type), function(i) {
    # proportion of lures within the last `length` trials
    trials <- stimulus_type[max(1, i-length+1):i]
    length(trials[trials=="lure"]) / length(trials)
  })
}

#TODO change to a list-column workflow with broom for model fitting and evaluating the fits;
# the data are already nested as a list column inside a tibble, so put all new columns
# into a single list column instead of adding a separate column for each.
NB2 <- NB %>%
  group_by(participant, condition, block) %>%
  nest() %>% unnest(data) %>%
  mutate(n = ifelse(condition=='2-back',2,3)) %>%
  mutate(stimulus_type = with_lures(stimulus, stimulus_type, n)) %>%
  mutate(history = with_history(stimulus, window_size)) %>%
  mutate(x_sl = with_skewness(history)) %>%
  mutate(x_ul = with_lumpiness(history)) %>%
  mutate(x_t = with_targets_ratio(stimulus_type, window_size)) %>%
  mutate(x_l = with_lures_ratio(stimulus_type, window_size)) %>%
  ungroup()
  
pca <- prcomp(~x_sl+x_ul+x_t+x_l, NB2, center = TRUE,scale. = TRUE, na.action=na.exclude)
NB2 <- NB2 %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])

# caret
library(caret)
# Compile cross-validation settings


any(is.na(NB2))
NB2 <- na.omit(NB2)

# set.seed(100)
# trainingfold <- createMultiFolds(NB2@correct, k = 5, times = 10)
# 
# # PLS
# mod1 <- train(correct ~ ., data = NB2[,c("correct","x_sl","x_ul","x_t","x_l")],
#  method = "pls",
#  metric = "Accuracy",
#  tuneLength = 20,
#  trControl = trainControl("repeatedcv", index = trainingfold, selectionFunction = "oneSE"),
#  preProc = c("zv","center","scale"))
#  
# # Check CV
# plot(mod1)


plsResult_rt      <- plsR(rt ~ ., data=NB2[,c("rt","x_sl","x_ul","x_t","x_l")], nt=3)
plsResult_correct <- plsR(correct ~ ., data=NB2[,c("correct","x_sl","x_ul","x_t","x_l")], nt=3)


plsResult_rt
plsResult_correct
```

<!--chapter:end:ccn2019.rev2.Rmd-->

---
title: "Evaluating N-Back Sequences"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(ggplot2)
library(stringi)
library(GA)
library(dbscan)
library(inspectdf)

load('./data/CL2015.RData')
```

### Variables
- $T$ number of targets
- $L$ number of lures
- $S$ Skewness score
- $U$ Uniformity (inverse of repetition)
- $RT_{mean}$
- $Accuracy_{mean}$
- $dprime$
- $criterion$
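
The last two are standard signal-detection measures. A hedged R sketch of how they could be computed from hit and false-alarm rates (the helper name `sdt_measures` and the example rates are illustrative, not values from the data below):

```{r sdt-sketch, eval=FALSE}
# d' and criterion from hit rate and false-alarm rate,
# using the standard equal-variance Gaussian signal-detection formulas.
sdt_measures <- function(hit_rate, fa_rate) {
  dprime    <- qnorm(hit_rate) - qnorm(fa_rate)
  criterion <- -0.5 * (qnorm(hit_rate) + qnorm(fa_rate))
  c(dprime = dprime, criterion = criterion)
}

sdt_measures(hit_rate = 0.8, fa_rate = 0.2)   # illustrative rates
```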

## Constraints

- fixed number of targets
- fixed number of lures (a.k.a, foils)
- uniform distribution of choices
- controlled local lumpiness


Each constraint is expressed as an inverted-Gaussian cost function (a flipped bell curve, minimized when the constraint is satisfied), as in the `dnorm`-based costs below.

```{r, eval=F}
targets_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
lures_cost <- function(x) 1.0 - 10*dnorm(x,mean=0,sd=4)
skewness_cost <- function(x, choices) {
  uniform_ratio <- length(x) / length(choices)
  deviation_from_uniform <- setNames(vector('numeric', length(choices)), choices)
  for (c in choices) {
    deviation_from_uniform[c] = abs(length(x[x==c]) - uniform_ratio)
  }
  #TODO convert to gaussian loss
  max(deviation_from_uniform)
}

lumpiness_cost <- function(x, choices) {
  # R translation of the commented-out Python sketch: the sequence counts as
  # "lumpy" when the most frequent half of the choices covers > 2/3 of trials.
  trials <- length(x)
  freqs <- sapply(choices, function(c) sum(x == c))
  n_top <- max(1, floor(length(choices) / 2))
  ralph_skewed <- sum(sort(freqs, decreasing = TRUE)[1:n_top]) > (trials * 2 / 3)
  #TODO convert to gaussian loss, like the other costs
  as.numeric(ralph_skewed)
}

#merged_cost <- function(x) targets_fitness(x) + lures_fitness(x) + skewness_fitness(x)
#GA <- ga(type = "real-valued", fitness = merged_cost, lower = -10, upper = 10)
#plot(GA)
```
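
A hedged sketch of how the commented-out GA call above could be wired up for discrete sequences (the vocabulary, sequence length, and the choice to encode stimuli as rounded real values are illustrative assumptions; it assumes the cost functions from the previous chunk have been evaluated). GA maximizes fitness, so the summed costs are negated.

```{r ga-sketch, eval=FALSE}
choices  <- LETTERS[1:8]   # hypothetical stimulus vocabulary
n_trials <- 24             # hypothetical sequence length

# Decode a real-valued chromosome into a stimulus sequence, then score it.
sequence_fitness <- function(codes) {
  seq <- choices[pmin(length(choices), pmax(1, round(codes)))]
  -(skewness_cost(seq, choices) + lumpiness_cost(seq, choices))
}

GA <- ga(type = "real-valued", fitness = sequence_fitness,
         lower = rep(1, n_trials), upper = rep(length(choices), n_trials),
         popSize = 50, maxiter = 100)
plot(GA)
```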


```{r}

with_lures <- function(condition, stim, stim_type, history = NA) {
  sapply(1:length(stim), 
    function(i) {
      switch(as.character(condition[i]),
             "2-back" = {
               ifelse(
                 stim[i]==stri_sub(history[i],-2,-2) ||  stim[i]==stri_sub(history[i],-4,-4), 
                 'lure', 
                 as.character(stim_type[i])
               )},
             "3-back" = {
               ifelse(
                 stim[i]==stri_sub(history[i],-3,-3) ||  stim[i]==stri_sub(history[i],-5,-5), 
                 'lure', 
                 as.character(stim_type[i])
               )}
      )

    })
}

# NOTE: despite the name, this returns the *count* of targets in each
# history window (a ratio would divide by the window length).
with_targets_ratio <- function(stimulus_type, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus_type[(i-stri_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    length(trials[trials=="target"])
  })
}

# NOTE: likewise a count of lures in each history window, not a ratio.
with_lures_ratio <- function(stimulus_type, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus_type[(i-stri_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    length(trials[trials=="lure"])
  })
}

with_lumpiness_score <- function(stimulus, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus[(i-str_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    max(table(trials)) - 1
  })
}

with_lag <- function(stimulus, history) {
  # TODO: find the lag since the last occurrence of the current stimulus
}

with_skewness_score <- function(stimulus, history) {
  sapply(1:length(history), function(i) {
    trials <- stimulus[(i-str_length(history[i])):i]
    trials <- unlist(trials, use.names=FALSE)
    sum(sort(table(trials), decreasing = T)[1:2]) - 1
  })
}

with_history <- function(stims, size=16) {
  res <- c('')
  for (i in 2:length(stims)) {
    res[i] <- stri_sub(paste(res[i-1], stims[i], sep=''),from=-size,length=size)
  }
  #res <- ifelse(stri_length(res)==size, res, NA)
  res
}

normalize_scores <- function(targets_ratio, lures_ratio, skewness, lumpiness) {
  #TODO
  sapply(1:length(targets_ratio), function(i) 0)
}

window_size <- 8

NB_modified <- NB %>%
  group_by(participant, condition, block) %>%
  mutate(history = with_history(stimulus, size=window_size)) %>%
  #mutate(stimulus_type = map_chr(.x=stimulus, stim_type=stimulus_type, history=history,.f=with_lures))
  mutate(stimulus_type_2 = with_lures(condition, stimulus, stimulus_type, history)) %>%
  mutate(targets = with_targets_ratio(stimulus_type_2, history)) %>%
  mutate(lures = with_lures_ratio(stimulus_type_2, history)) %>%
  mutate(skewness = with_skewness_score(stimulus, history)) %>%
  mutate(lumpiness = with_lumpiness_score(stimulus, history)) %>%
  filter(stri_length(history)==window_size) %>%
  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
    #normalize_scores(targets_ratio, lures_ratio, skewness, lumpiness) %>%
  ungroup()

pca <- prcomp(NB_modified[,c('targets','lures','skewness','lumpiness')], center = TRUE,scale. = TRUE)

NB_modified <- NB_modified %>% mutate(pc1=pca$x[,'PC1'], pc2=pca$x[,'PC2'])


## participant-level averaged NB; a single row represents one observation for a single subject
## in a single condition
NB_avg <- NB_modified %>%
  group_by(participant, condition) %>%
  mutate(correct = ifelse(stimulus_type=='burn-in',NA,correct)) %>%
  summarise(
    targets=sum(targets), 
    lures=sum(lures), 
    skewness=sum(skewness), 
    lumpiness=sum(lumpiness), 
    rt = mean(rt, na.rm=T), 
    correct=sum(correct,na.rm=T)/90) %>%
  ungroup()

# print
# NB_modified %>%
#   filter(participant=='P1') %>%
#   View()
#   
  

# NOTE: the shorthand columns t, s, u, l, d do not exist in NB_modified;
# the interaction model below uses the computed columns instead.
fit <- lm(correct ~ targets * lures * skewness * lumpiness, NB_modified)

```


```{r}

# DBSCAN Clustering (RT+ACCURACY against skewness)
NB_avg <- NB_avg %>%
  mutate(cluster = dbscan(cbind(correct,rt), eps = 0.3, minPts = 3)$cluster)

NB_avg %>%
  ggplot(aes(skewness, correct, color=factor(cluster))) +
  ggtitle(" clusters (window = 16 trials)", "NOTE: each point is a single participant") +
  geom_point(alpha=0.3) +
  #geom_smooth(method='lm', se = F) +
  facet_wrap(~condition)

```


```{r}
## single-subject figures
NB_modified %>%
  ggplot(aes(t,s,color=correct)) +
  geom_jitter() +
  geom_point() +
  stat_summary(fun.y="mean")

NB_modified %>%
  inspect_cor(show_plot = T)

NB_avg %>%
  inspect_cor(show_plot = T)

NB_modified %>%
  ggplot(aes(rt,correct,color=u)) +
  geom_jitter() +
  geom_point() +
  stat_summary(fun.y="mean")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(jitter(s),jitter(t),color=correct)) +
    geom_jitter() +
    geom_point(alpha=0.1)

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(jitter(s),jitter(u),color=correct)) +
    geom_jitter() +
    geom_point() +
    facet_wrap(~condition)

# rt/accuracy and lures
NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(jitter(l),rt,color=correct,alpha=0.01)) +
    geom_jitter() +
    geom_point(shape=16) +
    geom_smooth(method="lm",se = F) + 
    facet_wrap(~condition, scales="free")


NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(pc1,pc2,color=correct)) +
    geom_point() +
    geom_smooth(method="lm",se = F) + 
    facet_wrap(~condition, scales="free")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(pc1,rt,color=correct)) +
    geom_point(alpha=0.3) +
    geom_smooth(method="lm",se = F) + 
    facet_wrap(~condition, scales="free")

NB_modified %>%
  filter(!is.na(correct)) %>%
  ggplot(aes(pc1,correct,color=correct)) +
    geom_point() +
    geom_smooth(method="lm",se = F) + 
    facet_wrap(~condition, scales="free")


```

## TODO
  - data %>% mutate(constraint1=fitness1(history), constraint2=fitness2(history), constraint3=fitness3(history))
  - kmeans(NB) (see the sketch below)
  - ggplot(kmeans_clusters$accuracy)
  - ggplot(kmeans_clusters$rt)
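
A hedged sketch of the k-means item above, clustering the participant-level averages on accuracy and RT (the choice of k = 3, the scaled features, and the plotting variables are illustrative assumptions, not the original analysis):

```{r kmeans-sketch, eval=FALSE}
set.seed(1)
nb_feats <- na.omit(NB_avg[, c("correct", "rt")])
kmeans_clusters <- kmeans(scale(nb_feats), centers = 3)

NB_avg %>%
  drop_na(correct, rt) %>%
  mutate(kcluster = factor(kmeans_clusters$cluster)) %>%
  ggplot(aes(rt, correct, color = kcluster)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~condition)
```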





<!--chapter:end:ccn2019.Rmd-->

---
title: "PLS Training"
output: html_notebook
---

PLS regression of response time on the remaining trial-level variables:


```{r}
#detach("package:MASS","plsdof") # to avoid conflict with dplyr::select
library(tidyverse)
library(pls)

## 1. load data (the commented line below shows an unrelated sample dataset kept for reference)
#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")

rm(NB)
load("./data/CL2015.RData")
data <- NB
str(data)

## 2. clean data (encode condition as n and drop columns not used as predictors)
data <- data %>% 
  mutate(n=ifelse(condition=='2-back', 2, 3)) %>%
  select(-condition,
         -stimulus,
         -block,
         -trial)
# %>%
#   rename(
#          ev.participant=participant,
#          ev.n=n,
#          ev.block=block,
#          ev.stimulus_type=stimulus_type,
#          rv.choice=choice,
#          rv.rt=rt,
#          rv.correct=correct
#          )

## 3. use cross-validation to find the optimal number of dimensions
pls.model = plsr(rt ~ ., data = data, validation = "CV")

## 3.1. find the model with lowest cv error
cv <- RMSEP(pls.model)
best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1

## 4. rebuild the model
pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)

## 5. Sort, and visualize top coefficients
coefs <- coef(pls.model)

barplot(sort(coefs[,1,1], decreasing = T)[1:4])
```


```{r simulate}
X <- matrix(rnorm(1100), 100, 11)
Y <- matrix(rnorm(400), 100, 4)

pls.model <- plsr(Y ~ X, validation = "CV")

cv <- RMSEP(pls.model)
best_dims <- which.min(cv$val[estimate = "adjCV", , ]) - 1
pls.model <- plsr(Y ~ X, ncomp = best_dims)
coefs <- sort(coef(pls.model)[,1,1], decreasing = T)

barplot(coefs)

```


```{r cca-simulate}
library(corrplot)  # corrplot() is used below but not loaded elsewhere in this notebook

X <- matrix(rnorm(1100), 100, 11)
Y <- matrix(rnorm(400), 100, 4)

M <- cor(cbind(X,Y))
corrplot(M, method="ellipse", order="hclust", addrect=2, addCoef.col="black")
cc <- cancor(X, Y)

#NB: cc <- cancor(cbind(rt,correct, accuracy) ~ xt + xl + xtl, data = data)

```


```{r plsrglm}
rm(list = ls())
library(plsRglm)

data(Cornell)
df <- Cornell
x <- subset(df, select = -c(Y))
y <- df$Y
## K is the number of folds in CV, and nt is the maximum number of components.
## cv.modpls is required by cvtable() below.
cv.modpls <- cv.plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", K = 8)

modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-logistic", sparse = TRUE, sparseStop = TRUE)
res.cv.modpls <- cvtable(summary(cv.modpls))

res6<-plsR(Y~.,data=Cornell, nt=6, typeVC="missing", pvals.expli=TRUE)

```

<!--chapter:end:pls_playground.Rmd-->