---
title: "PLS Training"
output: html_notebook
editor_options: 
  chunk_output_type: console
---

PLS: predict reaction time (`rt`) from the remaining N-back variables with partial least squares regression, choosing the number of components by cross-validation.


```{r}
#detach("package:MASS"); detach("package:plsdof") # avoid masking dplyr::select
library(tidyverse)
library(pls)

## 1. load sample data
#data <- read.csv("http://wiki.q-researchsoftware.com/images/d/db/Stacked_colas.csv")

rm(NB)
load("./data/CL2015.RData")
data <- NB
str(data)

## 2. clean data: recode condition as numeric n, drop unused columns
data <- data %>% 
  mutate(n = ifelse(condition == '2-back', 2, 3)) %>%
  select(-condition,
         -stimulus,
         -block,
         -trial)
# %>%
#   rename(
#          ev.participant=participant,
#          ev.n=n,
#          ev.block=block,
#          ev.stimulus_type=stimulus_type,
#          rv.choice=choice,
#          rv.rt=rt,
#          rv.correct=correct
#          )

## 3. use cross-validation to find the optimal number of components
pls.model <- plsr(rt ~ ., data = data, validation = "CV")

## 3.1. find the model with the lowest adjusted CV error
## (subtract 1 because the first RMSEP entry is the zero-component intercept model)
best_dims <- which.min(RMSEP(pls.model)$val["adjCV", , ]) - 1

## 4. rebuild the model
pls.model <- plsr(rt ~ ., data = data, ncomp = best_dims)

## 5. sort and visualize the top coefficients
coefs <- coef(pls.model)

barplot(sort(coefs[, 1, 1], decreasing = TRUE)[1:4])
```
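
A quick sanity check on `best_dims` is to look at the cross-validation curve and the variance explained per component. This is a minimal sketch using the standard `pls` helpers `validationplot` and `explvar`; it refits with `validation = "CV"` because the chunk above overwrites `pls.model` with a fit that keeps no validation results.

```{r}
# Refit with CV retained so the validation curve can be plotted
# (the final pls.model above was refit without validation data).
pls.cv <- plsr(rt ~ ., data = data, validation = "CV")

# Adjusted-CV RMSEP across components; its minimum should agree with best_dims.
validationplot(pls.cv, val.type = "RMSEP", estimate = "adjCV")

# Percentage of predictor variance explained by each component.
explvar(pls.cv)
```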


```{r simulate}
# Pure-noise example: 11 predictors, 4 responses.
X <- matrix(rnorm(1100), 100, 11)
Y <- matrix(rnorm(400), 100, 4)

pls.model <- plsr(Y ~ X, validation = "CV")

cv <- RMSEP(pls.model)
# With several responses, val["adjCV", , ] is a response-by-component matrix,
# so average over responses before locating the minimum (minus 1 for the intercept entry).
best_dims <- which.min(colMeans(cv$val["adjCV", , ])) - 1
best_dims <- max(best_dims, 1)  # with pure noise the minimum can fall at 0 components

pls.model <- plsr(Y ~ X, ncomp = best_dims)
coefs <- sort(coef(pls.model)[, 1, 1], decreasing = TRUE)

barplot(coefs)

```
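
To see what the refitted noise model actually predicts, the `pls` prediction plot gives an observed-vs-fitted panel per response; a minimal sketch (with pure noise the points should show essentially no relationship):

```{r}
# Observed vs. fitted values for each simulated response at the chosen
# number of components; expect no structure for random data.
predplot(pls.model, ncomp = best_dims, line = TRUE)
```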


```{r cca-simulate}
library(corrplot)

X <- matrix(rnorm(1100), 100, 11)
Y <- matrix(rnorm(400), 100, 4)

# joint correlation structure of predictors and responses
M <- cor(cbind(X, Y))
corrplot(M, method = "ellipse", order = "hclust", addrect = 2, addCoef.col = "black")

# canonical correlation analysis (stats::cancor takes two matrices, not a formula)
cc <- cancor(X, Y)

#NB: for the real data something like candisc::cancor(cbind(rt, correct, accuracy) ~ xt + xl + xtl, data = data)
#    would be needed, since stats::cancor has no formula interface.

```
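
The list returned by `stats::cancor` holds the canonical correlations and the coefficient matrices; a minimal look at them:

```{r}
# Canonical correlations between the X and Y canonical variates.
cc$cor

# Raw coefficients mapping X and Y onto their canonical variates.
cc$xcoef
cc$ycoef
```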


```{r plsRglm}
rm(list = ls())
library(plsRglm)

data(Cornell)
df <- Cornell
x <- subset(df, select = -c(Y))
y <- df$Y
## K is the number of folds in CV, and nt is the maximum number of components.
## Cornell's Y is continuous, so the gaussian family is used here
## ("pls-glm-logistic" would require a binary response).
cv.modpls <- cv.plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-gaussian", K = 8)

modpls <- plsRglm(dataY = y, dataX = x, nt = 10, modele = "pls-glm-gaussian", sparse = TRUE, sparseStop = TRUE)
res.cv.modpls <- cvtable(summary(cv.modpls))

res6 <- plsR(Y ~ ., data = Cornell, nt = 6, typeVC = "missing", pvals.expli = TRUE)

```
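
A hedged sketch of how these fits might be inspected: `res.cv.modpls` tabulates how often each number of components is retained across the CV splits, and `InfCrit` is the information-criteria table attached to `plsR` objects (component name per the plsRglm documentation; adjust if your installed version differs).

```{r}
# How often each number of components was selected in cross-validation.
res.cv.modpls

# AIC/BIC/Q2-style criteria for the plsR fit with up to 6 components.
res6$InfCrit
```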