#%% [markdown]

# The following cell prepares the environment, installs the dependencies, and loads the tidy dataset as `df`.

#%%
#!pip install pymc3 pandas matplotlib numpy seaborn arviz

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import arviz as az
import pymc3 as pm
import seaborn as sns

from IPython.display import display
from scipy.stats import zscore

sns.set_style('ticks')
az.style.use("arviz-darkgrid")

#csv_path = '/content/drive/My Drive/Colab Notebooks/data/efo_pubmed_hits.20200114_preproc.csv'
csv_path = '/Users/morteza/workspace/efo_kickoff/datasets/efo_pubmed_hits.20200114_tidy.csv'

df = pd.read_csv(csv_path)

no_hits = df[(df.construct_hits==0) | (df.task_hits==0) | (df.hits==0)]
tasks_with_no_hits = no_hits[no_hits.task_hits==0].task.unique()
constructs_with_no_hits = no_hits[no_hits.construct_hits==0].construct.unique()

print(f"The following {len(constructs_with_no_hits)} constructs have zero hits: {', '.join(constructs_with_no_hits)}")
print(f"The following {len(tasks_with_no_hits)} tasks have zero hits: {', '.join(tasks_with_no_hits)}")


# convert context, construct, and task to categorical columns
df['context'] = df['context'].astype('category')
df['construct'] = df['construct'].astype('category')
df['task'] = df['task'].astype('category')

#DEBUG keep only one observation per task! Is this necessary?
#df = df.sort_values('task', ascending=False).drop_duplicates(['task'])

# keep only observations with non-zero task and construct hits
df = df[~((df.task_hits==0) | (df.construct_hits==0))]

context_idx = df.context.cat.codes.values
n_context = df.context.cat.categories.size

construct_idx = df.construct.cat.codes.values
n_constructs = df.construct.cat.categories.size

task_idx = df.task.cat.codes.values
n_tasks = df.task.cat.categories.size

n_obs = len(df)

df['hits_zscore'] = zscore(df.hits)
df['task_hits_zscore'] = zscore(df.task_hits)
df['construct_hits_zscore'] = zscore(df.construct_hits)
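
#%%
# Quick sanity check (optional addition): show which integer code maps to each
# context and construct level, so the index arrays used by the models below are
# easy to interpret.
print(dict(enumerate(df.context.cat.categories)))
print(dict(enumerate(df.construct.cat.categories)))
print(f"{n_obs} observations, {n_context} contexts, {n_constructs} constructs, {n_tasks} tasks")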

#%%
# Model 1 tries to predict/explain the (z-scored) hits using a single normal likelihood with a weakly informative prior on its mean


with pm.Model() as model1:
  # prior on mean value
  mu = pm.Normal('mu', sigma=100)

  # likelihood
  hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits_zscore)

  display(pm.model_to_graphviz(model1))
  display(model1.check_test_point())

  model1_trace = pm.sample(1000, model=model1)

# plot traceplot
az.plot_trace(model1_trace)
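
#%%
# Optional sketch (not part of the original analysis): draw posterior predictive
# samples from model 1 to see how well a single shared mean reproduces the
# z-scored hit counts.
model1_ppc = pm.sample_posterior_predictive(model1_trace, model=model1)
print('posterior predictive mean:', model1_ppc['hits'].mean())
print('observed mean:', df.hits_zscore.mean())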

#%% Model 2: model task-construct hits with context-level priors, then compare EF to nonEF.
# -----------------------------------------------

sd_contexts = df.groupby('context').std().hits_zscore
hits_grand_mean = df.hits_zscore.mean() # effectively zero by construction, since hits_zscore is z-scored

with pm.Model() as model2:

  # prior on `nu`, a parameter of final StudentT likelihood
  nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1
  nu = pm.Deterministic('nu', nu_minus_one + 1)

  # context parameters
  context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)
  context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context)
  
  # difference between the average hits in EF and nonEF
  ef_noef_diff = pm.Deterministic('EF - nonEF', context_mu[0] - context_mu[1])

  # likelihood
  hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits_zscore)

  display(pm.model_to_graphviz(model2))
  display(model2.check_test_point())
  model2_trace = pm.sample(model=model2)

  # plots
  pm.traceplot(model2_trace, var_names=['context_mu','context_sigma','nu'])
  pm.plot_posterior(model2_trace,
    var_names=['context_mu','context_sigma','EF - nonEF','nu'],
    point_estimate='mode',
    credible_interval=0.95,
    ref_val=0)
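
#%%
# Optional sketch (an addition, not in the original analysis): the 'EF - nonEF'
# deterministic makes it easy to read off the posterior probability that EF
# constructs attract more hits than nonEF constructs.
ef_noef_samples = model2_trace['EF - nonEF']
print(f"P(EF > nonEF | data) = {(ef_noef_samples > 0).mean():.3f}")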

#%% Model 3: predict `task-construct` hits, only in EF context, and then compare `construct` model parameters 
# -----------------------------------------------
ef_df = df[df.context=='EF'].copy()

# drop category levels that do not occur in the EF subset, so that category
# codes line up with the model's parameter vectors
ef_df['construct'] = ef_df.construct.cat.remove_unused_categories()
ef_df['task'] = ef_df.task.cat.remove_unused_categories()

ef_construct_mean = ef_df.groupby('construct').mean().construct_hits

ef_construct_idx = ef_df.construct.cat.codes.values
n_ef_constructs = ef_df.construct.cat.categories.size

ef_task_idx = ef_df.task.cat.codes.values
n_ef_tasks = ef_df.task.cat.categories.size

with pm.Model() as model3:

  construct_mu = pm.Normal('construct_mu', mu=ef_construct_mean, sigma=100, shape=n_ef_constructs)
  constructs_sigma = pm.Uniform('construct_sigma', 1/100, 100, shape=n_ef_constructs)

  nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1
  nu = pm.Deterministic('nu', nu_minus_one+1)

  hits = pm.StudentT('hits', nu=nu, mu=construct_mu[ef_construct_idx], sigma=constructs_sigma[ef_construct_idx], observed=ef_df.hits)

  display(pm.model_to_graphviz(model3))
  display(model3.check_test_point())
  
  model3_trace = pm.sample(model=model3)

  display(pm.summary(model3_trace))
  pm.forestplot(model3_trace, combined=True, var_names=['construct_mu'], credible_interval=0.95)
  
  pm.traceplot(model3_trace)
  pm.plot_posterior(model3_trace,
    point_estimate='mode',
    credible_interval=0.95,
    ref_val=0)
az.plot_pair(model3_trace,
            var_names=['construct_mu'],
            divergences=True,
            textsize=18)
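
#%%
# Optional sketch (an assumption about how to "compare construct model
# parameters"): contrast the posterior means of every pair of EF constructs
# directly from the trace, analogous to the 'EF - nonEF' contrast in model 2.
from itertools import combinations

ef_construct_names = list(ef_df.construct.cat.categories)
construct_mu_samples = model3_trace['construct_mu']  # shape: (n_samples, n_ef_constructs)

for i, j in combinations(range(n_ef_constructs), 2):
  diff = construct_mu_samples[:, i] - construct_mu_samples[:, j]
  print(f"{ef_construct_names[i]} - {ef_construct_names[j]}: "
        f"mean diff = {diff.mean():.2f}, P(diff > 0) = {(diff > 0).mean():.2f}")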

#%% Model4


#%% Model comparison: compare the models using the WAIC criterion.
# Note: WAIC comparisons are only meaningful for models fit to the same observed
# data; model 3 is fit to the EF subset (and to raw hits), so its score is not
# directly comparable to models 1 and 2.
mc_waic = pm.compare({model1: model1_trace, model2: model2_trace, model3: model3_trace}, ic='WAIC')
pm.compareplot(mc_waic)