#%% [markdown]
# The following cell prepares the environment, installs the dependencies, and loads the tidy dataset as `df`.
#%%
#!pip install pymc3 pandas matplotlib numpy seaborn arviz
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
from IPython.display import display
from scipy.stats import zscore

sns.set_style('ticks')
az.style.use("arviz-darkgrid")
#csv_path = '/content/drive/My Drive/Colab Notebooks/data/efo_pubmed_hits.20200114_preproc.csv'
csv_path = '/Users/morteza/workspace/efo_kickoff/datasets/efo_pubmed_hits.20200114_tidy.csv'
df = pd.read_csv(csv_path)
no_hits = df[(df.construct_hits==0) | (df.task_hits==0) | (df.hits==0)]
tasks_with_no_hits = no_hits[no_hits.task_hits==0].task.unique()
constructs_with_no_hits = no_hits[no_hits.construct_hits==0].construct.unique()
print(f"The following {len(constructs_with_no_hits)} constructs have zero hits: {', '.join(constructs_with_no_hits)}")
print(f"The following {len(tasks_with_no_hits)} tasks have zero hits: {', '.join(tasks_with_no_hits)}")
# convert context, construct, and task columns to categoricals
df['context'] = df['context'].astype('category')
df['construct'] = df['construct'].astype('category')
df['task'] = df['task'].astype('category')
#DEBUG make it one observation per task! is this necessary?
#df = df.sort_values('task', ascending=False).drop_duplicates(['task'])
# drop observations for which either the task or the construct has zero hits
df = df[~((df.task_hits == 0) | (df.construct_hits == 0))]
context_idx = df.context.cat.codes.values
n_context = df.context.cat.categories.size
construct_idx = df.construct.cat.codes.values
n_constructs = df.construct.cat.categories.size
task_idx = df.task.cat.codes.values
n_tasks = df.task.cat.categories.size
n_obs = len(df)
df['hits_zscore'] = zscore(df.hits)
df['task_hits_zscore'] = zscore(df.task_hits)
df['task_construct_zscore'] = zscore(df.construct_hits)
#%%
# Model 1 tries to predict/explain the (z-scored) task-construct hits using a single normal likelihood with a broad normal prior on its mean
with pm.Model() as model1:
    # prior on the mean of the z-scored hits
    mu = pm.Normal('mu', mu=0, sigma=100)
    # likelihood
    hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits_zscore)
display(pm.model_to_graphviz(model1))
display(model1.check_test_point())
model1_trace = pm.sample(1000, model=model1)
# plot traceplot
az.plot_trace(model1_trace)
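# a quick numerical complement to the traceplot: posterior summary of `mu`
# (a minimal sketch using pm.summary, the same call applied to model3 below)
display(pm.summary(model1_trace))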
#%% Model 2: model the task-construct hits with per-context priors, then compare the EF and nonEF contexts.
# -----------------------------------------------
sd_contexts = df.groupby('context').std().hits_zscore
hits_grand_mean = df.hits_zscore.mean() #FIXME isn't it zero?!
with pm.Model() as model2:
    # prior on `nu`, the degrees-of-freedom parameter of the StudentT likelihood
    nu_minus_one = pm.Exponential('nu_minus_one', lam=1/29.0)  # nu must be at least 1
    nu = pm.Deterministic('nu', nu_minus_one + 1)
    # per-context location and scale parameters
    context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)
    context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context)
    # difference between the average hits in the EF and nonEF contexts
    ef_noef_diff = pm.Deterministic('EF - nonEF', context_mu[0] - context_mu[1])
    # likelihood
    hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits_zscore)
display(pm.model_to_graphviz(model2))
display(model2.check_test_point())
model2_trace = pm.sample(model=model2)
# plots
pm.traceplot(model2_trace, var_names=['context_mu', 'context_sigma', 'nu'])
pm.plot_posterior(model2_trace,
                  var_names=['context_mu', 'context_sigma', 'EF - nonEF', 'nu'],
                  point_estimate='mode',
                  credible_interval=0.95,
                  ref_val=0)
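# a minimal follow-up sketch on the contrast above: the share of posterior samples in
# which the EF context shows more hits than the nonEF context (assumes the category
# order EF, nonEF used by the `EF - nonEF` deterministic)
ef_gt_nonef = (model2_trace['EF - nonEF'] > 0).mean()
print(f"Proportion of posterior samples with EF > nonEF: {ef_gt_nonef:.3f}")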
#%% Model 3: predict `task-construct` hits in the EF context only, then compare the per-`construct` parameters
# -----------------------------------------------
ef_df = df[df.context == 'EF'].copy()
ef_df['construct'] = ef_df.construct.cat.remove_unused_categories()  # keep only EF constructs so category codes index the parameters contiguously
ef_construct_mean = ef_df.groupby('construct').mean().construct_hits
ef_construct_idx = ef_df.construct.cat.codes.values
n_ef_constructs = ef_df.construct.cat.categories.size
ef_task_idx = ef_df.task.cat.codes.values
n_ef_tasks = ef_df.task.cat.categories.size
with pm.Model() as model3:
    construct_mu = pm.Normal('construct_mu', mu=ef_construct_mean, sigma=100, shape=n_ef_constructs)
    construct_sigma = pm.Uniform('construct_sigma', 1/100, 100, shape=n_ef_constructs)
    nu_minus_one = pm.Exponential('nu_minus_one', lam=1/29.0)  # nu must be at least 1
    nu = pm.Deterministic('nu', nu_minus_one + 1)
    hits = pm.StudentT('hits', nu=nu, mu=construct_mu[ef_construct_idx], sigma=construct_sigma[ef_construct_idx], observed=ef_df.hits)
display(pm.model_to_graphviz(model3))
display(model3.check_test_point())
model3_trace = pm.sample(model=model3)
display(pm.summary(model3_trace))
pm.forestplot(model3_trace, combined=True, var_names=['construct_mu'], credible_interval=0.95)
pm.traceplot(model3_trace)
pm.plot_posterior(model3_trace,
                  point_estimate='mode',
                  credible_interval=0.95,
                  ref_val=0)
az.plot_pair(model3_trace,
             var_names=['construct_mu'],
             divergences=True,
             textsize=18)
#%% Model4
#%% Model Comparison: compare the models using the WAIC criterion
# compare() needs traces (with their models) rather than the model objects themselves;
# here the arviz equivalents of pm.compare/pm.compareplot are applied to converted traces
model_traces = {'model1': az.from_pymc3(model1_trace, model=model1),
                'model2': az.from_pymc3(model2_trace, model=model2),
                'model3': az.from_pymc3(model3_trace, model=model3)}
mc_waic = az.compare(model_traces, ic='waic')
az.plot_compare(mc_waic)
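# show the WAIC comparison table alongside the plot (a minimal addition; az.compare
# returns a pandas DataFrame ranked by estimated out-of-sample predictive accuracy)
display(mc_waic)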