diff --git a/py/20200116_efo_analysis.py b/py/20200116_efo_analysis.py
index 6e1ee0f..348e7b1 100644
--- a/py/20200116_efo_analysis.py
+++ b/py/20200116_efo_analysis.py
@@ -1,5 +1,11 @@
+#%% [markdown]
+
+# The following cell prepares the environment, installs the dependencies, and loads the tidy dataset as `df`.
+
 #%%
-#!pip install arviz
+#!pip install pymc3 pandas matplotlib numpy seaborn arviz
+
+import matplotlib.pyplot as plt
 
 import pandas as pd
 import numpy as np
@@ -11,6 +17,8 @@
 from IPython.display import display
 import seaborn as sns
 
+from scipy.stats import zscore
+
 sns.set_style('ticks')
 az.style.use("arviz-darkgrid")
@@ -19,7 +27,6 @@
 df = pd.read_csv(csv_path)
 
-#DEBUG report tasks and concepts with zero hits
 no_hits = df[(df.construct_hits==0) | (df.task_hits==0) | (df.hits==0)]
 tasks_with_no_hits = no_hits[no_hits.task_hits==0].task.unique()
 constructs_with_no_hits = no_hits[no_hits.construct_hits==0].construct.unique()
@@ -28,9 +35,6 @@
 print(f"The following {len(tasks_with_no_hits)} tasks have zero hits: {', '.join(tasks_with_no_hits)}")
 
-#DEBUG remove zero-hits
-df = df[(df.construct_hits>0) | (df.task_hits>0) | (df.hits>0)]
-
 # change task and concepts to categories
 df['context'] = df['context'].astype('category')
 df['construct'] = df['construct'].astype('category')
@@ -39,6 +43,7 @@
 #DEBUG make it one observation per task! it this necessary?
 #df = df.sort_values('task', ascending=False).drop_duplicates(['task'])
+df = df[~((df.task_hits == 0) | (df.construct_hits == 0))]
 
 context_idx = df.context.cat.codes.values
 n_context = df.context.cat.categories.size
@@ -51,81 +56,108 @@
 n_obs = len(df)
 
+df['hits_zscore'] = zscore(df.hits)
+df['task_hits_zscore'] = zscore(df.task_hits)
+df['construct_hits_zscore'] = zscore(df.construct_hits)
+
 #%%
-# model 1
+# Model 1 tries to predict/explain the task/construct hits using a single normal likelihood with a weakly-informative prior on its mean
+
 with pm.Model() as model1:
-    #pm.glm.GLM.from_formula('hits ~ (construct_hits + context) * task_hits', df)
-    mu = pm.Normal('mu', mu=0, sigma=1)
-    hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits)
+    # prior on the mean value
+    mu = pm.Normal('mu', sigma=100)
+
+    # likelihood
+    hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits_zscore)
 
     display(pm.model_to_graphviz(model1))
    display(model1.check_test_point())
 
-    trace = pm.sample(model=model1)
-    pm.traceplot(trace)
+    model1_trace = pm.sample(1000, model=model1)
 
+# plot the trace
+az.plot_trace(model1_trace)
 
-#%% model 2
+#%% Model 2: model task-construct hits with per-context priors, then compare EF to nonEF.
 # -----------------------------------------------
-sd_contexts = df.groupby('context').std().hits
-hits_grand_mean = df.hits.mean()
-
+sd_contexts = df.groupby('context').std().hits_zscore
+hits_grand_mean = df.hits_zscore.mean() # FIXME isn't it zero?!
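+
+# A quick sanity check (editor's sketch, not part of the original analysis):
+# after z-scoring, the grand mean should be ~0 (answering the FIXME above) and
+# the overall SD should be ~1, both up to floating-point error.
+print(f"grand mean = {hits_grand_mean:.2e}, overall sd = {df.hits_zscore.std():.3f}")
+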
 with pm.Model() as model2:
-    context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)
+    # prior on `nu`, the degrees-of-freedom parameter of the StudentT likelihood
+    nu_minus_one = pm.Exponential('nu_minus_one', lam=1/29.0) # min(nu) must be 1
+    nu = pm.Deterministic('nu', nu_minus_one + 1)
+
+    # context parameters
+    context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)
     context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context)
 
-    nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1
-    nu = pm.Deterministic('nu', nu_minus_one+1)
+    # difference between the average hits in EF and nonEF
+    ef_noef_diff = pm.Deterministic('EF - nonEF', context_mu[0] - context_mu[1])
 
-    hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits)
-
-    ef_noef_diff = pm.Deterministic('EF/notEF diff', context_mu[0] - context_mu[1])
+    # likelihood
+    hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits_zscore)
 
     display(pm.model_to_graphviz(model2))
     display(model2.check_test_point())
 
-    trace = pm.sample(model=model2)
+    model2_trace = pm.sample(model=model2)
 
-    pm.traceplot(trace, var_names=['context_mu','context_sigma'])
-
-    pm.plot_posterior(trace,
-                      var_names=['context_mu','context_sigma','EF/notEF diff'],
+    # plots
+    pm.traceplot(model2_trace, var_names=['context_mu','context_sigma','nu'])
+    pm.plot_posterior(model2_trace,
+                      var_names=['context_mu','context_sigma','EF - nonEF','nu'],
                       point_estimate='mode',
                       credible_interval=0.95,
                       ref_val=0)
 
-#%% TODO: model 3
+#%% Model 3: predict `task-construct` hits in the EF context only, then compare `construct` model parameters
 # -----------------------------------------------
-ef_construct_mean = df.groupby('context').std().construct_hits['EF']
+ef_df = df[df.context=='EF']
+ef_construct_mean = ef_df.groupby('construct').mean().construct_hits
 
-display(ef_construct_mean)
-df[df.context=='EF'].groupby('construct').sum()
-#%%
+ef_construct_idx = ef_df.construct.cat.codes.values
+n_ef_constructs = ef_df.construct.cat.categories.size
+
+ef_task_idx = ef_df.task.cat.codes.values
+n_ef_tasks = ef_df.task.cat.categories.size
+
 with pm.Model() as model3:
-    context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)
-    context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context)
-
-    construct_mu = pm.Normal('construct_mu', mu=hits_grand_mean, sigma=sd_constructs*100, shape=n_constructs)
-    constructs_sigma = pm.Uniform('construct_sigma', sd_constructs/1000, sd_constructs*1000, shape=n_constructs)
+    # per-construct parameters
+    construct_mu = pm.Normal('construct_mu', mu=ef_construct_mean, sigma=100, shape=n_ef_constructs)
+    construct_sigma = pm.Uniform('construct_sigma', 1/100, 100, shape=n_ef_constructs)
 
     nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1
     nu = pm.Deterministic('nu', nu_minus_one+1)
 
-    hits = pm.StudentT('hits', nu=nu, mu=construct_mu[construct_idx], sigma=constructs_sigma[construct_idx], observed=df.hits)
-
-    ef_noef_diff = pm.Deterministic('constructs diff (EXAMPLE)', construct_mu[0] - construct_mu[1])
+    # likelihood
+    hits = pm.StudentT('hits', nu=nu, mu=construct_mu[ef_construct_idx], sigma=construct_sigma[ef_construct_idx], observed=ef_df.hits)
 
     display(pm.model_to_graphviz(model3))
     display(model3.check_test_point())
 
-    trace = pm.sample(model=model3)
+    model3_trace = pm.sample(model=model3)
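+
+    # Editor's sketch (assumes the default NUTS sampler, which records the
+    # 'diverging' statistic): count divergent transitions before interpreting
+    # the posterior.
+    print(f"model3: {model3_trace['diverging'].sum()} divergent transitions")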
 
-    #pm.traceplot(trace, var_names=['construct_mu','constructs_sigma','constructs diff (EXAMPLE)'])
-    pm.plot_posterior(trace,
-                      var_names=['construct_mu','construct_sigma','constructs diff (EXAMPLE)'],
+    display(pm.summary(model3_trace))
+    pm.forestplot(model3_trace, combined=True, var_names=['construct_mu'], credible_interval=0.95)
+
+    pm.traceplot(model3_trace)
+    pm.plot_posterior(model3_trace,
                       point_estimate='mode',
                       credible_interval=0.95,
                       ref_val=0)
 
+az.plot_pair(model3_trace,
+             var_names=['construct_mu'],
+             divergences=True,
+             textsize=18)
+
+#%% Model 4
+
+
+#%% Model comparison: compare the models using the WAIC criterion
+# `pm.compare` expects posterior traces keyed by their models, not bare model objects
+models = {model1: model1_trace, model2: model2_trace, model3: model3_trace}
+mc_waic = pm.compare(models, ic='WAIC')
+pm.compareplot(mc_waic)
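+
+#%% Posterior predictive check (editor's sketch, not in the original notebook):
+# draw replicated data from model2 and overlay the observed standardized hits.
+# Assumes a PyMC3 release that provides `sample_posterior_predictive`
+# (older versions call it `sample_ppc`).
+with model2:
+    model2_ppc = pm.sample_posterior_predictive(model2_trace, samples=500)
+
+ax = az.plot_dist(model2_ppc['hits'].ravel(), color='C1', label='posterior predictive')
+az.plot_dist(df.hits_zscore, color='C0', label='observed', ax=ax)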