#%% #!pip install arviz import pandas as pd import numpy as np import pandas as pd import pymc3 as pm import numpy as np from IPython.display import display import seaborn as sns sns.set_style('ticks') az.style.use("arviz-darkgrid") #csv_path = '/content/drive/My Drive/Colab Notebooks/data/efo_pubmed_hits.20200114_preproc.csv' csv_path = '/Users/morteza/workspace/efo_kickoff/datasets/efo_pubmed_hits.20200114_tidy.csv' df = pd.read_csv(csv_path) #DEBUG report tasks and concepts with zero hits no_hits = df[(df.construct_hits==0) | (df.task_hits==0) | (df.hits==0)] tasks_with_no_hits = no_hits[no_hits.task_hits==0].task.unique() constructs_with_no_hits = no_hits[no_hits.construct_hits==0].construct.unique() print(f"The following {len(constructs_with_no_hits)} constructs have zero hits: {', '.join(constructs_with_no_hits)}") print(f"The following {len(tasks_with_no_hits)} tasks have zero hits: {', '.join(tasks_with_no_hits)}") #DEBUG remove zero-hits df = df[(df.construct_hits>0) | (df.task_hits>0) | (df.hits>0)] # change task and concepts to categories df['context'] = df['context'].astype('category') df['construct'] = df['construct'].astype('category') df['task'] = df['task'].astype('category') #DEBUG make it one observation per task! it this necessary? #df = df.sort_values('task', ascending=False).drop_duplicates(['task']) context_idx = df.context.cat.codes.values n_context = df.context.cat.categories.size construct_idx = df.construct.cat.codes.values n_constructs = df.construct.cat.categories.size task_idx = df.task.cat.codes.values n_tasks = df.task.cat.categories.size n_obs = len(df) #%% # model 1 with pm.Model() as model1: #pm.glm.GLM.from_formula('hits ~ (construct_hits + context) * task_hits', df) mu = pm.Normal('mu', mu=0, sigma=1) hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits) display(pm.model_to_graphviz(model1)) display(model1.check_test_point()) trace = pm.sample(model=model1) pm.traceplot(trace) #%% model 2 # ----------------------------------------------- sd_contexts = df.groupby('context').std().hits hits_grand_mean = df.hits.mean() with pm.Model() as model2: context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context) context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context) nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1 nu = pm.Deterministic('nu', nu_minus_one+1) hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits) ef_noef_diff = pm.Deterministic('EF/notEF diff', context_mu[0] - context_mu[1]) display(pm.model_to_graphviz(model2)) display(model2.check_test_point()) trace = pm.sample(model=model2) pm.traceplot(trace, var_names=['context_mu','context_sigma']) pm.plot_posterior(trace, var_names=['context_mu','context_sigma','EF/notEF diff'], point_estimate='mode', credible_interval=0.95, ref_val=0) #%% TODO: model 3 # ----------------------------------------------- ef_construct_mean = df.groupby('context').std().construct_hits['EF'] display(ef_construct_mean) df[df.context=='EF'].groupby('construct').sum() #%% with pm.Model() as model3: context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context) context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context) construct_mu = pm.Normal('construct_mu', mu=hits_grand_mean, sigma=sd_constructs*100, shape=n_constructs) constructs_sigma = pm.Uniform('construct_sigma', sd_constructs/1000, sd_constructs*1000, shape=n_constructs) nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1 nu = pm.Deterministic('nu', nu_minus_one+1) hits = pm.StudentT('hits', nu=nu, mu=construct_mu[construct_idx], sigma=constructs_sigma[construct_idx], observed=df.hits) ef_noef_diff = pm.Deterministic('constructs diff (EXAMPLE)', construct_mu[0] - construct_mu[1]) display(pm.model_to_graphviz(model3)) display(model3.check_test_point()) trace = pm.sample(model=model3) #pm.traceplot(trace, var_names=['construct_mu','constructs_sigma','constructs diff (EXAMPLE)']) pm.plot_posterior(trace, var_names=['construct_mu','construct_sigma','constructs diff (EXAMPLE)'], point_estimate='mode', credible_interval=0.95, ref_val=0)