diff --git a/py/20200116_efo_analysis.py b/py/20200116_efo_analysis.py index e69de29..4a65c45 100644 --- a/py/20200116_efo_analysis.py +++ b/py/20200116_efo_analysis.py @@ -0,0 +1,130 @@ +#%% +#!pip install arviz + +import pandas as pd +import numpy as np + +import pandas as pd +import pymc3 as pm +import numpy as np + +from IPython.display import display +import seaborn as sns + +sns.set_style('ticks') +az.style.use("arviz-darkgrid") + +#csv_path = '/content/drive/My Drive/Colab Notebooks/data/efo_pubmed_hits.20200114_preproc.csv' +csv_path = '/Users/morteza/workspace/efo_kickoff/datasets/efo_pubmed_hits.20200114_tidy.csv' + +df = pd.read_csv(csv_path) + +#DEBUG report tasks and concepts with zero hits +no_hits = df[(df.construct_hits==0) | (df.task_hits==0) | (df.hits==0)] +tasks_with_no_hits = no_hits[no_hits.task_hits==0].task.unique() +constructs_with_no_hits = no_hits[no_hits.construct_hits==0].construct.unique() + +print(f"The following {len(constructs_with_no_hits)} constructs have zero hits: {', '.join(constructs_with_no_hits)}") +print(f"The following {len(tasks_with_no_hits)} tasks have zero hits: {', '.join(tasks_with_no_hits)}") + + +#DEBUG remove zero-hits +df = df[(df.construct_hits>0) | (df.task_hits>0) | (df.hits>0)] + +# change task and concepts to categories +df['context'] = df['context'].astype('category') +df['construct'] = df['construct'].astype('category') +df['task'] = df['task'].astype('category') + +#DEBUG make it one observation per task! it this necessary? +#df = df.sort_values('task', ascending=False).drop_duplicates(['task']) + + +context_idx = df.context.cat.codes.values +n_context = df.context.cat.categories.size + +construct_idx = df.construct.cat.codes.values +n_constructs = df.construct.cat.categories.size + +task_idx = df.task.cat.codes.values +n_tasks = df.task.cat.categories.size + +n_obs = len(df) + +#%% +# model 1 + +with pm.Model() as model1: + #pm.glm.GLM.from_formula('hits ~ (construct_hits + context) * task_hits', df) + mu = pm.Normal('mu', mu=0, sigma=1) + hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits) + + display(pm.model_to_graphviz(model1)) + display(model1.check_test_point()) + trace = pm.sample(model=model1) + + pm.traceplot(trace) + + + +#%% model 2 +# ----------------------------------------------- + +sd_contexts = df.groupby('context').std().hits +hits_grand_mean = df.hits.mean() + + +with pm.Model() as model2: + context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context) + + context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context) + + nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1 + nu = pm.Deterministic('nu', nu_minus_one+1) + + hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits) + + ef_noef_diff = pm.Deterministic('EF/notEF diff', context_mu[0] - context_mu[1]) + + display(pm.model_to_graphviz(model2)) + display(model2.check_test_point()) + trace = pm.sample(model=model2) + + pm.traceplot(trace, var_names=['context_mu','context_sigma']) + + pm.plot_posterior(trace, + var_names=['context_mu','context_sigma','EF/notEF diff'], + point_estimate='mode', + credible_interval=0.95, + ref_val=0) + +#%% TODO: model 3 +# ----------------------------------------------- +sd_constructs = df.groupby('construct').std().construct_hits + + +with pm.Model() as model3: + context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context) + context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context) + + construct_mu = pm.Normal('construct_mu', mu=hits_grand_mean, sigma=sd_constructs*100, shape=n_constructs) + constructs_sigma = pm.Uniform('construct_sigma', sd_constructs/1000, sd_constructs*1000, shape=n_constructs) + + nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1 + nu = pm.Deterministic('nu', nu_minus_one+1) + + hits = pm.StudentT('hits', nu=nu, mu=construct_mu[construct_idx], sigma=constructs_sigma[construct_idx], observed=df.hits) + + ef_noef_diff = pm.Deterministic('constructs diff (EXAMPLE)', construct_mu[0] - construct_mu[1]) + + display(pm.model_to_graphviz(model3)) + display(model3.check_test_point()) + trace = pm.sample(model=model3) + + #pm.traceplot(trace, var_names=['construct_mu','constructs_sigma','constructs diff (EXAMPLE)']) + + pm.plot_posterior(trace, + var_names=['construct_mu','constructs_sigma','constructs diff (EXAMPLE)'], + point_estimate='mode', + credible_interval=0.95, + ref_val=0) \ No newline at end of file