Newer
Older
notebooks / py / 20200116_efo_analysis.py
Morteza Ansarinia on 17 Jan 2020 4 KB mdoel3
#%%
#!pip install arviz

import pandas as pd
import numpy as np

import pandas as pd
import pymc3 as pm
import numpy as np

from IPython.display import display
import seaborn as sns

sns.set_style('ticks')
az.style.use("arviz-darkgrid")

#csv_path = '/content/drive/My Drive/Colab Notebooks/data/efo_pubmed_hits.20200114_preproc.csv'
csv_path = '/Users/morteza/workspace/efo_kickoff/datasets/efo_pubmed_hits.20200114_tidy.csv'

df = pd.read_csv(csv_path)

#DEBUG report tasks and concepts with zero hits
no_hits = df[(df.construct_hits==0) | (df.task_hits==0) | (df.hits==0)]
tasks_with_no_hits = no_hits[no_hits.task_hits==0].task.unique()
constructs_with_no_hits = no_hits[no_hits.construct_hits==0].construct.unique()

print(f"The following {len(constructs_with_no_hits)} constructs have zero hits: {', '.join(constructs_with_no_hits)}")
print(f"The following {len(tasks_with_no_hits)} tasks have zero hits: {', '.join(tasks_with_no_hits)}")


#DEBUG remove zero-hits
df = df[(df.construct_hits>0) | (df.task_hits>0) | (df.hits>0)]

# change task and concepts to categories
df['context'] = df['context'].astype('category')
df['construct'] = df['construct'].astype('category')
df['task'] = df['task'].astype('category')

#DEBUG make it one observation per task! it this necessary?
#df = df.sort_values('task', ascending=False).drop_duplicates(['task'])


context_idx = df.context.cat.codes.values
n_context = df.context.cat.categories.size

construct_idx = df.construct.cat.codes.values
n_constructs = df.construct.cat.categories.size

task_idx = df.task.cat.codes.values
n_tasks = df.task.cat.categories.size

n_obs = len(df)

#%%
# model 1

with pm.Model() as model1:
  #pm.glm.GLM.from_formula('hits ~ (construct_hits + context) * task_hits', df)
  mu = pm.Normal('mu', mu=0, sigma=1)
  hits = pm.Normal('hits', mu=mu, sigma=1, observed=df.hits)

  display(pm.model_to_graphviz(model1))
  display(model1.check_test_point())
  trace = pm.sample(model=model1)

  pm.traceplot(trace)



#%% model 2
# -----------------------------------------------

sd_contexts = df.groupby('context').std().hits
hits_grand_mean = df.hits.mean()


with pm.Model() as model2:
  context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)

  context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context)
  
  nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1
  nu = pm.Deterministic('nu', nu_minus_one+1)

  hits = pm.StudentT('hits', nu=nu, mu=context_mu[context_idx], sigma=context_sigma[context_idx], observed=df.hits)

  ef_noef_diff = pm.Deterministic('EF/notEF diff', context_mu[0] - context_mu[1])

  display(pm.model_to_graphviz(model2))
  display(model2.check_test_point())
  trace = pm.sample(model=model2)

  pm.traceplot(trace, var_names=['context_mu','context_sigma'])

  pm.plot_posterior(trace,
    var_names=['context_mu','context_sigma','EF/notEF diff'],
    point_estimate='mode',
    credible_interval=0.95,
    ref_val=0)

#%% TODO: model 3
# -----------------------------------------------
ef_construct_mean = df.groupby('context').std().construct_hits['EF']

display(ef_construct_mean)
df[df.context=='EF'].groupby('construct').sum()
#%%
with pm.Model() as model3:
  context_mu = pm.Normal('context_mu', mu=hits_grand_mean, sigma=sd_contexts*100, shape=n_context)
  context_sigma = pm.Uniform('context_sigma', sd_contexts/1000, sd_contexts*1000, shape=n_context)
  
  construct_mu = pm.Normal('construct_mu', mu=hits_grand_mean, sigma=sd_constructs*100, shape=n_constructs)
  constructs_sigma = pm.Uniform('construct_sigma', sd_constructs/1000, sd_constructs*1000, shape=n_constructs)

  nu_minus_one = pm.Exponential('nu_minus_one',lam=1/29.0) # min(nu) must be 1
  nu = pm.Deterministic('nu', nu_minus_one+1)

  hits = pm.StudentT('hits', nu=nu, mu=construct_mu[construct_idx], sigma=constructs_sigma[construct_idx], observed=df.hits)

  ef_noef_diff = pm.Deterministic('constructs diff (EXAMPLE)', construct_mu[0] - construct_mu[1])

  display(pm.model_to_graphviz(model3))
  display(model3.check_test_point())
  trace = pm.sample(model=model3)

  #pm.traceplot(trace, var_names=['construct_mu','constructs_sigma','constructs diff (EXAMPLE)'])
  pm.plot_posterior(trace,
    var_names=['construct_mu','construct_sigma','constructs diff (EXAMPLE)'],
    point_estimate='mode',
    credible_interval=0.95,
    ref_val=0)