# notebooks / python / 20200204_irt_pymc3.py
#%%

# 1. libraries 
import pandas as pd
import numpy as np
import pymc3 as pm

from scipy.special import expit
from scipy.stats import zscore
from IPython.display import display


import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az

# 2. setup themes for plots: seaborn 'ticks' style for matplotlib figures,
# ArviZ's dark-grid style for its diagnostic/posterior plots
sns.set_style('ticks')
az.style.use("arviz-darkgrid")

# 3. inverse-logit of (difficulty - ability) for one (student, item) pair
def irt_invlogit(ai, di, ability=None, difficulty=None):
  """Return P(correct) = expit(difficulty[di] - ability[ai]).

  ai, di may be floats (e.g. from np.fromfunction); they are truncated to
  integer indices. ability/difficulty default to the module-level ground-truth
  arrays `a` and `d`, so existing two-argument callers are unchanged; pass
  explicit arrays to avoid the global dependence.

  NOTE(review): the sign convention (difficulty - ability) is unusual for IRT
  but deliberately matches the PyMC3 model below.
  """
  ability = a if ability is None else ability
  difficulty = d if difficulty is None else difficulty
  return expit(difficulty[int(di)] - ability[int(ai)])

# 4. simulation sizes
n_students, n_items = 100, 10

# 5. ground-truth latent parameters, drawn i.i.d. from a standard normal
a = np.random.normal(loc=0, scale=1, size=n_students)  # one ability per student
d = np.random.normal(loc=0, scale=1, size=n_items)     # one difficulty per item

 # 6. probability of responding correctly; P is a matrix of probability values generated for each (student, item) pair
# vectorized via broadcasting: P[i, j] = expit(d[j] - a[i]), identical to
# applying irt_invlogit per cell but without np.fromfunction + np.vectorize's
# Python-level loop
P = expit(d[np.newaxis, :] - a[:, np.newaxis])

# 7. observed responses; R[i, j] = 1 with probability P[i, j] (one Bernoulli
# draw per cell). One vectorized uniform draw per cell replaces the original
# per-element np.vectorize lambda comparing against np.random.rand(1).
R = (np.random.rand(*P.shape) < P).astype(int)

# 8. create a data frame to keep all data in a single place. Each row holds
# details of a single trial (one cell of the student x item matrix), in
# student-major order — same row order as the original nested loops.
# Built column-wise in a single constructor call: row-by-row .loc assignment
# into an empty frame is quadratic and leaves every column with object dtype.
trials = pd.DataFrame({
    "student": np.repeat(np.arange(n_students), n_items),
    "item": np.tile(np.arange(n_items), n_students),
    "ability": np.repeat(a, n_items),
    "difficulty": np.tile(d, n_students),
    "prob": P.ravel(),
    "response": R.ravel(),
})

# 10. treat student and item ids as categorical variables
for col in ("student", "item"):
  trials[col] = trials[col].astype("category")

# DEBUG
#display(trials)


# 11. integer codes identifying each trial's student and item; these index
# into the per-student / per-item parameter vectors of the PyMC3 model below
student_idx = trials["student"].cat.codes.values
item_idx = trials["item"].cat.codes.values

# 12. specify the PyMC3 model (Rasch-style 1-parameter IRT)
with pm.Model() as model:

  # 12.1. prior on student abilities: N(0, 1), one parameter per student
  ability = pm.Normal('ability', mu=0, sigma=1, shape=n_students)

  # 12.2. prior on item difficulties: N(0, 1), one parameter per item
  difficulty = pm.Normal('difficulty', mu=0, sigma=1, shape=n_items)

  # 12.3. probability that a student responds correctly to an item; the sign
  # convention (difficulty - ability) matches the simulator irt_invlogit above
  p = pm.Deterministic('p', pm.math.invlogit(difficulty[item_idx]-ability[student_idx]))

  # 12.4. likelihood: each observed response is a Bernoulli draw with its own p
  R_obs = pm.Bernoulli('R_obs', p=p, observed=trials.response)

  # 12.5. show the model diagram (rendered inline in a notebook; needs graphviz)
  display(pm.model_to_graphviz(model))

  # DEBUG
  #display(model.check_test_point())

# 13. MCMC model fitting: 1000 posterior draws per chain (sampler defaults apply)
trace = pm.sample(1000, model=model)

# DEBUG (MAP method)
#map_estimate = pm.find_MAP(model)

# 14. plot posteriors for student abilities
#pm.plot_posterior(trace,var_names=['ability'])

# 15. posterior summaries; the posterior mean of each parameter serves as its
# point estimate (only ability and difficulty are summarized here)
a_trace_summary = pm.summary(trace, var_names=['ability'])
d_trace_summary = pm.summary(trace, var_names=['difficulty'])
a_est = a_trace_summary['mean'].to_numpy()
d_est = d_trace_summary['mean'].to_numpy()

# 16. report Pearson correlation between ground truth and the point estimates
print("correlation of ground truth and estimated models:")
print("abilities:\t", np.corrcoef(a, a_est)[0, 1])
print("difficulties:\t", np.corrcoef(d, d_est)[0, 1])




# %%