#%%
# 1. libraries
import pandas as pd
import numpy as np
import pymc3 as pm
from scipy.special import expit
from scipy.stats import zscore
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az
# 2. setup themes for plots
sns.set_style('ticks')
az.style.use("arviz-darkgrid")
# 3. a function to calculate logit
def irt_invlogit(ai,di):
return expit(d[int(di)] -a[int(ai)])
# 4. parameters
n_students = 100
n_items = 10
# 5. Ground truth for abilities and difficulties
a = np.random.normal(0, 1, n_students)
d = np.random.normal(0, 1, n_items)
# 6. probability of responding correctly; P is a matrix of probability values generated for each (student, item) pair
P = np.fromfunction(lambda i,j: np.vectorize(irt_invlogit)(i,j), (n_students, n_items))
# 7. observed reponses; R matrix contains success/failure
R = np.array(list(map(np.vectorize(lambda x: 1 if x > np.random.rand(1) else 0), P)))
# 8. create a data frame to keep all data in a single place. Each row holds details of a single trial (a single cell in ability-difficulty matrix)
trials = pd.DataFrame(columns=["student","item","ability","difficulty","prob","response"])
# 9. fill up trial data frame with values from parameters, P, R, a, and, d
for i in range(n_students):
for j in range(n_items):
trials.loc[i*n_items+j] = [i,j,a[i], d[j], P[i,j], R[i,j]]
# 10. student and item are categorical variables
trials['student'] = trials.student.astype('category')
trials['item'] = trials.item.astype('category')
# DEBUG
#display(trials)
# 11. create a list of indices for students and items in the trial (to use when creating PyMC3 model)
student_idx = trials.student.cat.codes.values
item_idx = trials.item.cat.codes.values
# 12. specify PyMC3 mdoel
with pm.Model() as model:
# 12.1. model student abilities as N(0,1), one parameter per student
ability = pm.Normal('ability', mu=0, sigma=1, shape=n_students)
# 12.2. model item difficulty as N(0,1), one parameter per item
difficulty = pm.Normal('difficulty', mu=0, sigma=1, shape=n_items)
# 12.3. model probaility that a student responds correctly to an item (using invlogit function)
p = pm.Deterministic('p', pm.math.invlogit(difficulty[item_idx]-ability[student_idx]))
# 12.4. model likelihood for R (bernolli)
R_obs = pm.Bernoulli('R_obs', p=p, observed=trials.response)
# 12.5. show model diagram
display(pm.model_to_graphviz(model))
# DEBUG
#display(model.check_test_point())
# 13. MCMC model fitting
trace = pm.sample(1000, model=model)
# DEBUG (MAP method)
#map_estimate = pm.find_MAP(model)
# 14. plot posteriors for student abilities
#pm.plot_posterior(trace,var_names=['ability'])
# 15. extract diagnostics and posteriors (only for ability here)
a_trace_summary = pm.summary(trace, var_names=['ability'])
a_est = a_trace_summary['mean'].values
d_trace_summary = pm.summary(trace, var_names=['difficulty'])
d_est = d_trace_summary['mean'].values
# 15. report correlation between ground truth and estimated params
print("correlation of ground truth and estimated models:")
print("abilities:\t", np.corrcoef(a,a_est)[0,1])
print("difficulties:\t", np.corrcoef(d,d_est)[0,1])
# %%