diff --git a/py/20200204_irt_pymc3.py b/py/20200204_irt_pymc3.py new file mode 100644 index 0000000..81fd873 --- /dev/null +++ b/py/20200204_irt_pymc3.py @@ -0,0 +1,104 @@ +#%% + +# 1. libraries +import pandas as pd +import numpy as np +import pymc3 as pm + +from scipy.special import expit +from scipy.stats import zscore +from IPython.display import display + + +import matplotlib.pyplot as plt +import seaborn as sns +import arviz as az + +# 2. setup themes for plots +sns.set_style('ticks') +az.style.use("arviz-darkgrid") + +# 3. a function to calculate logit +def irt_invlogit(ai,di): + return expit(d[int(di)] -a[int(ai)]) + +# 4. parameters +n_students = 100 +n_items = 10 + +# 5. Ground truth for abilities and difficulties +a = np.random.normal(0, 1, n_students) +d = np.random.normal(0, 1, n_items) + + # 6. probability of responding correctly; P is a matrix of probability values generated for each (student, item) pair +P = np.fromfunction(lambda i,j: np.vectorize(irt_invlogit)(i,j), (n_students, n_items)) + +# 7. observed reponses; R matrix contains success/failure +R = np.array(list(map(np.vectorize(lambda x: 1 if x > np.random.rand(1) else 0), P))) + +# 8. create a data frame to keep all data in a single place. Each row holds details of a single trial (a single cell in ability-difficulty matrix) +trials = pd.DataFrame(columns=["student","item","ability","difficulty","prob","response"]) + +# 9. fill up trial data frame with values from parameters, P, R, a, and, d +for i in range(n_students): + for j in range(n_items): + trials.loc[i*n_items+j] = [i,j,a[i], d[j], P[i,j], R[i,j]] + +# 10. student and item are categorical variables +trials['student'] = trials.student.astype('category') +trials['item'] = trials.item.astype('category') + +# DEBUG +#display(trials) + + +# 11. create a list of indices for students and items in the trial (to use when creating PyMC3 model) +student_idx = trials.student.cat.codes.values +item_idx = trials.item.cat.codes.values + +# 12. specify PyMC3 mdoel +with pm.Model() as model: + + # 12.1. model student abilities as N(0,1), one parameter per student + ability = pm.Normal('ability', mu=0, sigma=1, shape=n_students) + + # 12.2. model item difficulty as N(0,1), one parameter per item + difficulty = pm.Normal('difficulty', mu=0, sigma=1, shape=n_items) + + # 12.3. model probaility that a student responds correctly to an item (using invlogit function) + p = pm.Deterministic('p', pm.math.invlogit(difficulty[item_idx]-ability[student_idx])) + + # 12.4. model likelihood for R (bernolli) + R_obs = pm.Bernoulli('R_obs', p=p, observed=trials.response) + + # 12.5. show model diagram + display(pm.model_to_graphviz(model)) + + # DEBUG + #display(model.check_test_point()) + +# 13. MCMC model fitting +trace = pm.sample(1000, model=model) + +# DEBUG (MAP method) +#map_estimate = pm.find_MAP(model) + +# 14. plot posteriors for student abilities +#pm.plot_posterior(trace,var_names=['ability']) + +# 15. extract diagnostics and posteriors (only for ability here) +a_trace_summary = pm.summary(trace, var_names=['ability']) +a_est = a_trace_summary['mean'].values + +d_trace_summary = pm.summary(trace, var_names=['difficulty']) +d_est = d_trace_summary['mean'].values + +# 15. report correlation between ground truth and estimated params +print("correlation of ground truth and estimated models:") +print("abilities:\t", np.corrcoef(a,a_est)[0,1]) +print("difficulties:\t", np.corrcoef(d,d_est)[0,1]) + + + + +# %%