#%% # 1. libraries import pandas as pd import numpy as np import pymc3 as pm from scipy.special import expit from scipy.stats import zscore from IPython.display import display import matplotlib.pyplot as plt import seaborn as sns import arviz as az # 2. setup themes for plots sns.set_style('ticks') az.style.use("arviz-darkgrid") # 3. a function to calculate logit def irt_invlogit(ai,di): return expit(d[int(di)] -a[int(ai)]) # 4. parameters n_students = 100 n_items = 10 # 5. Ground truth for abilities and difficulties a = np.random.normal(0, 1, n_students) d = np.random.normal(0, 1, n_items) # 6. probability of responding correctly; P is a matrix of probability values generated for each (student, item) pair P = np.fromfunction(lambda i,j: np.vectorize(irt_invlogit)(i,j), (n_students, n_items)) # 7. observed reponses; R matrix contains success/failure R = np.array(list(map(np.vectorize(lambda x: 1 if x > np.random.rand(1) else 0), P))) # 8. create a data frame to keep all data in a single place. Each row holds details of a single trial (a single cell in ability-difficulty matrix) trials = pd.DataFrame(columns=["student","item","ability","difficulty","prob","response"]) # 9. fill up trial data frame with values from parameters, P, R, a, and, d for i in range(n_students): for j in range(n_items): trials.loc[i*n_items+j] = [i,j,a[i], d[j], P[i,j], R[i,j]] # 10. student and item are categorical variables trials['student'] = trials.student.astype('category') trials['item'] = trials.item.astype('category') # DEBUG #display(trials) # 11. create a list of indices for students and items in the trial (to use when creating PyMC3 model) student_idx = trials.student.cat.codes.values item_idx = trials.item.cat.codes.values # 12. specify PyMC3 mdoel with pm.Model() as model: # 12.1. model student abilities as N(0,1), one parameter per student ability = pm.Normal('ability', mu=0, sigma=1, shape=n_students) # 12.2. model item difficulty as N(0,1), one parameter per item difficulty = pm.Normal('difficulty', mu=0, sigma=1, shape=n_items) # 12.3. model probaility that a student responds correctly to an item (using invlogit function) p = pm.Deterministic('p', pm.math.invlogit(difficulty[item_idx]-ability[student_idx])) # 12.4. model likelihood for R (bernolli) R_obs = pm.Bernoulli('R_obs', p=p, observed=trials.response) # 12.5. show model diagram display(pm.model_to_graphviz(model)) # DEBUG #display(model.check_test_point()) # 13. MCMC model fitting trace = pm.sample(1000, model=model) # DEBUG (MAP method) #map_estimate = pm.find_MAP(model) # 14. plot posteriors for student abilities #pm.plot_posterior(trace,var_names=['ability']) # 15. extract diagnostics and posteriors (only for ability here) a_trace_summary = pm.summary(trace, var_names=['ability']) a_est = a_trace_summary['mean'].values d_trace_summary = pm.summary(trace, var_names=['difficulty']) d_est = d_trace_summary['mean'].values # 15. report correlation between ground truth and estimated params print("correlation of ground truth and estimated models:") print("abilities:\t", np.corrcoef(a,a_est)[0,1]) print("difficulties:\t", np.corrcoef(d,d_est)[0,1]) # %%