# %%
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from IPython.display import display
from pyro.distributions.torch import Categorical, Normal
# Input CSV for this notebook. Alternative inputs used previously:
# gamedata_path = '~/Downloads/gamedata_preprocessed.csv'
# gamedata_path = '~/Downloads/gamedata_original-v1.csv'
# NOTE(review): the active path is an absolute, machine-specific location.
gamedata_path = '/Users/morteza/Downloads/data/Sample 1000 individuals/data/data_sim6007.csv'
# Lazily load the game data as a dask dataframe; later cells call
# `.compute()` on it whenever concrete values are needed.
DATA = dd.read_csv(gamedata_path)
# To inspect the partitioning: DATA.npartitions
# Bare expression so the notebook cell shows the column names.
DATA.columns
#%%
with ProgressBar():
    # Show how many rows each distinct value has, one column at a time:
    #   user_id -> one entry per subject (original note: 1000 users)
    #   task    -> original note: task is "1" or "2"
    for column in ('user_id', 'task'):
        display(DATA[column].value_counts().compute())
#%%
# Pick a single subject to inspect. `.loc[0]` on the dask frame may yield more
# than one row (presumably one per partition -- confirm), so only the first
# user_id that comes back is kept.
sample_user = (
    DATA
    .loc[0, 'user_id']
    .compute()
    .iloc[0]
)
# Materialise every row belonging to that subject as a pandas frame.
SAMPLE_USER_DATA = DATA.query(
    'user_id == @sample_user',
    local_dict={'sample_user': sample_user},
).compute()
#%%
# Write two HTML exploratory-data-analysis reports: one over the whole
# dataset and one over the single sample user.
from pandas_profiling import ProfileReport

# All users: the dask frame has to be materialised first; minimal profile.
all_users_frame = DATA.compute()
profile = ProfileReport(
    all_users_frame,
    title='Lumocity All Users EDA Report',
    minimal=True,
)
profile.to_file("all_users_eda.html")

# Sample user: already a pandas frame; explorative profile.
profile = ProfileReport(
    SAMPLE_USER_DATA,
    title='Lumocity Sample User EDA Report',
    explorative=True,
)
profile.to_file("sample_user_eda.html")
#%% -----------------------------
# a simple inference model in PPL (Pyro)
import pyro
import pyro.distributions as dist
import torch
from pyro.infer import MCMC, NUTS


# 1. define model
def model(data):
    """Toy generative model of response times.

    Args:
        data: mapping with an ``'rt'`` entry holding the observed response
            times as a 1-D tensor/array. Values must be positive because
            the likelihood is log-normal.

    Returns:
        The value of the observed ``'rt'`` sample site.
    """
    rt_obs = torch.as_tensor(data['rt'], dtype=torch.float)

    # HalfNormal takes a single scale argument, and every sample site needs
    # its own name (the original reused 'z_1' for both sites).
    # NOTE(review): z_1, z_2 and trial_type do not feed the likelihood yet.
    pyro.sample('z_1', dist.HalfNormal(5.))
    pyro.sample('z_2', dist.HalfNormal(10.))

    # Categorical needs a probability vector; uniform priors are used here.
    pyro.sample('trial_type', dist.Categorical(torch.ones(4) / 4))
    age = pyro.sample('age', dist.Categorical(torch.ones(20) / 20))

    # Tensor-valued branching instead of a Python `if`: the discrete sites
    # are enumerated in parallel during MCMC, so `age` is not a scalar there.
    # NOTE(review): a 20-way Categorical yields indices 0..19, so `age > 70`
    # is never true as written -- confirm how age categories map to years.
    is_older = age > 70
    rt_mean = torch.where(is_older, torch.tensor(1.), torch.tensor(2.))
    rt_std = torch.where(is_older, torch.tensor(5.), torch.tensor(10.))

    # One conditionally independent observation per trial.
    with pyro.plate('trials', rt_obs.shape[0]):
        rt = pyro.sample('rt', dist.LogNormal(rt_mean, rt_std), obs=rt_obs)

    # TODO: the original observed `accuracy` without any distribution, which
    # is not a valid sample statement; add a likelihood for data['accuracy']
    # once one has been chosen.
    return rt


# 2. run MCMC
pyro.set_rng_seed(0)
# NOTE(review): the original passed an undefined name `NB`. The sample user's
# trials are used instead, assuming that frame has an `rt` column -- confirm.
observed = {
    'rt': torch.as_tensor(SAMPLE_USER_DATA['rt'].to_numpy(), dtype=torch.float),
}
# max_plate_nesting=1 matches the single 'trials' plate; it is needed because
# the model contains discrete sites that are enumerated.
mcmc = MCMC(NUTS(model, max_plate_nesting=1), warmup_steps=250, num_samples=1000)
mcmc.run(observed)
# 3. reports
mcmc.summary()
#%%
# simulate Ornstein–Uhlenbeck process
import numpy as np


def simulate_ou(ts, asymptote, rate, w, dt, rng=None):
    """Simulate one Ornstein–Uhlenbeck path with the Euler–Maruyama scheme.

    Args:
        ts: 1-D array of time points; only its length is used.
        asymptote: level the process reverts to.
        rate: mean-reversion rate.
        w: noise intensity (diffusion coefficient).
        dt: time step between consecutive points.
        rng: optional object with a ``normal(size=...)`` method, e.g.
            ``np.random.default_rng(seed)``. Defaults to NumPy's global
            random state.

    Returns:
        Float array with one value per entry of ``ts``, starting at 0.
    """
    source = np.random if rng is None else rng
    n_points = len(ts)
    path = np.zeros_like(ts, dtype=float)
    # Draw all standard-normal shocks up front; the recursion stays sequential.
    shocks = source.normal(size=max(n_points - 1, 0))
    # A Wiener increment over a step of length dt has standard deviation
    # sqrt(dt); without this factor the noise level depends on the step size.
    noise_scale = w * np.sqrt(dt)
    for t in range(1, n_points):
        x = path[t - 1]
        drift = rate * (asymptote - x) * dt
        path[t] = x + drift + noise_scale * shocks[t - 1]
    return path


dt = .1
max_t = 100
asymptote = 5
rate = .1
w = 1
ts = np.arange(0, max_t, dt)
X = simulate_ou(ts, asymptote, rate, w, dt)
import matplotlib.pyplot as plt

# Draw on the current axes: the simulated path, plus a thin red dash-dot
# reference line at the asymptote level from time 0 to max_t.
axes = plt.gca()
axes.plot(ts, X)
axes.hlines(asymptote, xmin=0, xmax=max_t, color='r', linestyles='-.', lw=1)
axes.set_xlabel('time step')
plt.show()
# X_t