# %%
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from IPython.display import display
from pyro.distributions.torch import Categorical, Normal

# gamedata_path = '~/Downloads/gamedata_preprocessed.csv'
# gamedata_path = '~/Downloads/gamedata_original-v1.csv'
gamedata_path = '/Users/morteza/Downloads/data/Sample 1000 individuals/data/data_sim6007.csv'

# lazy load gamedata
DATA = dd.read_csv(gamedata_path)

# df.npartitions
DATA.columns

# %%
with ProgressBar():
    # number of unique subjects
    display(DATA['user_id'].value_counts().compute())  # 1000 users
    display(DATA['task'].value_counts().compute())  # task "1" or "2"

# %%
# dask evaluates the lookup across partitions, so label 0 can match more than
# one row; keep only the first match to get a single user_id.
sample_user = DATA.loc[0, 'user_id'].compute().iloc[0]
SAMPLE_USER_DATA = DATA.query(
    'user_id == @sample_user',
    local_dict={'sample_user': sample_user},
).compute()

# %%
# generate EDA reports
from pandas_profiling import ProfileReport

profile = ProfileReport(DATA.compute(),
                        title='Lumocity All Users EDA Report',
                        minimal=True)
profile.to_file("all_users_eda.html")

profile = ProfileReport(SAMPLE_USER_DATA,
                        title='Lumocity Sample User EDA Report',
                        explorative=True)
profile.to_file("sample_user_eda.html")

# %% -----------------------------
# a simple inference model in PPL
import pyro
import pyro.distributions as dist
import torch
from pyro.infer import MCMC, NUTS


# 1. define model
def model(data):
    """Toy generative model for response time and accuracy.

    Args:
        data: mapping that provides the observed values under the keys
            ``'rt'`` and ``'accuracy'``; both are expected to be 1-D tensors
            of the same length.

    Returns:
        The value of the observed ``'rt'`` sample site.
    """
    # HalfNormal takes a single scale argument (there is no location).
    z_1 = pyro.sample('z_1', dist.HalfNormal(5.))
    # Every sample site needs a unique name; this one used to reuse 'z_1'.
    z_2 = pyro.sample('z_2', dist.HalfNormal(10.))

    # Categorical needs a probability vector, not a category count; uniform
    # weights keep the "4 trial types / 20 age categories" reading.
    trial_type = pyro.sample('trial_type',
                             dist.Categorical(probs=torch.ones(4) / 4))
    age = pyro.sample('age', dist.Categorical(probs=torch.ones(20) / 20))

    # NOTE(review): `age` is a category index in 0..19, so `age > 70` is never
    # true as written -- confirm how the categories map to years.
    # torch.where replaces the Python `if`, which fails as soon as `age` holds
    # more than one value (NUTS enumerates discrete sites in parallel).
    is_older = age > 70
    rt_mean = torch.where(is_older, torch.tensor(1.), torch.tensor(2.))
    rt_std = torch.where(is_older, torch.tensor(5.), torch.tensor(10.))

    rt_obs = data['rt']
    # The plate declares the trials as conditionally independent, which keeps
    # the observation dimension apart from the enumeration dimensions.
    with pyro.plate('trials', len(rt_obs)):
        rt = pyro.sample('rt', dist.LogNormal(rt_mean, rt_std), obs=rt_obs)
        # NOTE(review): the original call passed no distribution, which
        # pyro.sample does not accept. Bernoulli(0.5) is a placeholder that
        # assumes accuracy is coded 0/1 -- replace with the intended
        # likelihood.
        accuracy = pyro.sample('accuracy',
                               dist.Bernoulli(probs=torch.tensor(0.5)),
                               obs=data['accuracy'])

    return rt


# 2. run MCMC
# NOTE(review): the original passed an undefined name `NB`; the sample user's
# rows are used instead, assuming they contain 'rt' and 'accuracy' columns --
# confirm.
observations = {
    column: torch.as_tensor(SAMPLE_USER_DATA[column].to_numpy(),
                            dtype=torch.float)
    for column in ('rt', 'accuracy')
}

# Pyro seeds its RNG globally and calls the warm-up length `warmup_steps`;
# `num_warmup` and `PRNGKey` belong to the NumPyro API.
pyro.set_rng_seed(0)
mcmc = MCMC(NUTS(model), warmup_steps=250, num_samples=1000)
mcmc.run(observations)

# 3. reports
# Pyro's MCMC prints its diagnostics table through summary().
mcmc.summary()

# %%
# simulate Ornstein–Uhlenbeck process
import numpy as np

dt = .1
max_t = 100
asymptote = 5
rate = .1
w = 1

ts = np.arange(0, max_t, dt)
X = np.zeros_like(ts)

for t in range(1, ts.shape[0]):
    x = X[t-1]
    # deterministic pull towards the asymptote
    dx = rate * (asymptote - x) * dt
    # Euler–Maruyama: a Wiener increment over dt has standard deviation
    # sqrt(dt), so the noise is scaled to stay consistent when dt changes.
    dw = w * np.sqrt(dt) * np.random.normal()
    X[t] = x + dx + dw

import matplotlib.pyplot as plt

plt.plot(ts, X)
plt.hlines(asymptote, xmin=0, xmax=max_t, color='r', linestyles='-.', lw=1)
plt.xlabel('time step')
plt.show()

# X_t