Newer
Older
notebooks / stayvers2019 / eda.py
# %%

import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from IPython.display import display
from pyro.distributions.torch import Categorical, Normal

# gamedata_path = '~/Downloads/gamedata_preprocessed.csv'
# gamedata_path = '~/Downloads/gamedata_original-v1.csv'
gamedata_path = '/Users/morteza/Downloads/data/Sample 1000 individuals/data/data_sim6007.csv'

# lazy load gamedata
DATA = dd.read_csv(gamedata_path)

# df.npartitions
DATA.columns
#%%
with ProgressBar():
  # number of unique subjects
  display(DATA['user_id'].value_counts().compute())
  # 1000 users

  display(DATA['task'].value_counts().compute())
  # task "1" or "2"
#%%


# dask runs queries in multiple thread; we only want one user_id though
sample_user = DATA.loc[0,'user_id'].compute().iloc[0]

SAMPLE_USER_DATA = DATA.query('user_id == @sample_user', local_dict={'sample_user': sample_user}).compute()

#%%
# generate EDA reports

from pandas_profiling import ProfileReport

profile = ProfileReport(DATA.compute(), title='Lumocity All Users EDA Report', minimal=True)
profile.to_file("all_users_eda.html")


profile = ProfileReport(SAMPLE_USER_DATA, title='Lumocity Sample User EDA Report', explorative=True)
profile.to_file("sample_user_eda.html")

#%% -----------------------------
# a simple inference model in PPL

import pyro
import pyro.distributions as dist

# 1. define model
def model(data):
  z_1 = pyro.sample('z_1', dist.HalfNormal(0,5))
  z_2 = pyro.sample('z_1', dist.HalfNormal(0,10))
  trial_type = pyro.sample('trial_type', dist.Categorical(4)
  age = pyro.sample('age', dist.Categorical(20))
  if age > 70:
    rt_mean = 1
    rt_std = 5
  else
    rt_mean = 2
    rt_std = 10
  rt = pyro.sample('rt', dist.LogNormal(mean_rt,rt_std), obs=data['rt'])
  accuracy = pyro.sample('accuracy', obs=data['accuracy'])
  return rt


# 2. run MCMC
mcmc = MCMC(NUTS(model), num_warmup=250, num_samples=1000)
mcmc.run(random.PRNGKey(0), NB)

# 3. reports
mcmc.print_summary()

#%%

# simulate Ornstein–Uhlenbeck process

import numpy as np
dt = .1
max_t = 100

asymptote = 5
rate = .1
w = 1

ts = np.arange(0, max_t, dt)
X = np.zeros_like(ts)

for t in range(1, ts.shape[0]):
  x = X[t-1]
  dx = rate * (asymptote - x) * dt
  dw = w * np.random.normal()
  X[t] = X[t-1] + dx + dw

import matplotlib.pyplot as plt

plt.plot(ts, X)
plt.hlines(asymptote, xmin=0, xmax=max_t, color='r', linestyles='-.',lw=1)
plt.xlabel('time step')
plt.show()
# X_t