# notebooks / stayvers2019 / eda.py
# %%

import os

import dask.dataframe as dd
import pandas as pd

# Dataset to explore. Every candidate path is written relative to the home
# directory and expanded explicitly below, so the script is not tied to one
# particular user account.
# gamedata_path = '~/Downloads/gamedata_preprocessed.csv'
# gamedata_path = '~/Downloads/gamedata_original-v1.csv'
gamedata_path = '~/Downloads/data/Sample 1000 individuals/data/data_sim6007.csv'
gamedata_path = os.path.expanduser(gamedata_path)

# lazy load gamedata: DATA is a dask DataFrame, so the rows are not all pulled
# into memory here; later .compute()/.head() calls trigger the actual reads
DATA = dd.read_csv(gamedata_path)

# DATA.npartitions
# column names of the loaded dataset
DATA.columns
#%%
# Number of rows per user_id; the number of unique subjects is the length of
# this result.
DATA['user_id'].value_counts().compute()
# noted result: 1000 users

# Number of rows per distinct value of the `task` column.
DATA['task'].value_counts().compute()
# noted result: task is "1" or "2"
#%%

# Pick one user to inspect: the user_id found on the first row of the dataset.
# head(1) reads only the first partition and returns a pandas Series. The
# previous DATA.loc[0, 'user_id'] matched a row in every partition (read_csv
# restarts the index at 0 in each partition), so it scanned the whole file just
# to keep the first match.
sample_user = DATA['user_id'].head(1).iloc[0]

# Materialise every row of that user as a pandas DataFrame. The value is handed
# over through local_dict because dask's query() does not fully support
# resolving '@' names from the calling scope.
SAMPLE_USER_DATA = DATA.query(
    'user_id == @sample_user',
    local_dict={'sample_user': sample_user},
).compute()

#%%
# generate EDA reports

from pandas_profiling import ProfileReport

# Report over all users: the dask frame is materialised into pandas first and
# profiled with minimal=True, then written out as HTML.
profile = ProfileReport(
    DATA.compute(),
    minimal=True,
    title='Lumocity All Users EDA Report',
)
profile.to_file("all_users_eda.html")

# Report over the single sampled user, profiled with explorative=True.
# `profile` is rebound here, so afterwards it refers to this second report.
profile = ProfileReport(
    SAMPLE_USER_DATA,
    explorative=True,
    title='Lumocity Sample User EDA Report',
)
profile.to_file("sample_user_eda.html")