# %%
# Exploratory analysis of the game-play data set: lazily load the CSV with
# dask, look at basic per-column counts, pull out one user's rows, and write
# HTML EDA reports for both the full data set and the single-user sample.
import dask.dataframe as dd
import pandas as pd

# Alternative input files (kept for reference):
# gamedata_path = '~/Downloads/gamedata_preprocessed.csv'
# gamedata_path = '~/Downloads/gamedata_original-v1.csv'
gamedata_path = '/Users/morteza/Downloads/data/Sample 1000 individuals/data/data_sim6007.csv'

# lazy load gamedata; the full file is only read once a result is requested
# (e.g. via .compute() or .head()).
DATA = dd.read_csv(gamedata_path)
# df.npartitions
DATA.columns

#%%
# number of unique subjects
DATA['user_id'].value_counts().compute()  # 1000 users
DATA['task'].value_counts().compute()  # task "1" or "2"

#%%
# We only need a single user_id to build the sample. A label lookup such as
# DATA.loc[0, 'user_id'] is evaluated against every partition (the index
# restarts at 0 in each partition of a CSV-backed frame), which reads the
# entire file just to keep the first hit. head(1) only touches the first
# partition and yields the same value: the user_id of the first row.
sample_user = DATA['user_id'].head(1).iloc[0]

# dask cannot see the caller's local variables when evaluating the query
# string, so the value referenced by @sample_user is passed via local_dict.
SAMPLE_USER_DATA = DATA.query(
    'user_id == @sample_user',
    local_dict={'sample_user': sample_user},
).compute()

#%%
# generate EDA reports
# NOTE(review): pandas_profiling appears to have been renamed ydata_profiling
# upstream -- confirm and consider migrating the import.
from pandas_profiling import ProfileReport

# Full data set: materialised in memory as a pandas DataFrame; the minimal
# profile mode is used for this (large) frame.
profile = ProfileReport(DATA.compute(), title='Lumocity All Users EDA Report', minimal=True)
profile.to_file("all_users_eda.html")

# Single-user sample: already a pandas DataFrame, profiled in explorative mode.
profile = ProfileReport(SAMPLE_USER_DATA, title='Lumocity Sample User EDA Report', explorative=True)
profile.to_file("sample_user_eda.html")