diff --git a/efo/efo_data_cleanup.py b/efo/efo_data_cleanup.py index 82149f0..d54e3e0 100644 --- a/efo/efo_data_cleanup.py +++ b/efo/efo_data_cleanup.py @@ -7,12 +7,13 @@ # input csv files data_dir = "/Users/morteza/workspace/notebooks/efo/data/" -csv_path = f"{data_dir}efo_pubmed_hits.v2.csv" -tasks_csv_path = f"{data_dir}efo_pubmed_tasks_hits.v2.csv" -concepts_csv_path = f"{data_dir}efo_pubmed_concepts_hits.v2.csv" +data_version = "20200114" +csv_path = f"{data_dir}efo_pubmed_hits.{data_version}.csv" +tasks_csv_path = f"{data_dir}efo_pubmed_tasks_hits.{data_version}.csv" +concepts_csv_path = f"{data_dir}efo_pubmed_concepts_hits.{data_version}.csv" # output csv file -output_csv_path = f"{data_dir}efo_pubmed_hits.v2_cleaned.csv" +output_csv_path = f"{data_dir}efo_pubmed_hits_preproc.{data_version}.csv" def merge_csv_files(tasks_concepts_hits_csv, concepts_hits_csv, tasks_hits_csv): """merge tasks, concepts, and tasks-concepts hits data files""" @@ -26,7 +27,7 @@ # drop backup columns to_drop_cols = df.filter(like = '_original').columns - df.drop(to_drop_cols, axis='columns', inplace=True) + #df.drop(to_drop_cols, axis='columns', inplace=True) return df @@ -46,11 +47,11 @@ df['task_suffixgame_concept_hits'] ) - to_drop_cols = df.filter(like = ['suffixtest','suffixtask','suffixgame').columns - df.drop(to_drop_cols, axis='columns', inplace=True) + to_drop_cols = df.filter(like = ['suffixtest','suffixtask','suffixgame']).columns + #df.drop(to_drop_cols, axis='columns', inplace=True) return df -merge_csv_files(csv_path, concepts_csv_path, tasks_csv_path, output_csv_path).to_csv(output_csv_path) +merge_csv_files(csv_path, concepts_csv_path, tasks_csv_path).to_csv(output_csv_path) #WARNING combine_hits(output_csv_path).to_csv(output_csv_path) \ No newline at end of file