#%% [markdown]
# The following script merges the tasks hits and concepts hits into the
# efo_pubmed_hits dataset, and then stores the result in a csv file.

#%%
import pandas as pd

# input csv files
data_dir = "/Users/morteza/workspace/notebooks/efo/data/"
csv_path = f"{data_dir}efo_pubmed_hits.v2.csv"
tasks_csv_path = f"{data_dir}efo_pubmed_tasks_hits.v2.csv"
concepts_csv_path = f"{data_dir}efo_pubmed_concepts_hits.v2.csv"

# output csv file
output_csv_path = f"{data_dir}efo_pubmed_hits.v2_cleaned.csv"


def _merge_overriding(left, right, key):
    """Inner-join `right` onto `left` on `key`; `right` wins on shared columns.

    Any non-key column of `left` that also exists in `right` is dropped
    before merging, so the result carries only the `right` version of it.
    """
    stale_cols = [
        col for col in left.columns if col != key and col in right.columns
    ]
    return left.drop(columns=stale_cols).merge(right, on=key)


def merge_csv_files(tasks_concepts_hits_csv, concepts_hits_csv, tasks_hits_csv, output_csv):
    """Merge tasks, concepts, and tasks-concepts hits data files.

    The tasks-concepts table is joined with the tasks table on the `task`
    column and then with the concepts table on the `concept` column. Both
    joins use pandas' default inner join, so rows without a match in either
    table are not written to the output.

    When a non-key column appears in more than one input, the value from the
    table merged last is kept (concepts over tasks over tasks-concepts).

    Args:
        tasks_concepts_hits_csv: path of the csv with one row per
            task/concept pair; must contain `task` and `concept` columns.
        concepts_hits_csv: path of the per-concept csv; must contain a
            `concept` column.
        tasks_hits_csv: path of the per-task csv; must contain a `task`
            column.
        output_csv: path the merged table is written to.

    Returns:
        None. The merged table is written to `output_csv`.
    """
    # read inputs
    df = pd.read_csv(tasks_concepts_hits_csv)
    df_tasks = pd.read_csv(tasks_hits_csv)
    df_concepts = pd.read_csv(concepts_hits_csv)

    # Stale columns are removed *before* each merge instead of being suffixed
    # with '_original' and dropped afterwards. The suffix approach produced a
    # duplicate '<col>_original' column (a MergeError in recent pandas) when a
    # column was shared by all three inputs, and the substring-based cleanup
    # also deleted genuine input columns whose name contained '_original'.
    df = _merge_overriding(df, df_tasks, 'task')
    df = _merge_overriding(df, df_concepts, 'concept')

    # store merged outputs
    # NOTE: to_csv writes the row index as a leading unnamed column by
    # default; this is kept so the output layout stays the same as before.
    df.to_csv(output_csv)


# Only run the merge when executed as a script or notebook cell, so that
# importing this module does not touch the hard-coded data directory.
if __name__ == "__main__":
    merge_csv_files(csv_path, concepts_csv_path, tasks_csv_path, output_csv_path)