Newer
Older
notebooks / efo / efo_data_cleanup.py
#%% [markdown]
# This script merges the task hits and concept hits into the efo_pubmed_hits dataset and then
# stores the result in a CSV file.
#%%

import pandas as pd

# Directory that holds the input hit tables and receives the cleaned output.
data_dir = "/Users/morteza/workspace/notebooks/efo/data/"

# Inputs: the base hits table plus the separate task and concept hit tables.
csv_path = data_dir + "efo_pubmed_hits.v2.csv"
tasks_csv_path = data_dir + "efo_pubmed_tasks_hits.v2.csv"
concepts_csv_path = data_dir + "efo_pubmed_concepts_hits.v2.csv"

# Output: where the merged table is written.
output_csv_path = data_dir + "efo_pubmed_hits.v2_cleaned.csv"

def _merge_preferring_right(left, right, key):
    """Inner-join `right` onto `left` on `key`, keeping `right`'s copy of any shared column."""
    # Remove the clashing columns from the left frame up front rather than
    # suffixing them and filtering by name afterwards. Reusing one suffix
    # across chained merges can yield duplicate column labels (pandas 2.x
    # raises MergeError for that), and a substring filter on the suffix would
    # also discard genuine input columns whose name merely contains it.
    clashing = [col for col in left.columns if col != key and col in right.columns]
    return left.drop(columns=clashing).merge(right, on=key)


def merge_csv_files(tasks_concepts_hits_csv, concepts_hits_csv, tasks_hits_csv, output_csv) -> pd.DataFrame:
    """Merge the task hits and concept hits into the base hits table and save it.

    The base table is inner-joined with the tasks table on the ``task`` column
    and then with the concepts table on the ``concept`` column, so base rows
    without a match in either table are not part of the result. When a
    non-key column name exists on both sides of a join, the value from the
    table being joined in (tasks, then concepts) replaces the earlier one.

    Note the parameter order: the concepts file comes *before* the tasks file.

    Args:
        tasks_concepts_hits_csv: CSV with the base hits; must contain the
            ``task`` and ``concept`` columns.
        concepts_hits_csv: CSV with the concept hits; must contain ``concept``.
        tasks_hits_csv: CSV with the task hits; must contain ``task``.
        output_csv: Destination the merged table is written to.

    Returns:
        The merged DataFrame that was written to ``output_csv``.
    """
    # read inputs
    df = pd.read_csv(tasks_concepts_hits_csv)
    df_tasks = pd.read_csv(tasks_hits_csv)
    df_concepts = pd.read_csv(concepts_hits_csv)

    # join tasks first, then concepts; later tables win on shared columns
    df = _merge_preferring_right(df, df_tasks, 'task')
    df = _merge_preferring_right(df, df_concepts, 'concept')

    # store merged outputs (pandas' default also writes the row index as the
    # first, unnamed column; kept so the output layout stays unchanged)
    df.to_csv(output_csv)

    return df

# Run the merge on the configured files. Arguments are passed by keyword
# because the function takes the concepts file before the tasks file.
merge_csv_files(
    tasks_concepts_hits_csv=csv_path,
    concepts_hits_csv=concepts_csv_path,
    tasks_hits_csv=tasks_csv_path,
    output_csv=output_csv_path,
)