# notebooks/efo/fetch_efo_ontology_from_pubmed.py
#%% [markdown]
# The following code queries the EFO ontology and retrieves the tasks and concepts that carry human-readable labels, then searches PubMed Central for the number of articles mentioning each task-concept pair.

#%%
# pip install rdflib owlready2 pandas
import time

import pandas as pd
from owlready2 import default_world, get_ontology

owl_path = "file:///Users/morteza/workspace/ontologies/efo.owl"
owl_prefix = "http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#"

efo = get_ontology(owl_path).load()

# extract the class names of tasks and concepts
#tasks = [t.name for t in efo.search(subclass_of = efo.Task)]
#concepts = [c.name for c in efo.search(subclass_of = efo.ExecutiveFunction)]

# The following function does the same, but queries the RDFS labels defined for tasks and concepts.
# "rdfs:subClassOf*" matches all descendants; use "rdfs:subClassOf" for direct subclasses only.

def query_labels(graph, parent_class):
    """Return the rdfs:label of every descendant of the given class."""
    class_name = parent_class[1:] if parent_class.startswith(":") else parent_class
    query = f"""
    PREFIX : <{owl_prefix}>

    SELECT ?label
    WHERE {{
        ?task rdfs:subClassOf* :{class_name} ;
              rdfs:label ?label .
    }}
    """

    # select all rdfs:labels, flatten the result rows, and convert them to Python strings
    rows = graph.query(query)
    return [label.toPython() for row in rows for label in row]

# prepare an RDFLib graph for SPARQL queries
graph = default_world.as_rdflib_graph()

tasks = query_labels(graph, "Task")
concepts = query_labels(graph, "ExecutiveFunction")

print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}")

# rough runtime estimate, assuming ~1 second per PubMed query
time_estimate = len(tasks) * len(concepts)

print(f"It takes ~{time_estimate}s to query PubMed Central for these tasks and concepts.")

#%%
# Goal: write one row per task-concept pair with the following columns:
# <task>,<concept>,<hits>,<suffixed_hits>,<concept_hits>,<timestamp_ms>
# Partial lookup: PubMed is queried only if a task-concept combination has not already been fetched.

csv_file = "/Users/morteza/workspace/notebooks/efo/data/efo_taskconcept_pubmed_hits.csv"

# pip install metapub
from metapub import PubMedFetcher
fetcher = PubMedFetcher()

def query_pubmed_for_task(task, concept):
    """Count PubMed articles that mention the task (with common suffixes) and the concept."""
    suffixes = ['', ' task', ' game', ' test']
    task_queries = [task + s for s in suffixes]

    suffixed_hits = []
    hits = []
    for tq in task_queries:
        query = f"({tq}[TIAB]) AND ({concept}[TIAB])"
        pmids = fetcher.pmids_for_query(query=query, retmax=1000000, pmc_only=False)
        suffixed_hits += pmids  # note: the same PMID may be counted under several suffixes
        if tq == task:
            hits = pmids  # hits for the bare task name, without any suffix

    return (hits, suffixed_hits)
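
# As a quick sanity check, a single pair can be queried directly; the task and
# concept labels below are illustrative stand-ins for labels returned by
# query_labels (requires network access to NCBI):
#
#   hits, suffixed_hits = query_pubmed_for_task("Stroop", "inhibition")
#   print(f"plain hits: {len(hits)}, suffixed hits: {len(suffixed_hits)}")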



try:
    data = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    data = pd.DataFrame({'task': [], 'concept': []})

with open(csv_file, "a", buffering=1) as csv_out:
    for task, concept in [(task, concept) for task in tasks for concept in concepts]:
        previously_stored = data[(data.task == task) & (data.concept == concept)]
        if previously_stored.empty:
            millis = int(round(time.time() * 1000))

            hits, suffixed_hits = query_pubmed_for_task(task, concept)

            # baseline: how many articles mention the concept at all
            concept_query = f"({concept}[TIAB])"
            concept_hits = fetcher.pmids_for_query(query=concept_query, retmax=1000000, pmc_only=False)

            csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n'

            print(csv_line)
            csv_out.write(csv_line)

#%% v2: creates a CSV of hit counts, with one column for every hit-query variant

#TODO reduce the number of queries (e.g., run the repeated task-only and concept-only queries once)

def build_queries(task, concept):
    return {
    'task_concept_ef_hits':             f'("{task}"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
    'task_suffixtask_concept_ef_hits':  f'("{task} task"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
    'task_suffixtest_concept_ef_hits':  f'("{task} test"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
    'task_suffixgame_concept_ef_hits':  f'("{task} game"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
    'task_concept_hits':                f'("{task}"[TIAB]) AND ("{concept}"[TIAB])',
    'task_suffixtask_concept_hits':     f'("{task} task"[TIAB]) AND ("{concept}"[TIAB])',
    'task_suffixtest_concept_hits':     f'("{task} test"[TIAB]) AND ("{concept}"[TIAB])',
    'task_suffixgame_concept_hits':     f'("{task} game"[TIAB]) AND ("{concept}"[TIAB])',
    'concept_hits':                     f'("{concept}"[TIAB])',
    'concept_ef_hits':                  f'("{concept}"[TIAB]) AND ("executive function")',
    'task_hits':                        f'("{task}"[TIAB])',
    'task_suffixtask_hits':             f'("{task} task"[TIAB])',
    'task_suffixtest_hits':             f'("{task} test"[TIAB])',
    'task_suffixgame_hits':             f'("{task} game"[TIAB])',
    'task_ef_hits':                     f'("{task}"[TIAB]) AND ("executive function")',
    'task_suffixtask_ef_hits':          f'("{task} task"[TIAB]) AND ("executive function")',
    'task_suffixgame_ef_hits':          f'("{task} game"[TIAB]) AND ("executive function")',
    'task_suffixtest_ef_hits':          f'("{task} test"[TIAB]) AND ("executive function")'
    }
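
# For example (the task and concept names here are illustrative, not from the ontology):
#   build_queries("Stroop", "inhibition")['task_concept_hits']
#   == '("Stroop"[TIAB]) AND ("inhibition"[TIAB])'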

def csv_header():
    headers = ['task','concept','timestamp_ms']
    headers += build_queries('','').keys()
    return ','.join(headers)

def query_pubmed_and_create_csv(tasks, concepts, csv_file):

    try:
        data = pd.read_csv(csv_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        data = pd.DataFrame({'task': [], 'concept': []})

    with open(csv_file, "a+", buffering=1) as csv_out:
        csv_out.seek(0)  # "a+" opens positioned at the end; rewind before checking for emptiness
        if len(csv_out.read()) == 0:
            print("Empty csv found, generating csv headers...")
            csv_out.write(csv_header() + '\n')

        for task, concept in [(task, concept) for task in tasks for concept in concepts]:
            previously_stored = data[(data.task == task) & (data.concept == concept)]
            if previously_stored.empty:
                millis = int(round(time.time() * 1000))
                csv_cells = [task, concept, str(millis)]
                for qkey, query in build_queries(task, concept).items():
                    qhits = fetcher.pmids_for_query(query=query, retmax=1000000, pmc_only=False)
                    csv_cells.append(str(len(qhits)))

                csv_line = ','.join(csv_cells) + '\n'

                print(csv_line)
                csv_out.write(csv_line)

query_pubmed_and_create_csv(tasks, concepts, "/Users/morteza/workspace/notebooks/efo/data/efo_pubmed_hits.v2.csv")
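
#%% [markdown]
# Because previously stored task-concept pairs are skipped, a long run can be
# interrupted and resumed: re-running this cell continues from where the CSV left off.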