notebooks/efo/fetch_efo_ontology_from_pubmed.py at d7d285ff20d49a046ab4814637606ffe231b6df0

Fork: 0

morteza / notebooks

Find file

Newer

Older

notebooks / efo / fetch_efo_ontology_from_pubmed.py

Morteza Ansarinia on 9 Jan 2020 1 KB a new script to query concepts and tasks from EFO ontology and then count pubmed papers for the combinations

Raw Blame History

#%%
# pip install rdflib
from rdflib import OWL, Graph
from rdflib.util import guess_format
from owlready2 import *

from rdflib import URIRef

from rdflib import Graph
from rdflib.namespace import RDFS

# Location of the EFO (executive functions ontology) OWL file, given as a
# file:// URL. NOTE(review): this is an absolute path on the author's machine;
# adjust it before running elsewhere.
owl_path = "file:///Users/morteza/workspace/ontologies/efo.owl"

# Load the ontology (get_ontology is presumably provided by the owlready2
# star import above).
efo = get_ontology(owl_path).load()

# extract class names of the tasks and concepts
#tasks = [t.name for t in efo.search(subclass_of = efo.Task)]
#concepts = [c.name for c in efo.search(subclass_of = efo.ExecutiveFunction)]

# instead of class names, the following code queries the RDFS labels defined for tasks and concepts.
# it matches direct subclasses only; to query all descendants use "rdfs:subClassOf*" instead.

def query_labels(graph, parent_class):
    """Return the rdfs:label values of the direct subclasses of an ontology class.

    Args:
        graph: object exposing an rdflib-style ``query(sparql)`` method whose
            result can be iterated row by row.
        parent_class: local name of the parent class inside the executive
            functions ontology namespace, with or without a leading ``:``
            (``"Task"`` and ``":Task"`` are equivalent).

    Returns:
        A list with one entry per matched (subclass, label) row; each entry is
        the row's ``?label`` binding converted with ``toPython()``.
    """
    class_name = parent_class[1:] if parent_class.startswith(":") else parent_class
    # rdfs: is declared explicitly so the query does not depend on which
    # prefixes happen to be bound on the graph it is run against.
    query = f"""
    prefix : <http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?label
    WHERE {{
    ?task rdfs:subClassOf :{class_name};
            rdfs:label ?label
    }}
    """
    # each row carries a single binding (?label); convert it to a Python value
    return [row[0].toPython() for row in graph.query(query)]

# prepare an RDFLib graph for SPARQL queries
# (default_world is presumably owlready2's global world, via the star import)
graph = default_world.as_rdflib_graph()

# labels of the subclasses of the Task and ExecutiveFunction classes
tasks = query_labels(graph, "Task")
concepts = query_labels(graph, "ExecutiveFunction")


# quick sanity check of how many labels were retrieved for each group
print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}")


#%%
# goal: create rows with the following data: <task>,<concept>,<hits>,<task_total>

from itertools import product

from metapub import PubMedFetcher

fetcher = PubMedFetcher()

# Query PubMed once per (task, concept) combination and print the number of
# PMIDs returned. product() walks the pairs in the same task-major order as a
# nested loop, without first building the full list of pairs.
for task, concept in product(tasks, concepts):
    # both terms are restricted to the title/abstract field ([TIAB])
    query = f"({task}[TIAB]) AND ({concept}[TIAB])"
    # NOTE(review): len(pmids) is used as the hit count; the search service may
    # cap how many IDs it returns regardless of retmax -- confirm the counts
    # are not truncated for very frequent terms.
    pmids = fetcher.pmids_for_query(query=query, retmax=1000000, since='2010', pmc_only=True)
    print(f"{query}: {len(pmids)}")