#%%
# pip install rdflib

from itertools import product

from rdflib import OWL, Graph
from rdflib.util import guess_format
from owlready2 import *
# Imported after the wildcard import on purpose, so that rdflib's names are
# the ones bound in case `owlready2` exports identically named objects.
from rdflib import Graph, URIRef
from rdflib.namespace import RDFS

owl_path = "file:///Users/morteza/workspace/ontologies/efo.owl"
efo = get_ontology(owl_path).load()

# Extract class names of the tasks and concepts:
#tasks = [t.name for t in efo.search(subclass_of = efo.Task)]
#concepts = [c.name for c in efo.search(subclass_of = efo.ExecutiveFunction)]

# The following code does the same, but queries the RDFS labels defined for
# tasks and concepts.
# To query all descendants use "rdfs:subClassOf*" instead.


def query_labels(graph, parent_class):
    """Return the rdfs:label values of the direct subclasses of a class.

    Args:
        graph: Object exposing a SPARQL ``query()`` method; in this script it
            is the graph returned by ``default_world.as_rdflib_graph()``.
        parent_class: Local name of the parent class in the executive
            functions ontology namespace. A single leading ":" is stripped,
            so both "Task" and ":Task" are accepted.

    Returns:
        A list with one entry per result row: the ``?label`` binding converted
        to a native Python value via ``toPython()``. A subclass that carries
        several labels contributes one entry per label.
    """
    class_name = parent_class[1:] if parent_class.startswith(":") else parent_class

    # The rdfs prefix is declared explicitly so the query does not depend on
    # the namespace bindings the graph happens to carry.
    query = f"""
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix : <http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#>

    SELECT ?label
    WHERE {{
    ?task rdfs:subClassOf :{class_name};
          rdfs:label ?label
    }}
    """

    # Each result row has a single projected column (?label); take it and
    # convert it to a Python value.
    return [row[0].toPython() for row in graph.query(query)]


# Prepare the RDFLib graph for SPARQL queries.
graph = default_world.as_rdflib_graph()

tasks = query_labels(graph, "Task")
concepts = query_labels(graph, "ExecutiveFunction")

print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}")

#%%
# Goal: create rows with the following data: <task>,<concept>,<hits>,<task_total>
# NOTE: for now the hit count of every (task, concept) pair is only printed.

from metapub import PubMedFetcher

fetcher = PubMedFetcher()

# product() yields the pairs in the same order as nested task/concept loops,
# without materializing the whole cartesian product first.
for task, concept in product(tasks, concepts):
    query = f"({task}[TIAB]) AND ({concept}[TIAB])"
    pmids = fetcher.pmids_for_query(query=query, retmax=1000000, since='2010', pmc_only=True)
    print(f"{query}: {len(pmids)}")