#%%
# Load the EFO ontology and collect the RDFS labels of its tasks and concepts.
#
# Requirements: pip install rdflib owlready2
#
# NOTE: the import order below is intentional. `from owlready2 import *` may
# shadow names imported earlier, so the rdflib names used afterwards are
# (re-)imported after the star import.
from rdflib import OWL, Graph
from rdflib.util import guess_format
from owlready2 import *

from rdflib import URIRef

from rdflib import Graph
from rdflib.namespace import RDFS

owl_path = "file:///Users/morteza/workspace/ontologies/efo.owl"

efo = get_ontology(owl_path).load()

# Alternative: extract the class names (not labels) of the tasks and concepts
# tasks = [t.name for t in efo.search(subclass_of = efo.Task)]
# concepts = [c.name for c in efo.search(subclass_of = efo.ExecutiveFunction)]

# The code below does the same, but queries the RDFS labels defined for tasks
# and concepts. To query all descendants use "rdfs:subClassOf*" instead.


def query_labels(graph, parent_class, base_iri=None):
    """Return the rdfs:label values of the direct subclasses of a class.

    Args:
        graph: object exposing a SPARQL ``query(str)`` method (here, the
            RDFLib graph obtained from ``default_world.as_rdflib_graph()``).
        parent_class: local name of the parent class, with or without a
            leading ":" (e.g. "Task" or ":Task").
        base_iri: IRI bound to the default ``:`` prefix in the query. When
            omitted, the base IRI of the loaded ``efo`` ontology is used.

    Returns:
        A list with one entry per result row: the first selected variable
        (``?label``) converted to a native Python value via ``toPython()``.
    """
    class_name = parent_class[1:] if parent_class.startswith(":") else parent_class

    # NOTE(review): the original query declared "prefix :" without an IRI,
    # which is not valid SPARQL. We assume the classes live in the ontology's
    # own namespace -- confirm against the OWL file.
    if base_iri is None:
        base_iri = efo.base_iri

    query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX : <{base_iri}>

    SELECT ?label
    WHERE {{
        ?task rdfs:subClassOf :{class_name};
              rdfs:label ?label
    }}
    """
    # each result row holds the selected variables in order; take ?label
    return [row[0].toPython() for row in graph.query(query)]


# prepare the RDFLib graph for SPARQL queries
graph = default_world.as_rdflib_graph()

tasks = query_labels(graph, "Task")
concepts = query_labels(graph, "ExecutiveFunction")


print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}")


#%%
# Goal: create one row of data per (task, concept) pair.
# NOTE(review): the column list of the original comment was lost (",,,");
# confirm the intended columns.

from itertools import product

from metapub import PubMedFetcher

fetcher = PubMedFetcher()

# product() yields the pairs lazily, in the same order as a nested
# "for task in tasks: for concept in concepts" loop
for task, concept in product(tasks, concepts):
    # search for both labels in the title/abstract ([TIAB]) fields
    query = f"({task}[TIAB]) AND ({concept}[TIAB])"
    pmids = fetcher.pmids_for_query(query=query, retmax=1000000, since='2010', pmc_only=True)
    print(f"{query}: {len(pmids)}")