diff --git a/efo/fetch_efo_ontology_from_pubmed.py b/efo/fetch_efo_ontology_from_pubmed.py index afe98b0..1b038ea 100644 --- a/efo/fetch_efo_ontology_from_pubmed.py +++ b/efo/fetch_efo_ontology_from_pubmed.py @@ -1,9 +1,14 @@ +#%% [markdown] +# The following code queries the EFO ontology and retrieves tasks and concepts that are assigned with readable labels. Then search PubMed Central for the number of articles on each task-concept pair. + #%% # pip install rdflib from rdflib import OWL, Graph from rdflib.util import guess_format from owlready2 import * +import time + from rdflib import URIRef from rdflib import Graph @@ -27,12 +32,15 @@ SELECT ?label WHERE {{ - ?task rdfs:subClassOf :{class_name}; + ?task rdfs:subClassOf* :{class_name}; rdfs:label ?label }} """ - # select the first rdfs:label and convert it to python string - return [labels[0].toPython() for labels in graph.query(query)] + + # select all rdfs:labels, flatten the list of labels, and convert them to python string + labels = [labels for labels in graph.query(query)] + flatten_labels = [l.toPython() for ll in labels for l in ll] + return flatten_labels # preapre RDFLib graph for SPARQL queries graph = default_world.as_rdflib_graph() @@ -40,9 +48,13 @@ tasks = query_labels(graph, "Task") concepts = query_labels(graph, "ExecutiveFunction") +print(tasks) print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}") +time_estimate = len(tasks) * len(concepts) + +print(f"it takes ~ {time_estimate}s to query PubMed Central for these tasks and concepts.") #%% # goal: create rows with the following data: ,,, @@ -50,7 +62,36 @@ from metapub import PubMedFetcher fetcher = PubMedFetcher() -for task, concept in [(task, concept) for task in tasks for concept in concepts]: - query = f"({task}[TIAB]) AND ({concept}[TIAB])" - pmids = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, since='2010', pmc_only=True) - print(f"{query}: {len(pmids)}") +#tasks = ["Reversal Learning"] +#concepts = ["Behavioral Control"] + +def 
query_pubmed_for_task(task, concept): + suffixes = ['',' task',' game',' test'] + task_queries = map(lambda s: task+s, suffixes) + + suffixed_hits = [] + hits = [] + for tq in task_queries: + query = f"({tq}[TIAB]) AND ({concept}[TIAB])" + pmids = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, pmc_only=False) + suffixed_hits += pmids + if tq == task: hits = pmids + + return (hits, suffixed_hits) + +# main loop + +with open("data/efo_taskconcept_pubmed_hits.csv", "a+") as csv: + csv.write('task,concept,hits,suffixed_hits,concept_hits,timestamp_ms\n') + for task, concept in [(task, concept) for task in tasks for concept in concepts]: + millis = int(round(time.time() * 1000)) + + hits, suffixed_hits = query_pubmed_for_task(task, concept) + + concept_query = f"({concept}[TIAB])" + concept_hits = fetcher.pmids_for_query(query=f'{concept_query}', retmax=1000000, pmc_only=False) + + csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n' + + print(csv_line) + csv.write(csv_line) diff --git a/efo/fetch_efo_ontology_from_pubmed.py b/efo/fetch_efo_ontology_from_pubmed.py index afe98b0..1b038ea 100644 --- a/efo/fetch_efo_ontology_from_pubmed.py +++ b/efo/fetch_efo_ontology_from_pubmed.py @@ -1,9 +1,14 @@ +#%% [markdown] +# The following code queries the EFO ontology and retrieves tasks and concepts that are assigned with readable labels. 
Then search PubMed Central for the number of articles on each task-concept pair. + #%% # pip install rdflib from rdflib import OWL, Graph from rdflib.util import guess_format from owlready2 import * +import time + from rdflib import URIRef from rdflib import Graph @@ -27,12 +32,15 @@ SELECT ?label WHERE {{ - ?task rdfs:subClassOf :{class_name}; + ?task rdfs:subClassOf* :{class_name}; rdfs:label ?label }} """ - # select the first rdfs:label and convert it to python string - return [labels[0].toPython() for labels in graph.query(query)] + + # select all rdfs:labels, flatten the list of labels, and convert them to python string + labels = [labels for labels in graph.query(query)] + flatten_labels = [l.toPython() for ll in labels for l in ll] + return flatten_labels # preapre RDFLib graph for SPARQL queries graph = default_world.as_rdflib_graph() @@ -40,9 +48,13 @@ tasks = query_labels(graph, "Task") concepts = query_labels(graph, "ExecutiveFunction") +print(tasks) print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}") +time_estimate = len(tasks) * len(concepts) + +print(f"it takes ~ {time_estimate}s to query PubMed Central for these tasks and concepts.") #%% # goal: create rows with the following data: ,,, @@ -50,7 +62,36 @@ from metapub import PubMedFetcher fetcher = PubMedFetcher() -for task, concept in [(task, concept) for task in tasks for concept in concepts]: - query = f"({task}[TIAB]) AND ({concept}[TIAB])" - pmids = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, since='2010', pmc_only=True) - print(f"{query}: {len(pmids)}") +#tasks = ["Reversal Learning"] +#concepts = ["Behavioral Control"] + +def query_pubmed_for_task(task, concept): + suffixes = ['',' task',' game',' test'] + task_queries = map(lambda s: task+s, suffixes) + + suffixed_hits = [] + hits = [] + for tq in task_queries: + query = f"({tq}[TIAB]) AND ({concept}[TIAB])" + pmids = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, pmc_only=False) + suffixed_hits += pmids + if tq == task: hits = pmids 
+ + return (hits, suffixed_hits) + +# main loop + +with open("data/efo_taskconcept_pubmed_hits.csv", "a+") as csv: + csv.write('task,concept,hits,suffixed_hits,concept_hits,timestamp_ms\n') + for task, concept in [(task, concept) for task in tasks for concept in concepts]: + millis = int(round(time.time() * 1000)) + + hits, suffixed_hits = query_pubmed_for_task(task, concept) + + concept_query = f"({concept}[TIAB])" + concept_hits = fetcher.pmids_for_query(query=f'{concept_query}', retmax=1000000, pmc_only=False) + + csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n' + + print(csv_line) + csv.write(csv_line) diff --git a/py/vscode_jupyter_test.py b/py/vscode_jupyter_test.py deleted file mode 100644 index ccecc43..0000000 --- a/py/vscode_jupyter_test.py +++ /dev/null @@ -1,4 +0,0 @@ -#%% -print("test") - -#%%