diff --git a/efo/fetch_efo_ontology_from_pubmed.py b/efo/fetch_efo_ontology_from_pubmed.py
index 1b038ea..4b2e7c7 100644
--- a/efo/fetch_efo_ontology_from_pubmed.py
+++ b/efo/fetch_efo_ontology_from_pubmed.py
@@ -48,8 +48,6 @@
 tasks = query_labels(graph, "Task")
 concepts = query_labels(graph, "ExecutiveFunction")
 
-print(tasks)
-
 print(f"Tasks: {len(tasks)}, Concepts: {len(concepts)}")
 
 time_estimate = len(tasks) * len(concepts)
@@ -65,6 +63,29 @@
 #tasks = ["Reversal Learning"]
 #concepts = ["Behavioral Control"]
 
+def build_queries(task, concept):
+    return {
+        'task_concept_ef_hits': f'("{task}"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
+        'task_suffixtask_concept_ef_hits': f'("{task} task"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
+        'task_suffixtest_concept_ef_hits': f'("{task} test"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
+        'task_suffixgame_concept_ef_hits': f'("{task} game"[TIAB]) AND ("{concept}"[TIAB]) AND ("executive function")',
+        'task_concept_hits': f'("{task}"[TIAB]) AND ("{concept}"[TIAB])',
+        'task_suffixtask_concept_hits': f'("{task} task"[TIAB]) AND ("{concept}"[TIAB])',
+        'task_suffixtest_concept_hits': f'("{task} test"[TIAB]) AND ("{concept}"[TIAB])',
+        'task_suffixgame_concept_hits': f'("{task} game"[TIAB]) AND ("{concept}"[TIAB])',
+        'concept_hits': f'("{concept}"[TIAB])',
+        'concept_ef_hits': f'("{concept}"[TIAB]) AND ("executive function")',
+        'task_hits': f'("{task}"[TIAB])',
+        'task_suffixtask_hits': f'("{task} task"[TIAB])',
+        'task_suffixtest_hits': f'("{task} test"[TIAB])',
+        'task_suffixgame_hits': f'("{task} game"[TIAB])',
+        'task_ef_hits': f'("{task}"[TIAB]) AND ("executive function")',
+        'task_suffixtask_ef_hits': f'("{task} task"[TIAB]) AND ("executive function")',
+        'task_suffixgame_ef_hits': f'("{task} game"[TIAB]) AND ("executive function")',
+        'task_suffixtest_ef_hits': f'("{task} test"[TIAB]) AND ("executive function")'
+    }
+
+
 def query_pubmed_for_task(task, concept):
     suffixes = ['',' task',' game',' test']
     task_queries = map(lambda s: task+s, suffixes)
@@ -81,7 +102,7 @@
 
 # main loop
 
-with open("data/efo_taskconcept_pubmed_hits.csv", "a+") as csv:
+with open("data/efo_taskconcept_pubmed_hits.csv", "a", buffering=1) as csv:
     csv.write('task,concept,hits,suffixed_hits,concept_hits,timestamp_ms\n')
     for task, concept in [(task, concept) for task in tasks for concept in concepts]:
         millis = int(round(time.time() * 1000))
@@ -95,3 +116,29 @@
 
         print(csv_line)
         csv.write(csv_line)
+
+
+#%%
+# Partial lookup: only queries PubMed if a task-concept combination has not already been fetched.
+import pandas as pd
+
+csv_file = "data/efo_taskconcept_pubmed_hits.csv"
+
+fetcher = PubMedFetcher()
+data = pd.read_csv(csv_file)
+
+with open("data/efo_taskconcept_pubmed_hits.csv", "a", buffering=1) as csv:
+    for task, concept in [(task, concept) for task in tasks for concept in concepts]:
+        task_df = data[(data.task == task) & (data.concept == concept)]
+        if task_df.empty:
+            millis = int(round(time.time() * 1000))
+
+            hits, suffixed_hits = query_pubmed_for_task(task, concept)
+
+            concept_query = f"({concept}[TIAB])"
+            concept_hits = fetcher.pmids_for_query(query=f'{concept_query}', retmax=1000000, pmc_only=False)
+
+            csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n'
+
+            print(csv_line)
+            csv.write(csv_line)
\ No newline at end of file
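
Note: the new build_queries helper is added here but not yet called by either loop in this patch. A minimal sketch of how it could drive the hit counting, reusing the same PubMedFetcher.pmids_for_query call the script already makes; the count_hits name and its retmax default are illustrative only, not part of the patch:

# Sketch only -- not part of the patch above. Assumes PubMedFetcher comes from
# metapub (as the existing script implies) and that build_queries() from this
# patch is defined in the same module.
from metapub import PubMedFetcher

fetcher = PubMedFetcher()

def count_hits(task, concept, retmax=1000000):
    """Hypothetical helper: return {query_name: hit_count} for one task-concept pair."""
    counts = {}
    for name, query in build_queries(task, concept).items():
        # Same call pattern the existing loops use to count matching PMIDs.
        pmids = fetcher.pmids_for_query(query=query, retmax=retmax, pmc_only=False)
        counts[name] = len(pmids)
    return counts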