diff --git a/efo/fetch_efo_ontology_from_pubmed.py b/efo/fetch_efo_ontology_from_pubmed.py
index 4b2e7c7..c1cba85 100644
--- a/efo/fetch_efo_ontology_from_pubmed.py
+++ b/efo/fetch_efo_ontology_from_pubmed.py
@@ -14,7 +14,10 @@
 from rdflib import Graph
 from rdflib.namespace import RDFS
+import pandas as pd
+
 
 owl_path = "file:///Users/morteza/workspace/ontologies/efo.owl"
+owl_prefix = "http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#"
 
 efo = get_ontology(owl_path).load()
 
@@ -28,7 +31,7 @@
 def query_labels(graph, parent_class):
     class_name = parent_class[1:] if parent_class.startswith(":") else parent_class
 
     query = f"""
-    prefix :
+    prefix : <{owl_prefix}>
 
     SELECT ?label WHERE {{
@@ -52,16 +55,55 @@
 time_estimate = len(tasks) * len(concepts)
 
-print(f"it taks ~ {time_estimate}s to query PubMed Central for these tasks and concepts.")
+print(f"it takes ~ {time_estimate}s to query PubMed Central for these tasks and concepts.")
 
 #%%
 # goal: create rows with the following data: ,,,
+# Partial lookup: only query PubMed if a task-concept combination has not already been fetched.
+
+csv_file = "/Users/morteza/workspace/notebooks/efo/data/efo_taskconcept_pubmed_hits.csv"
 
 from metapub import PubMedFetcher
 fetcher = PubMedFetcher()
 
 
-#tasks = ["Reversal Learning"]
-#concepts = ["Behavioral Control"]
+def query_pubmed_for_task(task, concept):
+    suffixes = ['', ' task', ' game', ' test']
+    task_queries = map(lambda s: task + s, suffixes)
+
+    suffixed_hits = []
+    hits = []
+    for tq in task_queries:
+        query = f"({tq}[TIAB]) AND ({concept}[TIAB])"
+        pmids = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, pmc_only=False)
+        suffixed_hits += pmids
+        if tq == task: hits = pmids  # the bare task query (no suffix) is tracked separately
+
+    return (hits, suffixed_hits)
+
+
+
+data = pd.read_csv(csv_file)
+
+with open(csv_file, "a", buffering=1) as csv:
+    for task, concept in [(task, concept) for task in tasks for concept in concepts]:
+        task_df = data[(data.task == task) & (data.concept == concept)]
+        if task_df.empty:
+            millis = int(round(time.time() * 1000))
+
+            hits, suffixed_hits = query_pubmed_for_task(task, concept)
+
+            concept_query = f"({concept}[TIAB])"
+            concept_hits = fetcher.pmids_for_query(query=f'{concept_query}', retmax=1000000, pmc_only=False)
+
+            csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n'
+
+            print(csv_line)
+            csv.write(csv_line)
+
+#%% v2: creates a CSV of hit counts, with one column per hit query
+
+#TODO reduce the number of queries (e.g. perform repeated task and concept queries only once)
+#TODO initialize the csv with headers if it is empty
 
 def build_queries(task, concept):
     return {
@@ -85,60 +127,32 @@
         'task_suffixtest_ef_hits': f'("{task} test"[TIAB]) AND ("executive function")'
     }
 
+def csv_header():
+    headers = ['task', 'concept', 'timestamp_ms']
+    headers += build_queries('', '').keys()
+    return ','.join(headers)
 
+def query_pubmed_and_create_csv(tasks, concepts, csv_file):
-def query_pubmed_for_task(task, concept):
-    suffixes = ['',' task',' game',' test']
-    task_queries = map(lambda s: task+s, suffixes)
+    try: data = pd.read_csv(csv_file)
+    except (FileNotFoundError, pd.errors.EmptyDataError): data = pd.DataFrame({'task': [], 'concept': []})
 
-    suffixed_hits = []
-    hits = []
-    for tq in task_queries:
-        query = f"({tq}[TIAB]) AND ({concept}[TIAB])"
-        pmids = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, pmc_only=False)
-        suffixed_hits += pmids
-        if tq == task: hits = pmids
+    with open(csv_file, "a+", buffering=1) as csv:
+        if csv.tell() == 0:  # 'a+' opens at EOF, so a position of 0 means an empty file; read() here would always return ''
+            print("Empty csv found, generating csv headers...")
+            csv.write(csv_header() + '\n')
 
-    return (hits, suffixed_hits)
+        for task, concept in [(task, concept) for task in tasks for concept in concepts]:
+            previously_stored = data[(data.task == task) & (data.concept == concept)]
+            if previously_stored.empty:
+                millis = int(round(time.time() * 1000))
+                csv_cells = [task, concept, str(millis)]
+                for qkey, query in build_queries(task, concept).items():  # dict order matches csv_header()
+                    qhits = fetcher.pmids_for_query(query=f'{query}', retmax=1000000, pmc_only=False)
+                    csv_cells += [f"{len(qhits)}"]
+
+                csv_line = ','.join(csv_cells) + '\n'
+
+                print(csv_line)
+                csv.write(csv_line)
 
-# main loop
-
-with open("data/efo_taskconcept_pubmed_hits.csv", "a",buffering=1) as csv:
-    csv.write('task,concept,hits,suffixed_hits,concept_hits,timestamp_ms\n')
-    for task, concept in [(task, concept) for task in tasks for concept in concepts]:
-        millis = int(round(time.time() * 1000))
-
-        hits, suffixed_hits = query_pubmed_for_task(task, concept)
-
-        concept_query = f"({concept}[TIAB])"
-        concept_hits = fetcher.pmids_for_query(query=f'{concept_query}', retmax=1000000, pmc_only=False)
-
-        csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n'
-
-        print(csv_line)
-        csv.write(csv_line)
-
-
-#%%
-# Partial lookup: only queries pubmed if a combination of task-concept is not already fetched.
-import pandas as pd
-
-csv_file = "data/efo_taskconcept_pubmed_hits.csv"
-
-fetcher = PubMedFetcher()
-data = pd.read_csv(csv_file)
-
-with open("data/efo_taskconcept_pubmed_hits.csv", "a",buffering=1) as csv:
-    for task, concept in [(task, concept) for task in tasks for concept in concepts]:
-        task_df = data[(data.task == task) & (data.concept == concept)]
-        if task_df.empty:
-            millis = int(round(time.time() * 1000))
-
-            hits, suffixed_hits = query_pubmed_for_task(task, concept)
-
-            concept_query = f"({concept}[TIAB])"
-            concept_hits = fetcher.pmids_for_query(query=f'{concept_query}', retmax=1000000, pmc_only=False)
-
-            csv_line = f'{task},{concept},{len(hits)},{len(suffixed_hits)},{len(concept_hits)},{millis}\n'
-
-            print(csv_line)
-            csv.write(csv_line)
\ No newline at end of file
+query_pubmed_and_create_csv(tasks, concepts, "/Users/morteza/workspace/notebooks/efo/data/efo_pubmed_hits.v2.csv")
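
A note on the first TODO above: every key in build_queries() costs one E-utilities request per task-concept pair, and NCBI permits only about 3 requests per second for clients without an API key. A minimal client-side throttle is sketched below, assuming the script's fetcher object is in scope; throttled_pmids_for_query and the 0.34 s interval are illustrative choices, not part of this patch:

    import time

    _last_request = 0.0  # timestamp of the previous E-utilities call

    def throttled_pmids_for_query(query, min_interval=0.34):
        """Space calls to fetcher.pmids_for_query at least min_interval seconds apart."""
        global _last_request
        wait = _last_request + min_interval - time.time()
        if wait > 0:
            time.sleep(wait)  # back off until the interval has elapsed
        _last_request = time.time()
        return fetcher.pmids_for_query(query=query, retmax=1000000, pmc_only=False)

Swapping this helper into the v2 loop would rate-limit the whole task-concept grid without touching the CSV logic.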
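Once the partial-lookup loop has populated efo_taskconcept_pubmed_hits.csv (whose header, written by the removed main loop, is task,concept,hits,suffixed_hits,concept_hits,timestamp_ms), the counts can be pivoted into a task-by-concept table. A sketch assuming that v1 schema; the "association" score, hits normalized by concept_hits, is an illustrative choice rather than something this patch computes:

    import numpy as np
    import pandas as pd

    df = pd.read_csv("data/efo_taskconcept_pubmed_hits.csv")

    # Fraction of each concept's PubMed hits that also mention the task;
    # concepts with zero hits become NaN rather than a division by zero.
    df["association"] = df["hits"] / df["concept_hits"].replace(0, np.nan)

    # Task x concept matrix of association scores for quick inspection.
    matrix = df.pivot_table(index="task", columns="concept", values="association")
    print(matrix.round(3))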