#%% [markdown]
# the following code retrieves all task names and cognitive concepts from
# cognitive atlas and stores them as a file.
# use `pip install cognitiveatlas` to install required packages.

#%% get list of all tasks
from cognitiveatlas.api import search, get_task, get_concept
import pandas as pd
from datetime import date
import os

# SECURITY NOTE(review): a real NCBI API key is committed in source control.
# Rotate this key and load it from the environment / a secrets store instead.
os.environ['NCBI_API_KEY'] = '751ff4edfab973bd0bc913ee84a0062bf009'

tasks = get_task().pandas
concepts = get_concept().pandas

# Fix: create the output directory first so to_pickle does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('data/cognitive_atlas', exist_ok=True)
tasks.to_pickle('data/cognitive_atlas/tasks.pkl')
concepts.to_pickle('data/cognitive_atlas/concepts.pkl')
print(len(tasks.index))
print(len(concepts.index))

#%%
import requests
import xmltodict

base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'


def search_and_store(term, output_file, db='pubmed', api_key=None):
    """Search an Entrez database for *term* and store all abstracts in a file.

    Parameters
    ----------
    term : str
        Free-text query; searched in title/abstract ([TIAB]).
    output_file : str
        Path of the XML file the fetched abstracts are written to.
    db : str
        Entrez database to query (default 'pubmed').
    api_key : str or None
        NCBI API key; when None, falls back to the NCBI_API_KEY environment
        variable at call time. (Fix: the lookup previously happened at import
        time as a default argument, crashing the module if the variable was
        unset before this file was imported.)
    """
    if api_key is None:
        api_key = os.environ['NCBI_API_KEY']

    search_query = f'({term}[TIAB])'

    # --- ESEARCH: run the query and keep the hit list on the history server
    url = f'{base}esearch.fcgi'
    params = {
        'term': search_query.replace(' ', '+'),
        'usehistory': 'y',      # keep results server-side for the efetch below
        'db': db,
        'retmax': 0,            # we only need the count plus the history keys
        'reldate': 10 * 365,    # restrict to roughly the last 10 years
        'api_key': api_key
    }
    response = requests.get(url, params=params)
    search_response = xmltodict.parse(response.text)
    #DEBUG print(search_response)

    _num_of_results = search_response['eSearchResult']['Count']
    print(f"Succesfully searched and stored results for '{term}' on history server.\nNow retriving {_num_of_results} abstracts...")

    # --- EFETCH: download the abstracts referenced by WebEnv/query_key
    url = f'{base}efetch.fcgi'
    params = {
        'db': db,
        'api_key': api_key,
        'WebEnv': search_response['eSearchResult']['WebEnv'],
        'query_key': search_response['eSearchResult']['QueryKey'],
        'rettype': 'abstract',
        'retmode': 'xml'
    }
    response = requests.post(url, params)
    with open(f'{output_file}', 'w') as f:
        f.write(response.text)
    print(f'Succesfully stored results to {output_file}')
    return None


def fetch_pubs(cogat_obj):
    """Fetch PubMed abstracts for one Cognitive Atlas task/concept row."""
    search_and_store(cogat_obj['name'], f"data/pubmed/{cogat_obj['id']}.xml")


# Fix: ensure the target directory exists before writing per-item XML files.
os.makedirs('data/pubmed', exist_ok=True)
_ = tasks.apply(fetch_pubs, axis=1)
_ = concepts.apply(fetch_pubs, axis=1)

# %% Remove empty resultsets
import glob
import os


def has_result(xml_file):
    """Return True if *xml_file* parses to a document containing a
    'PubmedArticleSet' root key, i.e. a non-empty PubMed result set."""
    with open(xml_file) as fh:
        parsed = xmltodict.parse(fh.read())
    found = 'PubmedArticleSet' in parsed
    print(xml_file, found)
    return found


for file_path in glob.glob('data/pubmed/*.xml'):
    if has_result(file_path):
        continue
    print(f'Found an empty resultset, removing {file_path}...')
    os.remove(file_path)

#%% list of tasks and concepts without any result from pubmed and filter them out

#%% count articles per task and concept

#%% for each task, count related concepts

#%% for each concept, count related tasks

#%% count total articles

#%% count unrelated articles for each task and concept

#%% frequency tables for all task-concept pairs

#%% measure task-task similarity

#%% measure concept-concept similarity