#%% [markdown]
# The following code retrieves all task names and cognitive concepts from the Cognitive Atlas and stores them as files.
# Use `pip install cognitiveatlas` to install the required package.
#%% get list of all tasks and concepts
from cognitiveatlas.api import get_task, get_concept
import pandas as pd
import os

# Set your NCBI API key here (or export NCBI_API_KEY before running);
# the value below is a placeholder, not a working key.
os.environ['NCBI_API_KEY'] = '<your-ncbi-api-key>'
tasks = get_task().pandas
concepts = get_concept().pandas

os.makedirs('data/cognitive_atlas', exist_ok=True)
tasks.to_pickle('data/cognitive_atlas/tasks.pkl')
concepts.to_pickle('data/cognitive_atlas/concepts.pkl')

print(f'{len(tasks.index)} tasks')
print(f'{len(concepts.index)} concepts')
#%% search PubMed and store abstracts for every task and concept
import requests
import xmltodict

base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

def search_and_store(term, output_file, db='pubmed', api_key=os.environ['NCBI_API_KEY']):
    """Search PubMed for a term and store all matching abstracts in a file."""
    # --- SEARCH (results are parked on the NCBI history server)
    search_query = f'({term}[TIAB])'
    url = f'{base}esearch.fcgi'
    params = {
        'term': search_query,  # requests URL-encodes spaces itself
        'usehistory': 'y',
        'db': db,
        'retmax': 0,
        'reldate': 10 * 365,  # restrict to the last ten years
        'api_key': api_key
    }
    response = requests.get(url, params=params)
    search_response = xmltodict.parse(response.text)
    _num_of_results = search_response['eSearchResult']['Count']
    print(f"Successfully searched for '{term}'; results stored on the history server.\n"
          f"Now retrieving {_num_of_results} abstracts...")

    # --- FETCH ABSTRACTS
    url = f'{base}efetch.fcgi'
    params = {
        'db': db,
        'api_key': api_key,
        'WebEnv': search_response['eSearchResult']['WebEnv'],
        'query_key': search_response['eSearchResult']['QueryKey'],
        'rettype': 'abstract',
        'retmode': 'xml'
    }
    response = requests.post(url, data=params)
    with open(output_file, 'w') as f:
        f.write(response.text)
    print(f'Successfully stored results to {output_file}')
    return None

def fetch_pubs(cogat_obj):
    search_and_store(cogat_obj['name'], f"data/pubmed/{cogat_obj['id']}.xml")

os.makedirs('data/pubmed', exist_ok=True)
_ = tasks.apply(fetch_pubs, axis=1)
_ = concepts.apply(fetch_pubs, axis=1)
# %% Remove empty resultsets
import glob

def has_result(xml_file):
    with open(xml_file) as f:
        content = xmltodict.parse(f.read())
    return 'PubmedArticleSet' in content

for file_path in glob.glob('data/pubmed/*.xml'):
    if not has_result(file_path):
        print(f'Found an empty resultset, removing {file_path}...')
        os.remove(file_path)
#%% filter out tasks and concepts without any PubMed results
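# A minimal sketch of this filtering step, assuming the cleanup cell above
# has already deleted the empty resultsets: keep only rows whose XML file is
# still on disk. The helper name `has_pubs` is illustrative.
def has_pubs(cogat_obj):
    return os.path.exists(f"data/pubmed/{cogat_obj['id']}.xml")

tasks_with_pubs = tasks[tasks.apply(has_pubs, axis=1)]
concepts_with_pubs = concepts[concepts.apply(has_pubs, axis=1)]
print(f'{len(tasks_with_pubs)} of {len(tasks)} tasks have PubMed results')
print(f'{len(concepts_with_pubs)} of {len(concepts)} concepts have PubMed results')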
#%% count articles per task and concept
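# One way to count articles per term, assuming each stored file is a
# PubmedArticleSet: parse it and count the PubmedArticle entries. The
# force_list flag keeps single-article resultsets as lists.
def count_articles(cogat_id):
    path = f'data/pubmed/{cogat_id}.xml'
    if not os.path.exists(path):
        return 0
    with open(path) as f:
        parsed = xmltodict.parse(f.read(), force_list=('PubmedArticle',))
    return len(parsed['PubmedArticleSet']['PubmedArticle'])

task_article_counts = pd.Series({i: count_articles(i) for i in tasks_with_pubs['id']})
concept_article_counts = pd.Series({i: count_articles(i) for i in concepts_with_pubs['id']})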
#%% for each task, count related concepts
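# A sketch of counting related concepts per task, assuming single-task
# lookups via get_task(id=...) return a payload carrying a 'concepts' list
# (field names may differ across Cognitive Atlas API versions, hence the
# defensive unwrapping below).
task_concepts = {}
for task_id in tasks_with_pubs['id']:
    payload = get_task(id=task_id).json
    task_json = payload[0] if isinstance(payload, list) else payload
    task_concepts[task_id] = [c.get('concept_id') for c in task_json.get('concepts', [])]
print(pd.Series({t: len(cs) for t, cs in task_concepts.items()}).describe())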
#%% for each concept, count related tasks
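# The reverse mapping can be obtained by inverting the task->concept edges
# collected in the previous cell, rather than querying the API again.
concept_tasks = {}
for task_id, concept_ids in task_concepts.items():
    for concept_id in concept_ids:
        concept_tasks.setdefault(concept_id, []).append(task_id)
print(pd.Series({c: len(ts) for c, ts in concept_tasks.items()}).describe())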
#%% count total articles
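# The corpus size over the same ten-year window, via the same ESearch
# endpoint; 'all[sb]' is PubMed's catch-all subset filter.
params = {
    'term': 'all[sb]',
    'db': 'pubmed',
    'retmax': 0,
    'reldate': 10 * 365,
    'api_key': os.environ['NCBI_API_KEY'],
}
response = requests.get(f'{base}esearch.fcgi', params=params)
total_articles = int(xmltodict.parse(response.text)['eSearchResult']['Count'])
print(f'total articles in the window: {total_articles}')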
#%% count unrelated articles for each task and concept
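# Unrelated articles per term then follow directly: the corpus size minus
# the term's own hit count (these feed the contingency cells of the
# frequency tables below).
task_unrelated = total_articles - task_article_counts
concept_unrelated = total_articles - concept_article_counts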
#%% frequency tables for all task-concept pairs
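# A sketch of the pair frequency table built from PMID co-occurrence:
# collect each term's PMID set from its stored XML, then count the overlap
# for every task-concept pair. PMIDs parse as {'@Version': ..., '#text': ...}
# dicts because the PMID element carries a Version attribute.
def pmid_set(cogat_id):
    path = f'data/pubmed/{cogat_id}.xml'
    if not os.path.exists(path):
        return set()
    with open(path) as f:
        parsed = xmltodict.parse(f.read(), force_list=('PubmedArticle',))
    return {article['MedlineCitation']['PMID']['#text']
            for article in parsed['PubmedArticleSet']['PubmedArticle']}

task_pmids = {t: pmid_set(t) for t in tasks_with_pubs['id']}
concept_pmids = {c: pmid_set(c) for c in concepts_with_pubs['id']}
pair_freq = pd.DataFrame(
    {c: {t: len(task_pmids[t] & concept_pmids[c]) for t in task_pmids}
     for c in concept_pmids})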
#%% measure task-task similarity
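# One plausible choice given the data at hand: Jaccard overlap of the PMID
# sets collected above (an assumption; the notebook does not fix a metric).
def jaccard(a, b):
    union = a | b
    return len(a & b) / len(union) if union else 0.0

task_ids = list(task_pmids)
task_sim = pd.DataFrame(
    [[jaccard(task_pmids[i], task_pmids[j]) for j in task_ids] for i in task_ids],
    index=task_ids, columns=task_ids)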
#%% measure concept-concept similarity
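# The same metric applied to the concept PMID sets.
concept_ids = list(concept_pmids)
concept_sim = pd.DataFrame(
    [[jaccard(concept_pmids[i], concept_pmids[j]) for j in concept_ids] for i in concept_ids],
    index=concept_ids, columns=concept_ids)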