diff --git a/.gitignore b/.gitignore index 24181f6..35d62ca 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ ._* tmp/ /.ipynb_checkpoints/ -*.pyc \ No newline at end of file +*.pyc +data/pubmed/*.xml diff --git a/data/cognitive_atlas/concepts.pkl b/data/cognitive_atlas/concepts.pkl new file mode 100644 index 0000000..4a8ccda --- /dev/null +++ b/data/cognitive_atlas/concepts.pkl Binary files differ diff --git a/data/cognitive_atlas/tasks.pkl b/data/cognitive_atlas/tasks.pkl new file mode 100644 index 0000000..09092bc --- /dev/null +++ b/data/cognitive_atlas/tasks.pkl Binary files differ diff --git a/py/20191225_cognitiveatlas.py b/py/20191225_cognitiveatlas.py new file mode 100644 index 0000000..3ebc575 --- /dev/null +++ b/py/20191225_cognitiveatlas.py @@ -0,0 
#%% [markdown]

# The following code retrieves all task names and cognitive concepts from
# Cognitive Atlas and stores them as pickle files, then downloads the matching
# PubMed abstracts for each task/concept.
# Use `pip install cognitiveatlas` to install the required packages.

#%% get list of all tasks

from cognitiveatlas.api import search, get_task, get_concept
import pandas as pd
from datetime import date

import os

# SECURITY NOTE(review): a real API key should not be committed to source
# control — prefer exporting NCBI_API_KEY in the shell. setdefault keeps an
# externally provided key instead of unconditionally clobbering it.
os.environ.setdefault('NCBI_API_KEY', '751ff4edfab973bd0bc913ee84a0062bf009')

tasks = get_task().pandas
concepts = get_concept().pandas

tasks.to_pickle('data/cognitive_atlas/tasks.pkl')
concepts.to_pickle('data/cognitive_atlas/concepts.pkl')

print(len(tasks.index))
print(len(concepts.index))

#%%
import requests
import xmltodict

base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'


def search_and_store(term, output_file, db='pubmed', api_key=None):
    """Search PubMed for `term` and store all matching abstracts in a file.

    Uses the NCBI E-utilities history server: one esearch call records the
    result set server-side (usehistory=y), then a single efetch call
    downloads every abstract as XML via the returned WebEnv/QueryKey pair.

    Parameters
    ----------
    term : str
        Query term, matched against title/abstract ([TIAB]).
    output_file : str
        Path of the XML file to write.
    db : str
        Entrez database name (default 'pubmed').
    api_key : str or None
        NCBI API key. When None, falls back to the NCBI_API_KEY environment
        variable — looked up lazily at call time (not at import time) so the
        variable may be set after this module is imported.

    Raises
    ------
    requests.HTTPError
        If either E-utilities request returns an HTTP error status.
    """
    if api_key is None:
        api_key = os.environ['NCBI_API_KEY']

    # --- SEARCH (result set is kept on the NCBI history server)
    search_query = f'({term}[TIAB])'
    url = f'{base}esearch.fcgi'
    params = {
        'term': search_query.replace(' ', '+'),
        'usehistory': 'y',
        'db': db,
        'retmax': 0,           # only the history keys are needed, not the ids
        'reldate': 10 * 365,   # restrict to roughly the last 10 years
        'api_key': api_key,
    }

    response = requests.get(url, params=params)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    search_response = xmltodict.parse(response.text)

    _num_of_results = search_response['eSearchResult']['Count']

    print(f"Successfully searched and stored results for '{term}' on history server.\n"
          f"Now retrieving {_num_of_results} abstracts...")

    # --- FETCH ABSTRACTS (POST so long WebEnv/query_key values fit safely)
    url = f'{base}efetch.fcgi'
    params = {
        'db': db,
        'api_key': api_key,
        'WebEnv': search_response['eSearchResult']['WebEnv'],
        'query_key': search_response['eSearchResult']['QueryKey'],
        'rettype': 'abstract',
        'retmode': 'xml',
    }

    response = requests.post(url, data=params)
    response.raise_for_status()

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(response.text)

    print(f'Successfully stored results to {output_file}')


def fetch_pubs(cogat_obj):
    """Fetch PubMed abstracts for one Cognitive Atlas task/concept row.

    `cogat_obj` is a pandas row with at least 'name' and 'id' fields; the
    result is written to data/pubmed/<id>.xml.
    """
    search_and_store(cogat_obj['name'], f"data/pubmed/{cogat_obj['id']}.xml")


_ = tasks.apply(fetch_pubs, axis=1)
_ = concepts.apply(fetch_pubs, axis=1)

# %% Remove empty resultsets
import glob
import os


def has_result(xml_file):
    """Return True if `xml_file` contains a PubMed article set.

    efetch writes a valid-but-empty XML document when a term matched nothing;
    those documents lack the top-level 'PubmedArticleSet' element.
    """
    with open(xml_file, encoding='utf-8') as f:
        content = xmltodict.parse(f.read())
    print(xml_file, 'PubmedArticleSet' in content)
    return 'PubmedArticleSet' in content


for file_path in glob.glob('data/pubmed/*.xml'):
    if not has_result(file_path):
        print(f'Found an empty resultset, removing {file_path}...')
        os.remove(file_path)

#%% list of tasks and concepts without any result from pubmed and filter them out

#%% count articles per task and concept

#%% for each task, count related concepts

#%% for each concept, count related tasks

#%% count total articles

#%% count unrelated articles for each task and concept

#%% frequency tables for all task-concept pairs

#%% measure task-task similarity

#%% measure concept-concept similarity