#%% [markdown]

# The following code retrieves all task names and cognitive concepts from the Cognitive Atlas and stores them as files.
# Use `pip install cognitiveatlas` to install the required package.

#%% get list of all tasks

from cognitiveatlas.api import search, get_task, get_concept
import pandas as pd
from datetime import date

import os
# NCBI E-utilities API key (used below to raise the request rate limit); use your own.
os.environ['NCBI_API_KEY'] = '751ff4edfab973bd0bc913ee84a0062bf009'

tasks = get_task().pandas
concepts = get_concept().pandas

os.makedirs('data/cognitive_atlas', exist_ok=True)
tasks.to_pickle('data/cognitive_atlas/tasks.pkl')
concepts.to_pickle('data/cognitive_atlas/concepts.pkl')

print(len(tasks.index))
print(len(concepts.index))

#%%
import requests
import xmltodict

base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

def search_and_store(term, output_file, db='pubmed', api_key=os.environ['NCBI_API_KEY']):
  """Search PubMed for a term and store all matching abstracts in a file."""
  search_query = f'({term}[TIAB])'
  url = f'{base}esearch.fcgi'
  params = {
    'term': search_query,  # requests URL-encodes spaces itself
    'usehistory': 'y',     # keep the resultset on the Entrez history server
    'db': db,
    'retmax': 0,           # no ids needed here; efetch reads from history
    'reldate': 10 * 365,   # only articles from the last 10 years
    'api_key': api_key
  }

  response = requests.get(url, params=params)
  search_response = xmltodict.parse(response.text)

  #DEBUG print(search_response)

  _num_of_results = search_response['eSearchResult']['Count']

  print(f"Successfully searched and stored results for '{term}' on the history server.\nNow retrieving {_num_of_results} abstracts...")


  # --- FETCH ABSTRACTS
  url = f'{base}efetch.fcgi'
  params = {
    'db': db,
    'api_key': api_key,
    'WebEnv': search_response['eSearchResult']['WebEnv'],       # history session
    'query_key': search_response['eSearchResult']['QueryKey'],  # stored search
    'rettype': 'abstract',
    'retmode': 'xml'
  }

  response = requests.post(url, data=params)

  with open(output_file, 'w') as f:
    f.write(response.text)

  print(f'Successfully stored results to {output_file}')

  return None


def fetch_pubs(cogat_obj):
  search_and_store(cogat_obj['name'], f"data/pubmed/{cogat_obj['id']}.xml")

os.makedirs('data/pubmed', exist_ok=True)
_ = tasks.apply(fetch_pubs, axis=1)
_ = concepts.apply(fetch_pubs, axis=1)
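
#%% [markdown]

# NCBI E-utilities throttles clients at about 10 requests per second with an
# API key (3 without). Each `fetch_pubs` call issues two requests (esearch +
# efetch), so the bulk loops above can trip the limit. A minimal throttle
# sketch (the delay value is a guess, not tuned):

#%%
import time

def fetch_pubs_throttled(cogat_obj, delay=0.25):
  """Like fetch_pubs, but pauses after each row to stay under the rate limit."""
  fetch_pubs(cogat_obj)
  time.sleep(delay)

# usage: _ = tasks.apply(fetch_pubs_throttled, axis=1)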
#%% Remove empty resultsets
import glob

def has_result(xml_file):
  """Return True if the stored resultset contains at least one article."""
  with open(xml_file) as f:
    content = xmltodict.parse(f.read())
  return 'PubmedArticleSet' in content


for file_path in glob.glob('data/pubmed/*.xml'):
  if not has_result(file_path):
    print(f'Found an empty resultset, removing {file_path}...')
    os.remove(file_path)

#%% list tasks and concepts without any PubMed results and filter them out
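# A minimal sketch, assuming the empty resultsets were deleted in the previous
# cell, so the surviving filenames in data/pubmed/ are exactly the ids that
# returned results.
found_ids = {os.path.splitext(os.path.basename(p))[0]
             for p in glob.glob('data/pubmed/*.xml')}

tasks = tasks[tasks['id'].isin(found_ids)]
concepts = concepts[concepts['id'].isin(found_ids)]
print(len(tasks.index), len(concepts.index))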

#%% count articles per task and concept
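# A sketch: parse each stored resultset and count its PubmedArticle records.
# `article_counts` maps a Cognitive Atlas id to its number of abstracts.
def count_articles(xml_file):
  with open(xml_file) as f:
    articles = xmltodict.parse(f.read())['PubmedArticleSet']['PubmedArticle']
  # xmltodict yields a single dict (not a list) when there is exactly one hit
  return len(articles) if isinstance(articles, list) else 1

article_counts = {
  os.path.splitext(os.path.basename(p))[0]: count_articles(p)
  for p in glob.glob('data/pubmed/*.xml')
}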

#%% for each task, count related concepts
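# A sketch, one API request per task; the `concepts` field and its
# `concept_id` key are assumptions about the Cognitive Atlas per-task JSON.
def related_concepts(task_id):
  task_json = get_task(id=task_id).json
  record = task_json[0] if isinstance(task_json, list) else task_json
  return [c['concept_id'] for c in record.get('concepts', [])]

task_concepts = {task_id: related_concepts(task_id) for task_id in tasks['id']}
task_concept_counts = {t: len(c) for t, c in task_concepts.items()}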

#%% for each concept, count related tasks
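# Rather than querying each concept, invert the task -> concept map built in
# the previous cell.
from collections import defaultdict

concept_tasks = defaultdict(set)
for task_id, concept_ids in task_concepts.items():
  for concept_id in concept_ids:
    concept_tasks[concept_id].add(task_id)

concept_task_counts = {c: len(t) for c, t in concept_tasks.items()}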

#%% count total articles
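# A sketch: total unique articles across all resultsets. PMIDs de-duplicate
# abstracts that matched more than one task or concept.
def pmids(xml_file):
  with open(xml_file) as f:
    articles = xmltodict.parse(f.read())['PubmedArticleSet']['PubmedArticle']
  if not isinstance(articles, list):
    articles = [articles]
  # PMID carries a Version attribute, so xmltodict parses it as a dict
  return {a['MedlineCitation']['PMID']['#text'] for a in articles}

pmid_sets = {os.path.splitext(os.path.basename(p))[0]: pmids(p)
             for p in glob.glob('data/pubmed/*.xml')}
all_pmids = set().union(*pmid_sets.values())
print(len(all_pmids))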

#%% count unrelated articles for each task and concept
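# One reading of "unrelated" (an assumption): articles in the overall pool
# whose abstracts do not mention the given task or concept.
unrelated_counts = {cogat_id: len(all_pmids - s)
                    for cogat_id, s in pmid_sets.items()}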

#%% frequency tables for all task-concept pairs
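# A sketch: co-mention frequency for every task-concept pair, counted as the
# overlap between the two PMID sets retrieved above.
freq = pd.DataFrame(
  [[len(pmid_sets.get(t, set()) & pmid_sets.get(c, set()))
    for c in concepts['id']]
   for t in tasks['id']],
  index=tasks['id'], columns=concepts['id'])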

#%% measure task-task similarity
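# Jaccard similarity over PMID sets is one simple choice of metric (an
# assumption; the notebook does not specify one).
def jaccard(a, b):
  union = a | b
  return len(a & b) / len(union) if union else 0.0

task_sim = pd.DataFrame(
  [[jaccard(pmid_sets.get(t1, set()), pmid_sets.get(t2, set()))
    for t2 in tasks['id']]
   for t1 in tasks['id']],
  index=tasks['id'], columns=tasks['id'])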

#%% measure concept-concept similarity
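# The same metric, applied to concept pairs.
concept_sim = pd.DataFrame(
  [[jaccard(pmid_sets.get(c1, set()), pmid_sets.get(c2, set()))
    for c2 in concepts['id']]
   for c1 in concepts['id']],
  index=concepts['id'], columns=concepts['id'])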