# notebooks/efo/20191223_pubmed.py
#%% temp: set the NCBI API key as an environment variable (remove this cell before sharing)
import os
os.environ['NCBI_API_KEY'] = '<PUT_YOUR_NCBI_API_KEY_HERE>'
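
#%% optional: prompt for the key instead of hardcoding it
# A minimal alternative sketch, not part of the original pipeline: stdlib
# getpass reads the key interactively so it never has to live in the source file.
import os
from getpass import getpass

if 'NCBI_API_KEY' not in os.environ:
  os.environ['NCBI_API_KEY'] = getpass('NCBI API key: ')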

#%% direct eutils xml requests with history support
# Note: requires requests and xmltodict (pip install requests xmltodict)
import os
from datetime import date
import requests
import xmltodict

base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
ts = date.today().strftime('%Y%m%d')


terms = ['Digit Span', 'Working Memory']
db = 'pubmed' # or 'pmc'


def search_and_store(term, output_file, db='pubmed', api_key=None):
  """Search for a term and store all matching abstracts in a file."""
  # resolve the key at call time; an os.environ[...] default argument would be
  # frozen at function-definition time
  api_key = api_key or os.environ['NCBI_API_KEY']

  search_query = f'({term}[TIAB])'
  url = f'{base}esearch.fcgi'
  params = {
    'term': search_query,   # requests URL-encodes the query; pre-inserting '+' would get double-encoded
    'usehistory': 'y',      # keep the result set on the NCBI history server
    'db': db,
    'retmax': 0,            # no ids needed here, only the count and history keys
    'reldate': 10 * 365,    # restrict to roughly the last ten years
    'api_key': api_key
  }

  response = requests.get(url, params=params)
  search_response = xmltodict.parse(response.text)

  #DEBUG print(search_response)

  _num_of_results = search_response['eSearchResult']['Count']

  print(f'Successfully searched and stored results on the history server.\nNow retrieving {_num_of_results} abstracts...')


  # --- FETCH ABSTRACTS
  url = f'{base}efetch.fcgi'
  params = {
    'db': db,
    'api_key': api_key,
    'WebEnv': search_response['eSearchResult']['WebEnv'],       # history session
    'query_key': search_response['eSearchResult']['QueryKey'],  # query within that session
    'rettype': 'abstract',
    'retmode': 'xml'
  }

  response = requests.post(url, data=params)

  with open(output_file, 'w') as f:
    f.write(response.text)

  print(f'Successfully stored results to {output_file}')

  return None



for term in terms:
  print(f'searching NCBI for: {term}...')
  search_and_store(term, db=db, output_file=f'data/{db}/{ts}_{db}_{term}.xml')
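
#%% [markdown]
# The cell below is a quick sanity check, not part of the original pipeline:
# it re-parses one of the stored XML files with xmltodict and counts how many
# records carry an abstract. The element path (PubmedArticleSet → PubmedArticle
# → MedlineCitation → Article → Abstract) follows the standard PubMed efetch
# XML; note that xmltodict yields a single dict rather than a list when a file
# contains only one article.

#%% sanity check one stored file
check_term = terms[0]
with open(f'data/{db}/{ts}_{db}_{check_term}.xml') as f:
  records = xmltodict.parse(f.read())

articles = records['PubmedArticleSet']['PubmedArticle']
if not isinstance(articles, list):  # single-record files parse to a dict
  articles = [articles]

n_with_abstract = sum(1 for a in articles if 'Abstract' in a['MedlineCitation']['Article'])
print(f'{n_with_abstract} of {len(articles)} records have an abstract')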


#%% [markdown]
# ## DEPRECATED: POST ids to the history server
# The following code posts a list of PMIDs to the history server and retrieves the parameters needed to fetch the corresponding abstracts. Although it helps avoid NCBI's limits on long id lists in a single request, the same functionality can be achieved with esearch/usehistory + efetch, as above.

#%% POST

url = f"{base}epost.fcgi"

params = {
  'db': db,
  'id': ','.join(map(str, [11237011, 12466850])),  # example PMIDs to post
  'api_key': os.environ['NCBI_API_KEY']
}

response = requests.post(url, data=params)

# ePostResult carries the WebEnv and QueryKey needed by subsequent efetch calls
history_params = xmltodict.parse(response.text)
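
#%% [markdown]
# A minimal sketch of consuming the posted ids, assuming the standard
# ePostResult layout (WebEnv and QueryKey at the top level); it mirrors the
# efetch call used in search_and_store above.

#%% fetch abstracts for the posted ids
url = f'{base}efetch.fcgi'
params = {
  'db': db,
  'api_key': os.environ['NCBI_API_KEY'],
  'WebEnv': history_params['ePostResult']['WebEnv'],
  'query_key': history_params['ePostResult']['QueryKey'],
  'rettype': 'abstract',
  'retmode': 'xml'
}
response = requests.post(url, data=params)
print(response.text[:500])  # peek at the returned XML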

#%% [markdown]
# ## DEPRECATED: metapub
# The following snippet shows how to use the metapub package to retrieve a list of records for a given search query. It is limited by the maximum number of returned records and by the request rate (10 queries/s with an API key). The code also requires the `metapub` package, installed with `pip install metapub`.

# Note: metapub requires an environment variable named `NCBI_API_KEY`.

#%% metapub

import os
from metapub import PubMedFetcher

terms = ['N-Back', 'Working Memory']

fetcher = PubMedFetcher()

for term in terms:
  abstracts = []

  ids = fetcher.pmids_for_query(query=f'({term}[TIAB])', retmax=1000000, since='2010', pmc_only=True)
  print(f'fetching articles for {term}')

  # only the first 10 ids are fetched to keep this demo quick
  for index, pmid in enumerate(ids[:10]):
    print(f'{index + 1} of {min(len(ids), 10)}...')
    article = fetcher.article_by_pmid(pmid)
    if article.abstract is not None:
      abstracts.append(article.pmid + '\n' + article.title + '\n' + article.abstract)

  with open(f'data/{db}/{term}.txt', 'w') as f:
    f.write('\n\n'.join(abstracts))
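
#%% [markdown]
# A minimal throttling sketch for the loop above, assuming the 10 requests/second
# limit mentioned earlier; sleeping ~0.11 s between calls keeps a single-threaded
# loop under the limit. The helper `fetch_article_throttled` is hypothetical,
# not part of metapub.

#%% throttled fetching
import time

def fetch_article_throttled(fetcher, pmid, min_interval=0.11):
  # hypothetical helper: fetch one article, then pause so a sequential loop
  # stays below 10 requests/s
  article = fetcher.article_by_pmid(pmid)
  time.sleep(min_interval)
  return article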