Source code for indra.literature.elsevier_client

"""
For information on the Elsevier API, see:
  - API Specification: http://dev.elsevier.com/api_docs.html
  - Authentication: https://dev.elsevier.com/tecdoc_api_authentication.html
"""

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import logging
import textwrap
import xml.etree.ElementTree as ET
import requests
# Python3
try:
    from functools import lru_cache
# Python2
except ImportError:
    from functools32 import lru_cache
from indra.util import read_unicode_csv
from indra.util import UnicodeXMLTreeBuilder as UTB

# Module-wide logger; every message of this client is emitted under 'elsevier'
logger = logging.getLogger('elsevier')


# Base endpoint of the Elsevier content API. The API keys travel in the
# request headers, so this has to stay an HTTPS URL.
elsevier_api_url = 'https://api.elsevier.com/content'
# Endpoints derived from the base URL
elsevier_article_url = elsevier_api_url + '/article/doi'
elsevier_search_url = elsevier_api_url + '/search/scidir'
elsevier_entitlement_url = elsevier_api_url + '/article/entitlement/doi'

# Prefix -> namespace URI map passed to ElementTree find()/findall() when
# querying Elsevier XML documents
elsevier_ns = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'article': 'http://www.elsevier.com/xml/svapi/article/dtd',
    'ja': 'http://www.elsevier.com/xml/ja/dtd',
    'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
    'common': 'http://www.elsevier.com/xml/common/dtd',
    'atom': 'http://www.w3.org/2005/Atom',
    'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
}

# The key file is deliberately kept out of version control; it is looked up
# in the directory containing this module.
# For more information see http://dev.elsevier.com/
api_key_file = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'elsevier_api_keys',
)
# Names of the environment variables holding the API and institution keys
api_key_env_name = 'ELSEVIER_API_KEY'
inst_key_env_name = 'ELSEVIER_INST_KEY'

# Load the Elsevier credentials into `elsevier_keys`: a dict mapping HTTP
# header names to key values, or None when no usable API key is available.
# The key file is consulted first; the environment variables are only used
# as a fallback when the file cannot be read.
try:
    elsevier_keys = dict(read_unicode_csv(api_key_file))
    # The institution token is optional, so its absence is only reported
    if not elsevier_keys.get('X-ELS-Insttoken'):
        logger.info('Optional institution key X-ELS-Insttoken not found in '
                    'elsevier key file.')
    # The API key is mandatory: without it the keys are discarded entirely
    if not elsevier_keys.get('X-ELS-APIKey'):
        logger.error('API key X-ELS-APIKey not found in elsevier key file.')
        elsevier_keys = None
except IOError:
    logger.warning('Elsevier API keys file could not be read, trying '
                   'environment variables $%s and $%s.' %
                   (api_key_env_name, inst_key_env_name))
    logger.debug('Tried key file: %s' % api_key_file)
    elsevier_keys = {}
    # Institution key: optional, simply left out of the dict when unset
    if os.environ.get(inst_key_env_name) is None:
        logger.info('No Elsevier institution key found in environment '
                    'variable %s.' % inst_key_env_name)
    else:
        elsevier_keys['X-ELS-Insttoken'] = os.environ[inst_key_env_name]
    # API key: required, so the whole keys dict becomes None when unset
    if os.environ.get(api_key_env_name) is None:
        logger.warning('No Elsevier API key found in environment variable '
                       '%s.' % api_key_env_name)
        elsevier_keys = None
    else:
        elsevier_keys['X-ELS-APIKey'] = os.environ[api_key_env_name]


def check_entitlement(doi):
    """Query the Elsevier entitlement endpoint for an article.

    Parameters
    ----------
    doi : str
        DOI of the article. A leading 'doi:' prefix (in any letter case) is
        stripped before the request is made.

    Returns
    -------
    bool
        True if the entitlement request returned HTTP status 200. False if
        no API key is available or the request returned any other status
        (both cases are logged as errors).

    Notes
    -----
    The body of the response is not inspected; the result only reflects
    whether the entitlement request itself succeeded.
    """
    if elsevier_keys is None:
        logger.error('Missing API key, could not check article entitlement.')
        return False
    if doi.lower().startswith('doi:'):
        doi = doi[4:]
    url = '%s/%s' % (elsevier_entitlement_url, doi)
    params = {'httpAccept': 'text/xml'}
    res = requests.get(url, params, headers=elsevier_keys)
    if res.status_code != 200:
        logger.error('Could not check entitlements for article %s: '
                     'status code %d' % (doi, res.status_code))
        logger.error('Response content: %s' % res.text)
        return False
    # Report success explicitly: falling off the end would return None,
    # which callers cannot tell apart from the False returned on failure.
    return True


def download_article(doi):
    """Download an article in XML format from Elsevier.

    Parameters
    ----------
    doi : str
        DOI of the article. A leading 'doi:' prefix (in any letter case) is
        removed before the request is made.

    Returns
    -------
    str or None
        The response body decoded as UTF-8, or None if no API key is
        available or the request did not return HTTP status 200.
    """
    if elsevier_keys is None:
        logger.error('Missing API key, could not download article.')
        return None
    bare_doi = doi[4:] if doi.lower().startswith('doi:') else doi
    response = requests.get('%s/%s' % (elsevier_article_url, bare_doi),
                            {'httpAccept': 'text/xml'},
                            headers=elsevier_keys)
    if response.status_code == 200:
        # The XML content is handed back as a unicode string, assuming the
        # payload is UTF-8 encoded
        return response.content.decode('utf-8')
    logger.error('Could not download article %s: status code %d' %
                 (bare_doi, response.status_code))
    logger.error('Elsevier response: %s' % response.text)
    return None
def get_abstract(doi):
    """Get the abstract of an article from Elsevier.

    Parameters
    ----------
    doi : str
        DOI of the article, passed on to download_article.

    Returns
    -------
    str or None
        Text of the dc:description element found under article:coredata,
        or None if the article could not be downloaded or either element
        is missing from the returned XML.
    """
    xml_string = download_article(doi)
    if xml_string is None:
        return None
    assert isinstance(xml_string, str)
    # Build XML ElementTree
    xml_tree = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    if xml_tree is None:
        return None
    coredata = xml_tree.find('article:coredata', elsevier_ns)
    if coredata is None:
        # Without this guard the lookup below fails with AttributeError
        logger.info('Could not find element article:coredata')
        return None
    abstract = coredata.find('dc:description', elsevier_ns)
    if abstract is None:
        logger.info('Could not find abstract element dc:description')
        return None
    return abstract.text
def get_article(doi, output='txt'):
    """Get the full body of an article from Elsevier.

    The article is downloaded with download_article and its text is then
    pulled out with extract_text.

    Two output modes are described for this function: 'txt' strips all xml
    tags and joins the pieces of text in the main text, while 'xml' simply
    takes the tag containing the body of the article and returns it as is.
    In the latter case, downstream code needs to be able to interpret
    Elsevier's XML format.

    NOTE: the `output` argument is accepted but never read by this
    implementation; the result of extract_text is returned whichever mode
    is requested.
    """
    return extract_text(download_article(doi))
def extract_text(xml_string):
    """Extract the article text from an Elsevier article XML string.

    Parameters
    ----------
    xml_string : str or None
        Unicode XML content of an article, or None.

    Returns
    -------
    str or None
        The text assembled from the article body sections if available,
        otherwise the wrapped raw text, otherwise None. None is also
        returned when `xml_string` is None or the article:originalText
        element is missing.
    """
    if xml_string is None:
        return None
    assert isinstance(xml_string, str)
    # Build XML ElementTree
    xml_tree = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    # Look for full text element
    full_text = xml_tree.find('article:originalText', elsevier_ns)
    if full_text is None:
        logger.info('Could not find full text element article:originalText')
        return None
    # Prefer the structured article body and fall back to the raw text
    article_body = _get_article_body(full_text)
    if article_body:
        return article_body
    raw_text = _get_raw_text(full_text)
    if raw_text:
        return raw_text
    # NOTE: _get_pdf_attachment is not consulted as a further fallback here
    return None


def _get_pdf_attachment(full_text_elem):
    """Return 'pdf' if a web PDF attachment with purpose MAIN is listed.

    Returns None when no attachment with a xocs:web-pdf element whose
    xocs:web-pdf-purpose is 'MAIN' is found.
    """
    attachments = full_text_elem.findall('xocs:doc/xocs:meta/'
                                         'xocs:attachment-metadata-doc/'
                                         'xocs:attachments', elsevier_ns)
    for att_elem in attachments:
        web_pdf = att_elem.find('xocs:web-pdf', elsevier_ns)
        if web_pdf is None:
            continue
        # Check for a MAIN pdf; a web-pdf without a purpose element is
        # skipped instead of raising AttributeError
        pdf_purpose = web_pdf.find('xocs:web-pdf-purpose', elsevier_ns)
        if pdf_purpose is not None and pdf_purpose.text == 'MAIN':
            return 'pdf'
    return None


def _get_article_body(full_text_elem):
    """Return the section text of the article body, or None.

    The body is looked up under ja:article first and then under
    ja:converted-article; the first body element found is passed to
    _get_sections and its result is returned.
    """
    for article_tag in ('ja:article', 'ja:converted-article'):
        body_path = 'xocs:doc/xocs:serial-item/%s/ja:body' % article_tag
        main_body = full_text_elem.find(body_path, elsevier_ns)
        if main_body is not None:
            logger.info("Found main body element %s" % body_path)
            return _get_sections(main_body)
        logger.info("Could not find main body element %s" % body_path)
    # Neither location contained a body element
    return None


def _get_sections(main_body_elem):
    """Join the paragraph text of all sections in the main body.

    Returns None if the body has no common:sections/common:section
    elements; otherwise a string with one line per paragraph.
    """
    # Get content sections
    sections = main_body_elem.findall('common:sections/common:section',
                                      elsevier_ns)
    if not sections:
        logger.info("Found no sections in main body")
        return None
    # Collect the pieces and join once at the end
    pieces = []
    for section in sections:
        # Paragraphs that are directly under the section
        pars = section.findall('common:para', elsevier_ns)
        # Paragraphs that are under a section within the section
        pars += section.findall('common:section/common:para', elsevier_ns)
        for par in pars:
            # Get the initial string inside the paragraph
            if par.text is not None:
                pieces.append(par.text)
            # When there are tags inside the paragraph (for instance
            # references), the text following each child element is stored
            # in that child's tail. Iterate the element directly:
            # Element.getchildren() was removed in Python 3.9.
            pieces.extend(child.tail for child in par
                          if child.tail is not None)
            pieces.append('\n')
    return ''.join(pieces)


def _get_raw_text(full_text_elem):
    """Return the wrapped content of the xocs:rawtext element, or None.

    None is returned when the element is missing or has no text.
    """
    # Look for raw_text
    raw_text = full_text_elem.find('xocs:doc/xocs:rawtext', elsevier_ns)
    if raw_text is None:
        logger.info("Could not find rawtext element xocs:doc/xocs:rawtext")
        return None
    logger.info("Found rawtext element xocs:doc/xocs:rawtext")
    if raw_text.text is None:
        # An empty element has no text to wrap
        return None
    return textwrap.fill(raw_text.text)


@lru_cache(maxsize=100)
def get_dois(query_str, count=100):
    """Search ScienceDirect through the API for articles.

    See http://api.elsevier.com/content/search/fields/scidir for constructing
    a query string to pass here.
    Example: 'abstract(BRAF) AND all("colorectal cancer")'

    Parameters
    ----------
    query_str : str
        The ScienceDirect query string.
    count : int
        Value sent as the 'count' request parameter. Default: 100

    Returns
    -------
    list of str or None
        Text of the prism:doi element of each atom:entry in the response,
        or None if no API key is available or the request did not return
        HTTP status 200.

    Notes
    -----
    Results are memoized with lru_cache, so a None returned for a failed
    search is cached for that (query_str, count) pair as well.
    """
    url = '%s/%s' % (elsevier_search_url, query_str)
    if elsevier_keys is None:
        logger.error('Missing API key at %s, could not perform search.' %
                     api_key_file)
        return None
    params = {'query': query_str,
              'count': count,
              'httpAccept': 'application/xml',
              'sort': '-coverdate',
              'field': 'doi'}
    # Send the API keys as headers, like the other requests in this module;
    # the keys were required above but previously never transmitted.
    res = requests.get(url, params, headers=elsevier_keys)
    if res.status_code != 200:
        logger.error('Could not perform search: status code %d' %
                     res.status_code)
        return None
    tree = ET.XML(res.content, parser=UTB())
    doi_tags = tree.findall('atom:entry/prism:doi', elsevier_ns)
    return [dt.text for dt in doi_tags]