Source code for indra.sources.reach.reach_api

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str, bytes
import os
import json
import logging
import tempfile
import requests
from indra.literature import id_lookup
import indra.literature.pmc_client as pmc_client
import indra.literature.pubmed_client as pubmed_client
from .processor import ReachProcessor
# Python 2 provides ``basestring``; Python 3 does not, so alias it to ``str``
# there. Only NameError is expected from the bare name lookup, so catch just
# that instead of using a bare ``except`` (which would also swallow
# KeyboardInterrupt and SystemExit).
try:
    basestring
except NameError:
    basestring = str

# Module-level logger used by the functions below to report errors.
logger = logging.getLogger('reach')

# Offline reading setup, performed once at import time. ``try_offline`` records
# whether the setup completed; process_text and process_nxml_str check it
# before touching ``reach_reader`` or ``JavaException``.
try:
    # For offline reading
    from indra.java_vm import autoclass, JavaException
    from .reach_reader import ReachReader
    reach_reader = ReachReader()
    try_offline = True
except Exception:
    # NOTE(review): this branch runs for any failure above (either import or
    # the ReachReader() call), not only a missing jnius, even though the
    # message mentions jnius alone. When it runs, ``reach_reader`` and
    # ``JavaException`` may be left undefined.
    logger.error('Could not import jnius, offline reading cannot be used.')
    try_offline = False

# REACH web service endpoints. process_text POSTs to the text URL and
# process_nxml_str POSTs to the NXML URL when called with offline=False.
reach_text_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/text'
reach_nxml_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/nxml'


def process_pmc(pmc_id, offline=False):
    """Return a ReachProcessor for the paper with the given PMC id.

    The full text is fetched through the PMC client, written to a file
    named ``<pmc_id>.nxml`` and then handed to process_nxml_file. If the
    PMC client returns no content, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        Passed on to process_nxml_file to select between offline reading
        and the web service. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if no full text could be obtained.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None
    # Persist the article as <pmc_id>.nxml so it can be read back by
    # process_nxml_file.
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))
    # The matching PMID (if the lookup yields one) becomes the citation.
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=pmid, offline=offline)


def process_pubmed_abstract(pubmed_id, offline=False):
    """Return a ReachProcessor for the abstract of the given Pubmed id.

    The abstract is fetched through the Pubmed client and read with
    process_text. If no abstract is obtained, None is returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. Every evidence of every statement is tagged with
        section_type 'abstract'.
    """
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None
    processor = process_text(abstract, citation=pubmed_id, offline=offline)
    # Nothing to annotate if reading failed or produced no statements.
    if not processor or not processor.statements:
        return processor
    for stmt in processor.statements:
        for evidence in stmt.evidence:
            evidence.epistemics['section_type'] = 'abstract'
    return processor


def process_text(text, citation=None, offline=False):
    """Return a ReachProcessor by processing the given text.

    The raw REACH output is also written to ``reach_output.json`` before it
    is parsed.

    Parameters
    ----------
    text : str
        The text to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. This is used when the text to be processed comes from
        a publication that is not otherwise identified. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if offline reading is unavailable
        or fails, if the web service cannot be reached or answers with a
        non-200 status code, or if the output cannot be decoded as JSON.
    """
    if offline:
        if not try_offline:
            logger.error('Offline reading is not available.')
            return None
        api_ruler = reach_reader.get_api_ruler()
        if api_ruler is None:
            logger.error('Cannot read offline because the REACH ApiRuler '
                         'could not be instantiated.')
            return None
        try:
            result_map = api_ruler.annotateText(text, 'fries')
        except JavaException as e:
            logger.error('Could not process text.')
            logger.error(e)
            return None
        json_str = result_map.get('resultJson')
        if json_str is None:
            # Without this guard the encode() below would raise
            # AttributeError instead of reporting the failure.
            logger.error('REACH did not return any JSON output.')
            return None
        if not isinstance(json_str, bytes):
            json_str = json_str.encode('utf-8')
    else:
        data = {'text': text.encode('utf-8')}
        try:
            res = requests.post(reach_text_url, data)
        except requests.exceptions.RequestException as e:
            logger.error('Could not connect to REACH service:')
            logger.error(e)
            return None
        # Same check as process_nxml_str: do not try to parse (or save) an
        # error response as if it were REACH output.
        if res.status_code != 200:
            logger.error('Could not process text via REACH service. '
                         'Status code: %d' % res.status_code)
            return None
        # TODO: we could use res.json() here to get a dict
        # directly
        # This is a byte string
        json_str = res.content
    assert isinstance(json_str, bytes)
    with open('reach_output.json', 'wb') as fh:
        fh.write(json_str)
    return process_json_str(json_str.decode('utf-8'), citation)


def process_nxml_str(nxml_str, citation=None, offline=False):
    """Return a ReachProcessor by processing the given NXML string.

    NXML is the format used by PubmedCentral for papers in the open
    access subset.

    When the web service is used, the raw REACH output is also written to
    ``reach_output.json`` before it is parsed.

    Parameters
    ----------
    nxml_str : str
        The NXML string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if offline reading is unavailable
        or fails, if the web service cannot be reached or answers with a
        non-200 status code, or if the output cannot be decoded as JSON.
    """
    if offline:
        if not try_offline:
            logger.error('Offline reading is not available.')
            return None
        api_ruler = reach_reader.get_api_ruler()
        if api_ruler is None:
            logger.error('Cannot read offline because the REACH ApiRuler '
                         'could not be instantiated.')
            return None
        try:
            result_map = api_ruler.annotateNxml(nxml_str, 'fries')
        except JavaException as e:
            logger.error('Could not process NXML.')
            logger.error(e)
            return None
        json_str = result_map.get('resultJson')
        if json_str is None:
            # Report the failure here rather than tripping the type
            # assertion inside process_json_str.
            logger.error('REACH did not return any JSON output.')
            return None
        if isinstance(json_str, bytes):
            json_str = json_str.decode('utf-8')
        return process_json_str(json_str, citation)
    else:
        data = {'nxml': nxml_str}
        try:
            res = requests.post(reach_nxml_url, data)
        except requests.exceptions.RequestException as e:
            logger.error('Could not connect to REACH service:')
            logger.error(e)
            return None
        if res.status_code != 200:
            logger.error('Could not process NXML via REACH service. '
                         'Status code: %d' % res.status_code)
            return None
        json_str = res.text
        with open('reach_output.json', 'wb') as fh:
            fh.write(json_str.encode('utf-8'))
        return process_json_str(json_str, citation)


def process_nxml_file(file_name, citation=None, offline=False):
    """Return a ReachProcessor by processing the given NXML file.

    NXML is the format used by PubmedCentral for papers in the open
    access subset.

    The file is read as UTF-8 and its content is handed to
    process_nxml_str.

    Parameters
    ----------
    file_name : str
        The name of the NXML file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by default)
        the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if process_nxml_str returns None.
    """
    with open(file_name, 'rb') as f:
        nxml_str = f.read().decode('utf-8')
    # Forward the caller's choice; a hard-coded False here would silently
    # ignore offline=True and always call the web service.
    return process_nxml_str(nxml_str, citation, offline)

def process_json_file(file_name, citation=None):
    """Return a ReachProcessor by processing the given REACH json file.

    The output from the REACH parser is in this json format. This function is
    useful if the output is saved as a file and needs to be processed.
    For more information on the format, see: https://github.com/clulab/reach

    Parameters
    ----------
    file_name : str
        The name of the json file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if an IOError occurs.
    """
    try:
        with open(file_name, 'rb') as json_file:
            raw_bytes = json_file.read()
            return process_json_str(raw_bytes.decode('utf-8'), citation)
    except IOError:
        logger.error('Could not read file %s.' % file_name)
        return None


def process_json_str(json_str, citation=None):
    """Return a ReachProcessor by processing the given REACH json string.

    The output from the REACH parser is in this json format.
    For more information on the format, see: https://github.com/clulab/reach

    Parameters
    ----------
    json_str : str
        The json string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if the string cannot be decoded as JSON.
    """
    assert isinstance(json_str, basestring)
    # Rewrite the hyphenated field names to their underscore form. This is a
    # plain textual replacement, so it applies wherever these tokens occur in
    # the string.
    hyphenated_names = ('frame-id', 'argument-label', 'object-meta', 'doc-id',
                        'is-hypothesis', 'is-negated', 'is-direct',
                        'found-by')
    for name in hyphenated_names:
        json_str = json_str.replace(name, name.replace('-', '_'))
    try:
        json_dict = json.loads(json_str)
    except ValueError:
        logger.error('Could not decode JSON string.')
        return None
    rp = ReachProcessor(json_dict, citation)
    # Run each extraction step in turn, in this fixed order.
    extraction_steps = ('get_modifications', 'get_complexes',
                        'get_activation', 'get_translocation',
                        'get_regulate_amounts')
    for step in extraction_steps:
        getattr(rp, step)()
    return rp