# Source code for indra.sources.reach.api

"""Methods for obtaining a reach processor containing indra statements.

Many file formats are supported. Many will run reach.
"""
import json
import logging
import requests

from indra.literature import id_lookup
import indra.literature.pmc_client as pmc_client
import indra.literature.pubmed_client as pubmed_client
from .processor import ReachProcessor


# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Offline reading is optional: the reader is imported and instantiated once
# at import time. Any failure here is logged and recorded in `try_offline`,
# which `_read_content_offline` checks before attempting to read.
try:
    # For offline reading
    from .reader import ReachReader, ReachOfflineReadingError, JavaException
    reach_reader = ReachReader()
    try_offline = True
except Exception as e:
    logger.warning('Could not import jnius, offline reading option will not '
                   'be available.')
    logger.debug(e)
    try_offline = False

# Endpoints of the remote (Arizona) REACH web service; these are the defaults
# used when no URL is given and reading is not offline.
reach_text_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/text'
reach_nxml_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/nxml'
# Endpoints of a REACH web service running on localhost.
local_text_url = 'http://localhost:8080/api/text'
local_nxml_url = 'http://localhost:8080/api/uploadFile'
# Default file (relative to the working directory) for the raw REACH output.
default_output_fname = 'reach_output.json'


def process_pmc(pmc_id, offline=False, url=None,
                output_fname=default_output_fname,
                organism_priority=None):
    """Return a ReachProcessor by processing a paper with a given PMC id.

    Uses the PMC client to obtain the full text. If it's not available,
    None is returned. The downloaded NXML is written to ``<pmc_id>.nxml``
    in the current working directory before it is processed.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 8511698, PMC8511698
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if the full text could not be obtained
        or reading produced no output.
    """
    # Loading content from PMC first
    logger.info('Loading %s from PMC', pmc_id)
    xml_str = pmc_client.get_xml(pmc_id)
    if xml_str is None:
        return None
    # Write into a file in the working folder
    fname = pmc_id + '.nxml'
    with open(fname, 'wb') as fh:
        fh.write(xml_str.encode('utf-8'))
    # Try to get the PMID for the paper so that the evidence pmid
    # attribute can be set correctly
    logger.info('Looking up PMID for %s', pmc_id)
    ids = id_lookup(pmc_id, 'pmcid')
    pmid = ids.get('pmid')
    # Now process the NXML file with the provided arguments
    logger.info('Processing %s with REACH', pmc_id)
    return process_nxml_file(fname, citation=pmid, offline=offline, url=url,
                             output_fname=output_fname,
                             organism_priority=organism_priority)


def process_pubmed_abstract(pubmed_id, offline=False, url=None,
                            output_fname=default_output_fname, **kwargs):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    Uses the Pubmed client to get the abstract. If that fails, None is
    returned.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.
        This is how `organism_priority` (a list of Taxonomy IDs providing
        prioritization among organisms when choosing protein grounding)
        and `timeout` can be supplied.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if the abstract could not be obtained
        or reading produced no output.
    """
    # Get the abstract from PubMed, if that fails, return None
    abs_txt = pubmed_client.get_abstract(pubmed_id)
    if abs_txt is None:
        return None
    # Process the text with the provided arguments
    rp = process_text(abs_txt, citation=pubmed_id, offline=offline, url=url,
                      output_fname=output_fname, **kwargs)
    # For some applications, the section type of the text is important so
    # that annotation is set here.
    if rp and rp.statements:
        for st in rp.statements:
            for ev in st.evidence:
                ev.epistemics['section_type'] = 'abstract'
    return rp


def process_text(text, citation=None, offline=False, url=None,
                 output_fname=default_output_fname, timeout=None,
                 organism_priority=None):
    """Return a ReachProcessor by processing the given text.

    Parameters
    ----------
    text : str
        The text to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. This is used when the text to be processed comes from
        a publication that is not otherwise identified. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    timeout : Optional[float]
        This only applies when reading online (`offline=False`). Only wait for
        `timeout` seconds for the api to respond.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if reading produced no output.
    """
    if offline:
        json_str = _read_content_offline(text, 'text')
    # If we are not reading offline then the old and new service interfaces
    # are the same so we can use a shared function
    else:
        if url is None:
            url = reach_text_url
        json_str = _read_text_service(text, url, timeout)

    # Nothing was read (error already logged by the reader), so there is
    # nothing to save or process
    if not json_str:
        return None

    # Keep a copy of the raw (bytes) REACH output on disk
    with open(output_fname, 'wb') as fh:
        fh.write(json_str)
    return process_json_str(json_str.decode('utf-8'), citation=citation,
                            organism_priority=organism_priority)


def process_nxml_str(nxml_str, citation=None, offline=False,
                     url=None, output_fname=default_output_fname,
                     organism_priority=None):
    """Return a ReachProcessor by processing the given NXML string.

    NXML is the format used by PubmedCentral for papers in the open
    access subset.

    Parameters
    ----------
    nxml_str : str
        The NXML string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if reading produced no output.
    """
    # Only needed for the temporary upload file below
    import os
    import tempfile

    if offline:
        json_str = _read_content_offline(nxml_str, 'nxml')
    else:
        # Use the Arizona URL by default if not given
        if url is None:
            url = reach_nxml_url
        # Print warning but proceed with reading
        if url == reach_nxml_url:
            logger.warning('Remote REACH webservice might get stuck when '
                           'processing NXML. Running local instance of REACH'
                           ' is recommended.')
            json_str = _read_nxml_str_service_old(nxml_str, url)
        # Otherwise we assume that the web service is more recent than the
        # Arizona one and requires the new protocol.
        else:
            # The new protocol uploads a file, so the string is written to
            # a uniquely named temporary file (instead of a fixed name in
            # the working directory) which is removed once the request is
            # done, even if it fails.
            fd, tmp_name = tempfile.mkstemp(suffix='.nxml')
            try:
                with os.fdopen(fd, 'wb') as fh:
                    fh.write(nxml_str.encode('utf-8'))
                json_str = _read_nxml_file_service_new(tmp_name, url)
            finally:
                os.remove(tmp_name)

    if not json_str:
        return None

    # Keep a copy of the raw (bytes) REACH output on disk
    with open(output_fname, 'wb') as fh:
        fh.write(json_str)
    return process_json_str(json_str.decode('utf-8'), citation=citation,
                            organism_priority=organism_priority)


def process_nxml_file(file_name, citation=None, offline=False,
                      url=None, output_fname=default_output_fname,
                      organism_priority=None):
    """Return a ReachProcessor by processing the given NXML file.

    NXML is the format used by PubmedCentral for papers in the open
    access subset.

    Parameters
    ----------
    file_name : str
        The name of the NXML file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if reading produced no output.
    """
    # First, if we are reading offline, we read the file and proceed
    if offline:
        with open(file_name, 'rb') as f:
            nxml_str = f.read().decode('utf-8')
        # The file is closed before the offline reader is run
        json_str = _read_content_offline(nxml_str, 'nxml')
    # If we are using the Arizona service, we use the old protocol
    elif url is None or url == reach_nxml_url:
        json_str = _read_nxml_file_service_old(file_name, url=reach_nxml_url)
    # Otherwise we use the new protocol
    else:
        json_str = _read_nxml_file_service_new(file_name, url=url)

    if not json_str:
        return None

    # Finally, we save and process the JSON output
    with open(output_fname, 'wb') as fh:
        fh.write(json_str)
    return process_json_str(json_str.decode('utf-8'), citation=citation,
                            organism_priority=organism_priority)


def process_json_file(file_name, citation=None, organism_priority=None):
    """Return a ReachProcessor by processing the given REACH json file.

    The output from the REACH parser is in this json format. This function is
    useful if the output is saved as a file and needs to be processed.
    For more information on the format, see: https://github.com/clulab/reach

    Parameters
    ----------
    file_name : str
        The name of the json file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if an IOError occurs (which is
        logged) or if the content cannot be decoded as JSON.
    """
    try:
        with open(file_name, 'rb') as fh:
            json_str = fh.read().decode('utf-8')
            return process_json_str(json_str, citation=citation,
                                    organism_priority=organism_priority)
    except IOError:
        logger.error('Could not read file %s.' % file_name)
        return None


def process_fries_json_group(group_prefix, citation=None,
                             organism_priority=None):
    """Return a ReachProcessor by processing a REACH fries output file group.

    When running REACH through its CLI, for each input file, it produces
    three output JSON files when using the fries output format. These three
    files jointly constitute the output, so they have to be combined to be
    processed. For instance, one might have PMC9582577.uaz.entities.json,
    PMC9582577.uaz.events.json, PMC9582577.uaz.sentences.json.

    Parameters
    ----------
    group_prefix : str
        The prefix for the group of output files, e.g., PMC9582577.uaz
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    file_types = ['entities', 'events', 'sentences']
    combined_json = {}
    for file_type in file_types:
        fname = '%s.%s.json' % (group_prefix, file_type)
        # Read explicitly as UTF-8 rather than relying on the locale's
        # default encoding
        with open(fname, 'r', encoding='utf-8') as fh:
            combined_json[file_type] = json.load(fh)
    # Note that we serialize back to a JSON string here to make use of the
    # replacements done in process_json_str below
    return process_json_str(json.dumps(combined_json), citation=citation,
                            organism_priority=organism_priority)


def process_json_str(json_str, citation=None, organism_priority=None):
    """Return a ReachProcessor by processing the given REACH json string.

    The output from the REACH parser is in this json format.
    For more information on the format, see: https://github.com/clulab/reach

    Parameters
    ----------
    json_str : str
        The json string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if the string could not be decoded
        as JSON.
    """
    json_dict = _preprocess_json_str(json_str)
    if json_dict is None:
        return None
    rp = ReachProcessor(json_dict, pmid=citation,
                        organism_priority=organism_priority)
    # Run each of the processor's extraction methods in turn
    rp.get_modifications()
    rp.get_complexes()
    rp.get_activation()
    rp.get_translocation()
    rp.get_regulate_amounts()
    rp.get_conversion()
    return rp


def process_agents_from_entities(file_name, organism_priority=None,
                                 with_coordinates=False):
    """Return INDRA Agents extracted from all entities, even ones not
    appearing in Statements.

    Parameters
    ----------
    file_name : str
        The name of the json file to be processed.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.
    with_coordinates : Optional[bool]
        If True, the Agents will be returned in a tuple with their
        coordinates. Default: False

    Returns
    -------
    list[Agent] :
        A list of INDRA Agents processed from all extracted entities. If
        `with_coordinates` is True, the result of the processor's
        `get_agents_from_entities_with_coords` method is returned instead.
    """
    with open(file_name, 'rb') as fh:
        json_str = fh.read().decode('utf-8')
    # NOTE(review): _preprocess_json_str returns None when the content is not
    # valid JSON; unlike process_json_str, that case is not guarded here --
    # confirm that ReachProcessor tolerates a None input.
    json_dict = _preprocess_json_str(json_str)
    rp = ReachProcessor(json_dict, organism_priority=organism_priority)
    if with_coordinates:
        return rp.get_agents_from_entities_with_coords()
    return rp.get_agents_from_entities()


def _preprocess_json_str(json_str):
    fields = ['frame-id', 'argument-label', 'object-meta',
              'doc-id', 'is-hypothesis', 'is-negated',
              'is-direct', 'found-by']
    for field in fields:
        json_str = json_str.replace(field, field.replace('-', '_'))
    try:
        json_dict = json.loads(json_str)
    except ValueError as e:
        logger.error('Could not decode JSON string.')
        logger.exception(e)
        return None
    return json_dict


def _read_content_offline(content, content_type='text'):
    """Return the JSON output of the offline REACH reader for some content.

    Parameters
    ----------
    content : str
        The content to be processed.
    content_type : str
        Either 'text' for regular text or 'nxml' for NXML content. Any other
        value results in a ValueError.

    Returns
    -------
    json_str : bytes or None
        The JSON output produced by the REACH reader, encoded as UTF-8 if
        it was not already bytes. None is returned if offline reading is
        unavailable, the ApiRuler cannot be obtained, a JavaException is
        raised during reading, or no result is found in the output.
    """
    if not try_offline:
        logger.error('Offline reading is not available.')
        return None

    try:
        ruler = reach_reader.get_api_ruler()
    except ReachOfflineReadingError as err:
        logger.error(err)
        logger.error('Cannot read offline because the REACH ApiRuler '
                     'could not be instantiated.')
        return None

    try:
        if content_type == 'nxml':
            output = ruler.annotateNxml(content, 'fries')
        elif content_type == 'text':
            output = ruler.annotateText(content, 'fries')
        else:
            raise ValueError('Invalid content_type: %s' % content_type)
    except JavaException as err:
        logger.error('Could not process %s.' % content_type)
        logger.error(err)
        return None

    # REACH version < 1.3.3 stores the output under 'resultJson', while
    # REACH version >= 1.3.3 stores it under 'result'
    payload = output.get('resultJson') or output.get('result')
    if payload is None:
        logger.warning('No results retrieved')
        return None
    if isinstance(payload, bytes):
        return payload
    return payload.encode('utf-8')


def _read_text_service(text, url=reach_text_url, timeout=None):
    """Return a json string by processing the given text with online REACH API.

    Parameters
    ----------
    text : str
        The text to be processed.
    url : Optional[str]
        URL for REACH service. By default, Arizona REACH web service is called.
    timeout : Optional[float]
        Only wait for `timeout` seconds for the api to respond.

    Returns
    -------
    json_str : bytes or None
        The json string returned by REACH API, or None if the service could
        not be reached or responded with a status code other than 200.
    """
    params = {'text': text.encode('utf-8')}
    try:
        res = requests.post(url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        logger.error('Could not connect to REACH service:')
        logger.error(e)
        return None
    # As in the NXML service helpers, an error response is not treated as
    # REACH output
    if res.status_code != 200:
        logger.error('Could not process text via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    # TODO: we could use res.json() here to get a dict
    # directly
    # This is a byte string
    return res.content


def _read_nxml_file_service_old(nxml_file, url=reach_nxml_url):
    """Return the REACH output for an NXML file sent as a string.

    The file is read as UTF-8 text and its content is handed to
    `_read_nxml_str_service_old` together with the given URL.
    """
    with open(nxml_file, 'r', encoding='utf8') as nxml_handle:
        file_content = nxml_handle.read()
    service_output = _read_nxml_str_service_old(file_content, url=url)
    return service_output


def _read_nxml_str_service_old(nxml_str, url=reach_nxml_url):
    """Return a json string by processing the given NXML string with remote
    REACH webservice.

    Parameters
    ----------
    nxml_str : str
        The NXML string to be processed.
    url : Optional[str]
        URL for REACH service. By default, Arizona REACH web service is called.

    Returns
    -------
    json_str : bytes or None
        The json string returned by REACH API, or None if the service could
        not be reached or responded with a status code other than 200.
    """
    # The NXML content is posted as the 'nxml' field of the request body
    data = {'nxml': nxml_str}
    try:
        res = requests.post(url, data)
    except requests.exceptions.RequestException as e:
        logger.error('Could not connect to REACH service:')
        logger.error(e)
        return None
    if res.status_code != 200:
        logger.error('Could not process NXML via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    return res.content


def _read_nxml_file_service_new(file_name, url=local_nxml_url):
    """Return a json string by processing the given NXML file with locally
    running instance of REACH webservice.

    Parameters
    ----------
    file_name : str
        The name of the NXML file to be processed.
    url : Optional[str]
        URL for REACH service. By default, localhost on port 8080 is called.

    Returns
    -------
    json_str : bytes or None
        The json string returned by REACH API, or None if the service could
        not be reached or responded with a status code other than 200.
    """
    # The file is uploaded as the 'file' part of a multipart request
    with open(file_name, 'rb') as f:
        try:
            res = requests.post(url, files={'file': f})
        except requests.exceptions.RequestException as e:
            logger.error('Could not connect to REACH service:')
            logger.error(e)
            return None
    if res.status_code != 200:
        logger.error('Could not process NXML via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    return res.content