# Source code for indra.sources.isi.api

from datetime import datetime

__all__ = ['process_text', 'process_nxml', 'process_preprocessed',
           'process_json_file', 'process_output_folder']

import os
import glob
import json
import shutil
import logging
import tempfile
import subprocess

from indra.sources.isi.processor import IsiProcessor
from indra.sources.isi.preprocessor import IsiPreprocessor

logger = logging.getLogger(__name__)


# Docker Hub image that packages the ISI (BigMech) reader.
DOCKER_IMAGE_NAME = 'sahilgar/bigmechisi'
# True when this module is itself executing inside the ISI docker container
# (signalled by the IN_ISI_DOCKER environment variable); run_isi() then
# invokes the reader script directly instead of launching docker.
IN_ISI_DOCKER = os.environ.get('IN_ISI_DOCKER', 'false').lower() == 'true'


class IsiRuntimeError(Exception):
    """Raised when the ISI reader process exits with a nonzero status."""
    pass


def process_text(text, pmid=None, **kwargs):
    """Read a plain text string with ISI and extract INDRA Statements.

    Parameters
    ----------
    text : str
        A text string to process
    pmid : Optional[str]
        The PMID associated with this text (or None if not specified)
    num_processes : Optional[int]
        Number of processes to parallelize over
    cleanup : Optional[bool]
        If True, the temporary folders created for preprocessed reading
        input and output are removed. Default: True
    add_grounding : Optional[bool]
        If True the extracted Statements' grounding is mapped

    Returns
    -------
    ip : indra.sources.isi.processor.IsiProcessor
        A processor containing statements
    """
    do_cleanup = kwargs.get('cleanup', True)
    # Stage the preprocessed input in a temporary directory.
    preproc_dir = tempfile.mkdtemp('indra_isi_pp_output')
    preprocessor = IsiPreprocessor(preproc_dir)
    preprocessor.preprocess_plain_text_string(text, pmid, {})
    # Run the reader and turn its output into statements.
    ip = process_preprocessed(preprocessor, **kwargs)
    if do_cleanup:
        # Drop the staged input once reading is done.
        shutil.rmtree(preproc_dir)
    else:
        logger.info('Not cleaning up %s' % preproc_dir)
    return ip
def process_nxml(nxml_filename, pmid=None, extra_annotations=None, **kwargs):
    """Read an NXML file with the ISI reader and extract INDRA Statements.

    The NXML file is first converted to plain text and preprocessed; the
    ISI reader is then run on it and its output processed into Statements.

    Parameters
    ----------
    nxml_filename : str
        nxml file to process
    pmid : Optional[str]
        pmid of this nxml file, to be added to the Evidence object of the
        extracted INDRA statements
    extra_annotations : Optional[dict]
        Additional annotations to add to the Evidence object of all
        extracted INDRA statements. Extra annotations called 'interaction'
        are ignored since this is used by the processor to store the
        corresponding raw ISI output.
    num_processes : Optional[int]
        Number of processes to parallelize over
    cleanup : Optional[bool]
        If True, the temporary folders created for preprocessed reading
        input and output are removed. Default: True
    add_grounding : Optional[bool]
        If True the extracted Statements' grounding is mapped

    Returns
    -------
    ip : indra.sources.isi.processor.IsiProcessor
        A processor containing extracted Statements
    """
    annotations = {} if extra_annotations is None else extra_annotations
    do_cleanup = kwargs.get('cleanup', True)
    # Stage the preprocessed input in a temporary directory.
    preproc_dir = tempfile.mkdtemp('indra_isi_pp_output')
    preprocessor = IsiPreprocessor(preproc_dir)
    preprocessor.preprocess_nxml_file(nxml_filename, pmid, annotations)
    # Run the reader and turn its output into statements.
    ip = process_preprocessed(preprocessor, **kwargs)
    if do_cleanup:
        # Drop the staged input once reading is done.
        shutil.rmtree(preproc_dir)
    else:
        logger.info('Not cleaning up %s' % preproc_dir)
    return ip
def _make_links(dirname, link_dir):
    """Symlink every visible file of *dirname* into *link_dir*.

    Used when running inside the modified ISI docker, where the reader
    script expects fixed /input, /output and /temp locations.
    """
    if not os.path.exists(link_dir):
        os.mkdir(link_dir)
    for entry in os.listdir(dirname):
        # Skip hidden files such as .DS_Store.
        if entry.startswith('.'):
            continue
        os.symlink(os.path.join(dirname, entry),
                   os.path.join(link_dir, entry))
    return


def run_isi(input_dir, output_dir, tmp_dir, num_processes=1, verbose=True,
            log=False):
    """Run the ISI reader over a directory of preprocessed input.

    Parameters
    ----------
    input_dir : str
        Directory of preprocessed input files for the reader.
    output_dir : str
        Directory in which the reader's JSON output is placed.
    tmp_dir : str
        Scratch directory used by the reader.
    num_processes : Optional[int]
        Number of processes the reader parallelizes over. Default: 1
    verbose : Optional[bool]
        If True, echo each line of reader output to the log. Default: True
    log : Optional[bool]
        If True, append the reader output to 'isi_run.log'. Default: False

    Raises
    ------
    IsiRuntimeError
        If the reader process exits with a nonzero status.
    """
    base_command = ['/root/myprocesspapers.sh', '-c', str(num_processes)]
    if IN_ISI_DOCKER:
        # Already inside the container: expose the input via symlinks and
        # create the fixed locations the reader script expects.
        _make_links(input_dir, '/input')
        os.mkdir('/output')
        os.mkdir('/temp')
        command = base_command
    else:
        # realpath resolves any symbolic links, which docker volume
        # mounts require - this is needed on Mac.
        bindings = [os.path.realpath(input_dir) + ':/input:ro',
                    os.path.realpath(output_dir) + ':/output:rw',
                    os.path.realpath(tmp_dir) + ':/temp:rw']
        command = ['docker', 'run', '-it', '--rm']
        for binding in bindings:
            command += ['-v', binding]
        command += [DOCKER_IMAGE_NAME]
        command += base_command

    # Invoke the ISI reader.
    logger.info('Running command from within the docker:' if IN_ISI_DOCKER
                else 'Running command using the docker:')
    logger.info(' '.join(command))
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    # Stream the reader's output until it ends, optionally echoing and/or
    # recording each line.
    recorded = []
    for raw_line in iter(proc.stdout.readline, b''):
        line = 'ISI: ' + raw_line.strip().decode('utf8')
        if verbose:
            logger.info(line)
        if log:
            recorded.append(line + '\n')
    if log:
        with open('isi_run.log', 'ab') as fh:
            fh.write(''.join(recorded).encode('utf8'))

    p_out, _ = proc.communicate()
    if proc.returncode:
        logger.error('Problem running ISI:')
        logger.error('Stdout & Stderr: %s' % p_out.decode('utf-8'))
        raise IsiRuntimeError("Problem encountered running ISI.")
    logger.info("ISI finished.")

    if IN_ISI_DOCKER:
        # Surface the in-container results where the caller asked for them.
        _make_links('/output', output_dir)
        _make_links('/temp', tmp_dir)
    return


def get_isi_image_data():
    """Get the json data for the ISI docker image."""
    if IN_ISI_DOCKER:
        logger.error("Cannot read docker info from within the docker.")
        return {}
    result = subprocess.run(['docker', 'image', 'inspect', DOCKER_IMAGE_NAME],
                            stdout=subprocess.PIPE)
    return json.loads(result.stdout)[0]


def get_isi_version():
    """Return the ISI reader's version as a YYYYMMDD date string."""
    if IN_ISI_DOCKER:
        # Inside the container the reader script's mtime stands in for a
        # version stamp.
        dt = datetime.fromtimestamp(
            os.path.getmtime('/root/myprocesspapers.sh'))
    else:
        # Otherwise use the image creation time reported by docker.
        created = get_isi_image_data()['Created']
        dt = datetime.strptime(created.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    return dt.strftime('%Y%m%d')
def process_preprocessed(isi_preprocessor, num_processes=1, output_dir=None,
                         cleanup=True, add_grounding=True):
    """Process a directory of abstracts and/or papers preprocessed using the
    specified IsiPreprocessor, to produce a list of extracted INDRA
    statements.

    Parameters
    ----------
    isi_preprocessor : indra.sources.isi.preprocessor.IsiPreprocessor
        Preprocessor object that has already preprocessed the documents we
        want to read and process with the ISI reader
    num_processes : Optional[int]
        Number of processes to parallelize over
    output_dir : Optional[str]
        The directory into which to put reader output; if omitted or None,
        uses a temporary directory.
    cleanup : Optional[bool]
        If True, the temporary folders created for preprocessed reading
        input and output are removed. Default: True
    add_grounding : Optional[bool]
        If True the extracted Statements' grounding is mapped

    Returns
    -------
    ip : indra.sources.isi.processor.IsiProcessor
        A processor containing extracted statements
    """
    # Remember whether we created the output directory ourselves: the
    # original code reassigned output_dir and then tested `output_dir is
    # None` again, so the temporary output directory was never removed.
    made_output_dir = output_dir is None
    if made_output_dir:
        output_dir = tempfile.mkdtemp('indra_isi_processor_output')
    else:
        output_dir = os.path.abspath(output_dir)
    tmp_dir = tempfile.mkdtemp('indra_isi_processor_tmp')

    # Run the ISI reader on the preprocessed input directory.
    dir_name = isi_preprocessor.preprocessed_dir
    run_isi(dir_name, output_dir, tmp_dir, num_processes)

    # Process each reader output file into an IsiProcessor.
    ips = []
    for fname, pmid, extra_annots in isi_preprocessor.iter_outputs(
            output_dir):
        ip = process_json_file(fname, pmid=pmid,
                               extra_annotations=extra_annots,
                               add_grounding=add_grounding)
        ips.append(ip)

    # Remove the output directory only if we created it; a caller-supplied
    # output_dir is left in place.
    if made_output_dir:
        if cleanup:
            shutil.rmtree(output_dir)
        else:
            logger.info('Not cleaning up %s' % output_dir)
    if cleanup:
        shutil.rmtree(tmp_dir)
    else:
        # Fixed: previously logged output_dir here instead of tmp_dir.
        logger.info('Not cleaning up %s' % tmp_dir)

    # Merge all statements into the first processor and return it.
    if len(ips) > 1:
        for ip in ips[1:]:
            ips[0].statements += ip.statements

    if ips:
        return ips[0]
    else:
        return None
def process_output_folder(folder_path, pmids=None, extra_annotations=None,
                          add_grounding=True):
    """Recursively extracts statements from all ISI output files in the
    given directory and subdirectories.

    Parameters
    ----------
    folder_path : str
        The directory to traverse
    pmids : Optional[str]
        PMID mapping to be added to the Evidence of the extracted INDRA
        Statements
    extra_annotations : Optional[dict]
        Additional annotations to add to the Evidence object of all
        extracted INDRA statements. Extra annotations called 'interaction'
        are ignored since this is used by the processor to store the
        corresponding raw ISI output.
    add_grounding : Optional[bool]
        If True the extracted Statements' grounding is mapped
    """
    pmids = pmids if pmids is not None else {}
    extra_annotations = extra_annotations if \
        extra_annotations is not None else {}
    ips = []
    # Fixed: the docstring promises recursion into subdirectories but the
    # glob pattern only matched top-level files; '**' with recursive=True
    # matches JSON files at any depth (including folder_path itself).
    for entry in glob.glob(os.path.join(folder_path, '**', '*.json'),
                           recursive=True):
        entry_key = os.path.splitext(os.path.basename(entry))[0]
        # Look up the file id to find its PMID and extra annotations.
        pmid = pmids.get(entry_key)
        extra_annotation = extra_annotations.get(entry_key)
        ip = process_json_file(entry, pmid, extra_annotation,
                               add_grounding=add_grounding)
        ips.append(ip)

    # Merge all statements into the first processor and return it.
    if len(ips) > 1:
        for ip in ips[1:]:
            ips[0].statements += ip.statements

    if ips:
        return ips[0]
    else:
        return None
def process_json_file(file_path, pmid=None, extra_annotations=None,
                      add_grounding=True):
    """Extract INDRA Statements from a single ISI output file.

    Parameters
    ----------
    file_path : str
        The ISI output file from which to extract statements
    pmid : int
        The PMID of the document being preprocessed, or None if not
        specified
    extra_annotations : dict
        Extra annotations to be added to each statement from this document
        (can be the empty dictionary)
    add_grounding : Optional[bool]
        If True the extracted Statements' grounding is mapped
    """
    logger.info('Extracting from %s' % file_path)
    with open(file_path, 'rb') as fh:
        content = json.load(fh)
    processor = IsiProcessor(content, pmid, extra_annotations,
                             add_grounding=add_grounding)
    processor.get_statements()
    return processor