Source code for indra.sources.tees.api

# -*- coding: utf-8 -*-
"""
This module provides a simplified API for invoking the Turku Event Extraction
System (TEES) on text and extracting INDRA statements from TEES output.

See publication:
Jari Björne, Sofie Van Landeghem, Sampo Pyysalo, Tomoko Ohta, Filip Ginter,
Yves Van de Peer, Sofia Ananiadou and Tapio Salakoski, PubMed-Scale Event
Extraction for Post-Translational Modifications, Epigenetics and Protein
Structural Relations. Proceedings of BioNLP 2012, pages 82-90, 2012.
"""

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
from indra.sources.tees.processor import TEESProcessor
from indra import get_config

import os.path
import logging
import codecs
import tempfile
import shutil
import subprocess
import glob
import gzip
import tarfile
import re

from indra.sources.tees.parse_tees import tees_parse_networkx_to_dot
import networkx.algorithms.dag as dag

# Names exported by ``from indra.sources.tees.api import *``.
__all__ = ['run_on_text', 'process_text', 'extract_output']

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# If the TEES_PATH configuration value is not set, run_on_text checks each of
# these directories in order (after expanding '~') and uses the first one that
# looks like a TEES installation. A directory qualifies when it contains every
# file in tees_installation_files and every sub-directory in
# tees_installation_dirs. Relative entries such as '../TEES' are resolved
# against the current working directory.
tees_candidate_paths = ['../TEES', '~/TEES', '~/Downloads/TEES']
# Regular files expected directly inside a TEES installation directory.
tees_installation_files = ['batch.py', 'classify.py', 'train.py',
                           'visualize.py']
# Sub-directories expected directly inside a TEES installation directory.
tees_installation_dirs = ['Classifiers', 'Detectors', 'Evaluators', 'Core']


def process_text(text, pmid=None, python2_path=None):
    """Process plain text with TEES and convert the output to INDRA statements.

    The TEES installation is located by run_on_text: first through the
    TEES_PATH configuration value (environment variable or configuration
    file), then by probing the directories in tees_candidate_paths. An
    exception is raised if TEES cannot be found in any of these places.

    Parameters
    ----------
    text : str
        Plain text to process with TEES
    pmid : str
        The PMID of the paper the text comes from, to be stored in the
        Evidence object of statements. Set to None if this is unspecified.
    python2_path : str
        TEES is only compatible with python 2. This processor invokes this
        external python 2 interpreter so that the processor can be run in
        either python 2 or python 3. If None, the directories listed in the
        PATH environment variable are searched for an executable named
        python2.7 and, if none is found, for one named python2.

    Returns
    -------
    tp : TEESProcessor
        A TEESProcessor object which contains a list of INDRA statements
        extracted from TEES extractions

    Raises
    ------
    Exception
        If python2_path is None and no python 2 interpreter can be found in
        the directories listed in PATH.
    """
    # Try to locate python 2 in one of the directories of the PATH environment
    # variable if it is not provided
    if python2_path is None:
        # An unset PATH is treated as "no directories to search" so that the
        # failure is reported by the descriptive exception below rather than
        # by a bare KeyError.
        path_env = os.environ.get('PATH')
        search_dirs = path_env.split(os.pathsep) if path_env else []

        # Every directory is checked for python2.7 before falling back to
        # python2, so an interpreter found by the original python2.7-only
        # search is still the one that gets picked.
        for exe_name in ('python2.7', 'python2'):
            for search_dir in search_dirs:
                candidate = os.path.join(search_dir, exe_name)
                if os.path.isfile(candidate):
                    python2_path = candidate
                    break
            if python2_path is not None:
                break

        if python2_path is None:
            raise Exception('Could not find python2 in the directories ' +
                            'listed in the PATH environment variable. ' +
                            'Need python2 to run TEES.')
        logger.info('Found python 2 interpreter at %s', python2_path)

    # Run TEES
    a1_text, a2_text, sentence_segmentations = run_on_text(text, python2_path)

    # Run the TEES processor
    return TEESProcessor(a1_text, a2_text, sentence_segmentations, pmid)

def run_on_text(text, python2_path):
    """Run TEES on the given text and return the contents of its output.

    The text is written to a temporary directory, TEES' classify.py is run on
    it with the given python 2 interpreter, and the resulting output files are
    read back with extract_output. The temporary directory is always removed
    before this function returns or raises. TEES output is not converted to
    INDRA statements here.

    Parameters
    ----------
    text : str
        Text from which to extract relationships
    python2_path : str
        The path to the python 2 interpreter

    Returns
    -------
    a1_text : str
        The text of the TEES a1 file (specifying the entities)
    a2_text : str
        The text of the TEES a2 file (specifying the event graph)
    sentence_segmentations : str
        The text of the XML file specifying the sentence segmentation

    Raises
    ------
    Exception
        If no TEES installation directory can be found, or if the expected
        TEES output files are missing (raised by extract_output).
    """
    tees_path = get_config('TEES_PATH')

    if tees_path is None:
        # If the TEES directory is not specified, see if any of the candidate
        # paths exist and contain all of the files and directories expected
        # for a TEES installation.
        for candidate in tees_candidate_paths:
            candidate = os.path.expanduser(candidate)
            if not os.path.isdir(candidate):
                continue
            has_expected_files = all(
                os.path.isfile(os.path.join(candidate, f))
                for f in tees_installation_files)
            has_expected_dirs = all(
                os.path.isdir(os.path.join(candidate, d))
                for d in tees_installation_dirs)
            if has_expected_files and has_expected_dirs:
                # Everything we expect in a TEES installation is present -
                # let's assume it is one
                tees_path = candidate
                logger.info('Found TEES installation at %s', candidate)
                break

    # Make sure we ended up with a TEES directory that exists
    if tees_path is None:
        raise Exception('Could not find a TEES installation. Set TEES_PATH '
                        'or install TEES in one of: %s' %
                        ', '.join(tees_candidate_paths))
    if not os.path.isdir(tees_path):
        raise Exception('Provided TEES directory does not exist.')

    # Use an absolute path so that neither the model path nor the working
    # directory handed to the subprocess depends on our own working directory
    tees_path = os.path.abspath(tees_path)

    # classify.py is given relative to tees_path, which is the working
    # directory of the subprocess below
    classify_path = 'classify.py'

    # Create a temporary directory to hold the input text and TEES output
    tmp_dir = tempfile.mkdtemp(suffix='indra_tees_processor')
    try:
        # Write text to a file in the temporary directory
        text_path = os.path.join(tmp_dir, 'text.txt')
        # Had some trouble with non-ascii characters. A possible TODO item in
        # the future is to look into resolving this, for now just ignoring
        # non-latin-1 characters
        with codecs.open(text_path, 'w', encoding='latin-1',
                         errors='ignore') as f:
            f.write(text)

        # Run TEES
        output_path = os.path.join(tmp_dir, 'output')
        model_path = os.path.join(tees_path, 'tees_data/models/GE11-test/')
        command = [python2_path, classify_path, '-m', model_path,
                   '-i', text_path,
                   '-o', output_path]
        # cwd= runs TEES inside its own directory without changing the
        # working directory of this process.
        p = subprocess.Popen(command, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, cwd=tees_path)
        # communicate() both drains the pipes and waits for the process;
        # calling wait() first can deadlock once a pipe buffer fills up.
        so, se = p.communicate()
        logger.info('TEES stdout:\n%s', so.decode('utf-8', 'replace'))
        logger.info('TEES stderr:\n%s', se.decode('utf-8', 'replace'))
        if p.returncode != 0:
            logger.warning('TEES exited with return code %d', p.returncode)

        # Read the TEES output files back in
        return extract_output(tmp_dir)
    finally:
        # Remove the temporary directory whether or not anything failed
        shutil.rmtree(tmp_dir)


def extract_output(output_dir):
    """Extract the text of the a1, a2, and sentence segmentation files from the
    TEES output directory. These files are located within compressed archives.

    The sentence segmentation is read from the single file matching
    ``*-preprocessed*.xml.gz``; the a1 and a2 files are read from the single
    tarball matching ``*-events.tar.gz``. Tarball members are read directly
    into memory and nothing is written to disk.

    Parameters
    ----------
    output_dir : str
        Directory containing the output of the TEES system

    Returns
    -------
    a1_text : str
        The text of the TEES a1 file (specifying the entities)
    a2_text : str
        The text of the TEES a2 file (specifying the event graph)
    sentence_segmentations : str
        The text of the XML file specifying the sentence segmentation

    Raises
    ------
    Exception
        If there is not exactly one sentence segmentation file, not exactly
        one events tarball, or the tarball does not contain exactly one .a1
        and one .a2 file.
    """
    # Locate the file of sentences segmented by the TEES system, described
    # in a compressed xml document
    sentences_glob = os.path.join(output_dir, '*-preprocessed*.xml.gz')
    sentences_filename_candidates = glob.glob(sentences_glob)

    # Make sure there is exactly one such file
    if len(sentences_filename_candidates) != 1:
        m = 'Looking for exactly one file matching %s but found %d matches'
        raise Exception(m % (
            sentences_glob, len(sentences_filename_candidates)))

    # Read in the sentence segmentation XML
    with gzip.GzipFile(sentences_filename_candidates[0], 'r') as f:
        sentence_segmentations = f.read().decode('utf-8')

    # Make sure the tarfile with the extracted events in shared task format
    # is in the output directory
    tarfile_glob = os.path.join(output_dir, '*-events.tar.gz')
    candidate_tarfiles = glob.glob(tarfile_glob)
    if len(candidate_tarfiles) != 1:
        raise Exception('Expected exactly one match for glob %s' %
                        tarfile_glob)

    # Members are read with extractfile() rather than extracted to disk, so a
    # member name can never cause a file to be written outside a target
    # directory (see the warning in the TarFile.extractall documentation).
    with tarfile.open(candidate_tarfiles[0]) as tar_file:
        a1_member = None
        a2_member = None
        selected = []
        for member in tar_file.getmembers():
            # Only regular files whose name starts with an alphanumeric
            # character and ends in exactly .a1 or .a2 are of interest
            if not member.isfile():
                continue
            if not re.match(r'[a-zA-Z0-9].*\.a[12]\Z', member.name):
                continue
            selected.append(member)
            if member.name.endswith('.a1'):
                a1_member = member
            else:
                a2_member = member

        # There should be exactly two files that match these criteria
        if len(selected) != 2 or a1_member is None or a2_member is None:
            raise Exception('We thought there would be one .a1 and one .a2' +
                            ' file in the tarball, but we got %d files total' %
                            len(selected))

        # Read the text of the a1 (entities) and a2 (events) files
        a1_text = tar_file.extractfile(a1_member).read().decode('utf-8')
        a2_text = tar_file.extractfile(a2_member).read().decode('utf-8')

    # Return the extracted text
    return a1_text, a2_text, sentence_segmentations