import os
import logging
from copy import deepcopy
import indra.statements as ist
from indra.preassembler.grounding_mapper.gilda import ground_statements
logger = logging.getLogger(__name__)
class IsiProcessor(object):
"""Processes the output of the ISI reader.
Parameters
----------
reader_output : json
The output JSON of the ISI reader as a json object.
pmid : Optional[str]
The PMID to assign to the extracted Statements
extra_annotations : Optional[dict]
Annotations to be included with each extracted Statement
add_grounding : Optional[bool]
If True, Gilda is used as a service to ground the Agents in
the extracted Statements.
Attributes
----------
verbs : set[str]
A list of verbs that have appeared in the processed ISI output
statements : list[indra.statements.Statement]
Extracted statements
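
    Examples
    --------
    A minimal usage sketch; ``isi_output.json`` is a hypothetical file
    holding ISI reader output::

        import json
        with open('isi_output.json') as fh:
            ip = IsiProcessor(json.load(fh), pmid='12345')
        ip.get_statements()
        stmts = ip.statements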
"""
def __init__(self, reader_output, pmid=None, extra_annotations=None,
add_grounding=False):
self.reader_output = reader_output
self.pmid = pmid
self.extra_annotations = extra_annotations if \
extra_annotations is not None else {}
self.verbs = set()
self.statements = []
self.add_grounding = add_grounding
    def get_statements(self):
"""Process reader output to produce INDRA Statements."""
for k, v in self.reader_output.items():
for interaction in v['interactions']:
self._process_interaction(k, interaction, v['text'], self.pmid,
self.extra_annotations)
if self.add_grounding:
ground_statements(self.statements)
def _process_interaction(self, source_id, interaction, text, pmid,
extra_annotations):
"""Process an interaction JSON tuple from the ISI output, and adds up
to one statement to the list of extracted statements.
Parameters
----------
source_id : str
the JSON key corresponding to the sentence in the ISI output
interaction: the JSON list with subject/verb/object information
about the event in the ISI output
text : str
the text of the sentence
pmid : str
the PMID of the article from which the information was extracted
extra_annotations : dict
Additional annotations to add to the statement's evidence,
potentially containing metadata about the source. Annotations
with the key "interaction" will be overridden by the JSON
interaction tuple from the ISI output
"""
# Note: interaction[1] is a catalyst, but unused due to a lack of ways
# to represent it with Statements.
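        # The interaction is assumed, from the indexing here, to have the
        # shape [verb, catalyst, ..., subject, object].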
verb = interaction[0].lower()
subj = interaction[-2]
obj = interaction[-1]
        # Make ungrounded Agent objects for the subject and object;
        # grounding happens in get_statements, after all Statements have
        # been extracted
subj = self._make_agent(subj)
obj = self._make_agent(obj)
# Make an evidence object
annotations = deepcopy(extra_annotations)
if 'interaction' in extra_annotations:
logger.warning("'interaction' key of extra_annotations ignored" +
" since this is reserved for storing the raw ISI " +
"input.")
annotations['source_id'] = source_id
annotations['interaction'] = interaction
ev = ist.Evidence(source_api='isi',
pmid=pmid,
text=text.rstrip(),
annotations=annotations)
# Add the verb to the set of verbs.
self.verbs.add(verb)
statement = None
if verb in verb_to_statement_type:
statement_class = verb_to_statement_type[verb]
if statement_class == ist.Complex:
statement = ist.Complex([subj, obj], evidence=ev)
else:
statement = statement_class(subj, obj, evidence=ev)
if statement is not None:
# For Complex statements, the ISI reader produces two events:
# binds(A, B) and binds(B, A)
# We want only one Complex statement for each sentence, so check
# to see if we already have a Complex for this source_id with the
# same members
already_have = False
if type(statement) == ist.Complex:
            for old_s in self.statements:
                new_id = statement.evidence[0].source_id
                old_id = old_s.evidence[0].source_id
                if type(old_s) == ist.Complex and old_id == new_id:
old_statement_members = \
[m.db_refs['TEXT'] for m in old_s.members]
old_statement_members = sorted(old_statement_members)
new_statement_members = [m.db_refs['TEXT']
for m in statement.members]
new_statement_members = sorted(new_statement_members)
if old_statement_members == new_statement_members:
already_have = True
break
if not already_have:
self.statements.append(statement)
    @staticmethod
    def _make_agent(agent_str):
        """Make an ungrounded Agent object from a string specifying an
        entity.

        Parameters
        ----------
        agent_str : str
            A string specifying the agent.

        Returns
        -------
        agent : indra.statements.Agent
            An ungrounded Agent object referring to the specified text.
        """
        return ist.Agent(agent_str, db_refs={'TEXT': agent_str})
    def retain_molecular_complexes(self):
"""Filter the statements to Complexes between molecular entities."""
self.statements = [s for s in self.statements
if isinstance(s, ist.Complex) and
all(is_molecular(m) for m in s.members)]
def is_molecular(agent):
    """Return True if the given Agent is grounded as a molecular entity."""
    if agent is None:
        return False
    db_ns, _ = agent.get_grounding()
    # Namespaces corresponding to molecular entities: genes (HGNC),
    # proteins (UP, UPPRO), chemicals (CHEBI, PUBCHEM), and protein
    # families/complexes (FPLX)
    return db_ns in {'HGNC', 'UP', 'CHEBI', 'PUBCHEM', 'UPPRO', 'FPLX'}
# Load the mapping between ISI verb and INDRA statement type
def _build_verb_statement_mapping():
"""Build the mapping between ISI verb strings and INDRA statement classes.
Looks up the INDRA statement class name, if any, in a resource file,
and resolves this class name to a class.
Returns
-------
verb_to_statement_type : dict
Dictionary mapping verb name to an INDRA statment class
"""
path_this = os.path.dirname(os.path.abspath(__file__))
map_path = os.path.join(path_this, 'isi_verb_to_indra_statement_type.tsv')
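    # The resource file is assumed to be a two-column TSV with a header
    # row, mapping an ISI verb to an INDRA Statement class name; the rows
    # shown here are illustrative:
    #   binds<TAB>Complex
    #   phosphorylates<TAB>Phosphorylation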
    verb_to_statement_type = {}
    with open(map_path, 'r') as f:
        next(f, None)  # Skip the header line
        for line in f:
            tokens = line.rstrip('\n').split('\t')
            if len(tokens) == 2 and tokens[1]:
                verb, s_type = tokens
                # Resolve the Statement class name to a class; skip
                # entries whose mapped name is not an INDRA Statement type
                statement_class = getattr(ist, s_type, None)
                if statement_class is not None:
                    verb_to_statement_type[verb] = statement_class
    return verb_to_statement_type
verb_to_statement_type = _build_verb_statement_mapping()
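# Example lookup (assuming 'binds' appears among the mapped verbs in the
# resource file):
#   verb_to_statement_type.get('binds') is ist.Complex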