# Source code for indra.assemblers.english_assembler

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import logging
import indra.statements as ist

# Module-level logger; the assembler uses it to warn about statement
# types it does not know how to render.
logger = logging.getLogger('english_assembler')

class EnglishAssembler(object):
    """This assembler generates English sentences from INDRA Statements.

    Parameters
    ----------
    stmts : Optional[list[indra.statements.Statement]]
        A list of INDRA Statements to be added to the assembler.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements to assemble.
    model : str
        The assembled sentences as a single string. It is None until
        make_model has been called.
    """
    def __init__(self, stmts=None):
        # NOTE: a list passed in is stored by reference (not copied), so
        # add_statements also extends the caller's list.
        self.statements = [] if stmts is None else stmts
        self.model = None

    def add_statements(self, stmts):
        """Add INDRA Statements to the assembler's list of statements.

        Parameters
        ----------
        stmts : list[indra.statements.Statement]
            A list of :py:class:`indra.statements.Statement`
            to be added to the statement list of the assembler.
        """
        self.statements += stmts

    def make_model(self):
        """Assemble text from the set of collected INDRA Statements.

        Each Statement of a supported type is rendered as one sentence;
        Statements of any other type are skipped with a logged warning.
        The result is also stored in the ``model`` attribute.

        Returns
        -------
        stmt_strs : str
            Return the assembled text as unicode string. By default, the text
            is a single string consisting of one or more sentences with
            periods at the end. An empty string is returned if no Statement
            could be assembled.
        """
        stmt_strs = []
        for stmt in self.statements:
            if isinstance(stmt, ist.Modification):
                stmt_strs.append(_assemble_modification(stmt))
            elif isinstance(stmt, ist.Autophosphorylation):
                stmt_strs.append(_assemble_autophosphorylation(stmt))
            elif isinstance(stmt, ist.Complex):
                stmt_strs.append(_assemble_complex(stmt))
            elif isinstance(stmt, ist.RegulateActivity):
                stmt_strs.append(_assemble_regulate_activity(stmt))
            elif isinstance(stmt, ist.RegulateAmount):
                stmt_strs.append(_assemble_regulate_amount(stmt))
            elif isinstance(stmt, ist.ActiveForm):
                stmt_strs.append(_assemble_activeform(stmt))
            elif isinstance(stmt, ist.Translocation):
                stmt_strs.append(_assemble_translocation(stmt))
            elif isinstance(stmt, ist.Gef):
                stmt_strs.append(_assemble_gef(stmt))
            elif isinstance(stmt, ist.Gap):
                stmt_strs.append(_assemble_gap(stmt))
            else:
                logger.warning('Unhandled statement type: %s.', type(stmt))
        # Joining an empty list yields '', so no special case is needed.
        # Keep the result on the instance so the documented ``model``
        # attribute reflects the last assembly.
        self.model = ' '.join(stmt_strs)
        return self.model

def _assemble_agent_str(agent):
    """Assemble an Agent object to text.

    Starting from the agent's name, the text is extended with the agent's
    mutations, location, bound conditions, modification conditions and
    activity condition, in that order.

    Parameters
    ----------
    agent : object
        The agent to describe. The attributes read are ``name``,
        ``mutations``, ``location``, ``mods``, ``bound_conditions`` and
        ``activity``.

    Returns
    -------
    str
        The English description of the agent.
    """
    agent_str = agent.name

    # Handle mutation conditions: each mutation is written as
    # <residue_from><position><residue_to> with missing parts left out,
    # and multiple mutations are separated by slashes
    if agent.mutations:
        mut_strs = ['%s%s%s' % (mut.residue_from or '',
                                mut.position or '',
                                mut.residue_to or '')
                    for mut in agent.mutations]
        agent_str = '%s-%s' % (agent_str, '/'.join(mut_strs))

    # Handle location
    if agent.location is not None:
        agent_str += ' in the ' + agent.location

    # Nothing else to describe
    if not agent.mods and not agent.bound_conditions and not agent.activity:
        return agent_str

    # Handle bound conditions
    bound_to = [bc.agent.name for bc in
                agent.bound_conditions if bc.is_bound]
    not_bound_to = [bc.agent.name for bc in
                    agent.bound_conditions if not bc.is_bound]
    if bound_to:
        agent_str += ' bound to ' + _join_list(bound_to)
        if not_bound_to:
            agent_str += ' and not bound to ' + _join_list(not_bound_to)
    elif not_bound_to:
        agent_str += ' not bound to ' + _join_list(not_bound_to)

    # Handle modification conditions
    if agent.mods:
        # Special case: a single modification without a position is put
        # in front of the agent as an adjective
        if len(agent.mods) == 1 and agent.mods[0].position is None:
            prefix = _mod_state_str(agent.mods[0].mod_type)
            if agent.mods[0].residue is not None:
                residue_str = \
                    ist.amino_acids[agent.mods[0].residue]['full_name']
                prefix = residue_str + '-' + prefix
            agent_str = prefix + ' ' + agent_str
        else:
            if agent.bound_conditions:
                agent_str += ' and'
            # Only the type of the first modification is used for the
            # state word; the sites of all modifications are listed
            agent_str += ' %s on ' % _mod_state_str(agent.mods[0].mod_type)
            mod_lst = []
            for m in agent.mods:
                if m.position is None:
                    if m.residue is not None:
                        residue_str = \
                            ist.amino_acids[m.residue]['full_name']
                        mod_lst.append(residue_str)
                    else:
                        mod_lst.append('an unknown residue')
                elif m.residue is None:
                    # Position known but residue unknown; concatenating
                    # None with the position would raise a TypeError
                    mod_lst.append('amino acid at %s' % m.position)
                else:
                    mod_lst.append('%s%s' % (m.residue, m.position))
            agent_str += _join_list(mod_lst)

    # Handle activity conditions
    if agent.activity is not None:
        # TODO: handle activity types
        if agent.activity.is_active:
            prefix = 'active'
        else:
            prefix = 'inactive'
        agent_str = prefix + ' ' + agent_str

    return agent_str

def _join_list(lst):
    """Join a list of words in a grammatically correct way.

    All but the last word are separated by commas and the last word is
    attached with 'and' (without a comma before it). A single word is
    returned unchanged and an empty list gives an empty string.
    """
    if not lst:
        return ''
    if len(lst) == 1:
        return lst[0]
    # For exactly two words the comma-join of the head is just the first
    # word, so this covers both the two-word and the longer case
    head = ', '.join(lst[:-1])
    return head + ' and ' + lst[-1]

def _assemble_activeform(stmt):
    """Assemble ActiveForm statements into text.

    The sentence states that the agent (``stmt.agent``) is active or
    inactive (``stmt.is_active``), qualified by the activity type in
    ``stmt.activity``. An activity type without a dedicated phrasing is
    rendered with the generic "is active" / "is inactive" wording.
    """
    subj_str = _assemble_agent_str(stmt.agent)
    is_active_str = 'active' if stmt.is_active else 'inactive'
    # Text linking the agent to the active/inactive word, per activity type
    activity_phrases = {
        'activity': ' is ',
        'kinase': ' is kinase-',
        'phosphatase': ' is phosphatase-',
        'catalytic': ' is catalytically ',
        'transcription': ' is transcriptionally ',
        'gtpbound': ' is GTP-bound ',
    }
    # Fall back to the generic phrasing for any other activity type;
    # without a fallback the sentence would be left undefined and an
    # UnboundLocalError raised
    phrase = activity_phrases.get(stmt.activity, ' is ')
    return _make_sentence(subj_str + phrase + is_active_str)

def _assemble_modification(stmt):
    """Assemble Modification statements into text.

    With an enzyme the sentence is "<enzyme> <verb> <substrate>" for direct
    statements and "<enzyme> leads to the <noun> of <substrate>" otherwise;
    without an enzyme it is "<substrate> is <state>". If a residue is given,
    the modified site is appended.
    """
    substrate = _assemble_agent_str(stmt.sub)
    if stmt.enz is None:
        sentence = substrate + ' is ' + _mod_state_stmt(stmt)
    else:
        enzyme = _assemble_agent_str(stmt.enz)
        if _get_is_direct(stmt):
            link = ' ' + _mod_process_verb(stmt) + ' '
        else:
            link = ' leads to the ' + _mod_process_noun(stmt) + ' of '
        sentence = enzyme + link + substrate

    # The site is only described when the residue is known
    if stmt.residue is None:
        site = ''
    elif stmt.position is None:
        site = 'on ' + ist.amino_acids[stmt.residue]['full_name']
    else:
        site = 'on ' + stmt.residue + stmt.position
    # A trailing space left by an empty site is stripped by _make_sentence
    return _make_sentence(sentence + ' ' + site)

def _assemble_complex(stmt):
    """Assemble Complex statements into text.

    The first member is the subject of "binds" and the remaining members
    are listed as its binding partners.
    """
    rendered = []
    for member in stmt.members:
        rendered.append(_assemble_agent_str(member))
    subject = rendered[0]
    partners = _join_list(rendered[1:])
    return _make_sentence(subject + ' binds ' + partners)

def _assemble_autophosphorylation(stmt):
    """Assemble Autophosphorylation statements into text.

    The sentence is "<enzyme> phosphorylates itself", followed by the
    modified site if a residue is given.
    """
    sentence = _assemble_agent_str(stmt.enz) + ' phosphorylates itself'
    # The site is only described when the residue is known
    if stmt.residue is None:
        site = ''
    elif stmt.position is None:
        site = 'on ' + ist.amino_acids[stmt.residue]['full_name']
    else:
        site = 'on ' + stmt.residue + stmt.position
    # A trailing space left by an empty site is stripped by _make_sentence
    return _make_sentence(sentence + ' ' + site)

def _assemble_regulate_activity(stmt):
    """Assemble RegulateActivity statements into text.

    The sentence is "<subject> activates <object>" when
    ``stmt.is_activation`` is true and "<subject> inhibits <object>"
    otherwise.
    """
    regulator = _assemble_agent_str(stmt.subj)
    target = _assemble_agent_str(stmt.obj)
    verb = ' activates ' if stmt.is_activation else ' inhibits '
    return _make_sentence(regulator + verb + target)

def _assemble_regulate_amount(stmt):
    """Assemble RegulateAmount statements into text.

    With a subject the sentence says that the subject increases or
    decreases the amount of the object; without a subject it says that the
    object is produced or degraded. A statement that is neither an
    IncreaseAmount nor a DecreaseAmount leaves the sentence undefined and
    results in an UnboundLocalError.
    """
    target = _assemble_agent_str(stmt.obj)
    if stmt.subj is None:
        # No regulator known: describe what happens to the object itself
        if isinstance(stmt, ist.IncreaseAmount):
            sentence = target + ' is produced'
        elif isinstance(stmt, ist.DecreaseAmount):
            sentence = target + ' is degraded'
        return _make_sentence(sentence)

    regulator = _assemble_agent_str(stmt.subj)
    if isinstance(stmt, ist.IncreaseAmount):
        verb = ' increases the amount of '
    elif isinstance(stmt, ist.DecreaseAmount):
        verb = ' decreases the amount of '
    return _make_sentence(regulator + verb + target)

def _assemble_translocation(stmt):
    """Assemble Translocation statements into text.

    The sentence is "<agent> translocates", optionally followed by the
    source location and then the target location when they are given.
    """
    pieces = [_assemble_agent_str(stmt.agent), ' translocates']
    if stmt.from_location is not None:
        pieces.append(' from the ' + stmt.from_location)
    if stmt.to_location is not None:
        pieces.append(' to the ' + stmt.to_location)
    return _make_sentence(''.join(pieces))

def _assemble_gap(stmt):
    """Assemble Gap statements into text.

    The sentence is "<gap> is a GAP for <ras>".
    """
    gap_text = _assemble_agent_str(stmt.gap)
    ras_text = _assemble_agent_str(stmt.ras)
    return _make_sentence(''.join([gap_text, ' is a GAP for ', ras_text]))

def _assemble_gef(stmt):
    """Assemble Gef statements into text.

    The sentence is "<gef> is a GEF for <ras>".
    """
    gef_text = _assemble_agent_str(stmt.gef)
    ras_text = _assemble_agent_str(stmt.ras)
    return _make_sentence(''.join([gef_text, ' is a GEF for ', ras_text]))

def _make_sentence(txt):
    """Make a sentence from a piece of text.

    Leading and trailing spaces are removed, the first character is
    upper-cased (the rest is left untouched) and a period is appended.

    Parameters
    ----------
    txt : str
        The text to turn into a sentence.

    Returns
    -------
    str
        The sentence, or an empty string if the text is empty or consists
        of spaces only.
    """
    txt = txt.strip(' ')
    # Guard against empty text, for which indexing the first character
    # would raise an IndexError
    if not txt:
        return ''
    # Make sure first letter is capitalized
    return txt[0].upper() + txt[1:] + '.'

def _get_is_direct(stmt):
    '''Returns true if there is evidence that the statement is a direct
    interaction. If any of the evidences associated with the statement
    indicates a direct interaction then we assume the interaction
    is direct. If there is no evidence for the interaction being indirect
    then we default to direct.'''
    saw_indirect = False
    for ev in stmt.evidence:
        direct = ev.epistemics.get('direct')
        if direct is True:
            # A single direct evidence is enough
            return True
        if direct is False:
            # Remember that at least one evidence says indirect
            saw_indirect = True
    # Without any explicit evidence either way we default to direct
    return not saw_indirect

def _get_is_hypothesis(stmt):
    '''Returns true if there is evidence that the statement is only
    hypothetical. If all of the evidences associated with the statement
    indicate a hypothetical interaction then we assume the interaction
    is hypothetical. A statement without any evidence is not considered
    hypothetical.'''
    evidence = stmt.evidence
    # No evidence at all means there is nothing marking the statement
    # as a hypothesis
    if not evidence:
        return False
    # A single evidence that is not flagged as a hypothesis is enough to
    # make the statement non-hypothetical
    return all(ev.epistemics.get('hypothesis') is True for ev in evidence)

def _get_is_hypothesis_adverb(stmt):
    '''Returns the adverb (padded with spaces) to use when the statement
    is hypothetical, and an empty string otherwise.'''
    return ' hypothetically ' if _get_is_hypothesis(stmt) else ''

def _mod_process_verb(stmt):
    """Return the verb for the statement's modification type.

    The lower-cased class name of the statement is looked up in
    mod_process_prefix; None is returned if there is no entry for it.
    """
    return mod_process_prefix.get(stmt.__class__.__name__.lower())

def _mod_process_noun(stmt):
    """Return the noun for the statement's modification type, which is
    the lower-cased class name of the statement."""
    return stmt.__class__.__name__.lower()

def _mod_state_stmt(stmt):
    """Return the state word for the statement's modification type.

    The lower-cased class name of the statement is looked up in
    mod_state_prefix; None is returned if there is no entry for it.
    """
    return mod_state_prefix.get(stmt.__class__.__name__.lower())

def _mod_state_str(s):
    """Return the state word for the modification type name ``s`` as
    found in mod_state_prefix, or None if there is no entry for it."""
    state = mod_state_prefix.get(s)
    return state

# Maps the lower-cased name of a modification type to the word describing
# the modified state (e.g. 'phosphorylation' -> 'phosphorylated'). Each
# stem contributes the modification itself and its 'de'-prefixed reverse.
mod_state_prefix = {
    prefix + stem + 'ation': prefix + stem + 'ated'
    for stem in ('phosphoryl', 'ubiquitin', 'acetyl', 'hydroxyl',
                 'sumoyl', 'farnesyl', 'glycosyl', 'ribosyl')
    for prefix in ('', 'de')
}

# Maps the lower-cased name of a modification type to the verb describing
# the process (e.g. 'phosphorylation' -> 'phosphorylates'). Each stem
# contributes the modification itself and its 'de'-prefixed reverse.
mod_process_prefix = {
    prefix + stem + 'ation': prefix + stem + 'ates'
    for stem in ('phosphoryl', 'ubiquitin', 'acetyl', 'hydroxyl',
                 'sumoyl', 'farnesyl', 'glycosyl', 'ribosyl')
    for prefix in ('', 'de')
}