# Source code for indra.assemblers.tsv.assembler

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import logging
from copy import copy
from indra.databases import get_identifiers_url
from indra.statements import *
from indra.util import write_unicode_csv


# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class TsvAssembler(object):
    """Assembles Statements into a set of tabular files for export or curation.

    Currently designed for use with "raw" Statements, i.e., Statements with a
    single evidence entry. Exports Statements into a single tab-separated file
    with the following columns:

    *INDEX*
        A 1-indexed integer identifying the statement. The index is the
        position of the Statement in the assembler's list, so skipped
        Statements (those with more than two agents) leave gaps.
    *UUID*
        The UUID of the Statement.
    *TYPE*
        Statement type, given by the name of the class in indra.statements.
    *STR*
        String representation of the Statement. Contains most relevant
        information for curation including any additional statement data
        beyond the Statement type and Agents.
    *AG_A_TEXT*
        For Statements extracted from text, the text in the sentence
        corresponding to the first agent (i.e., the 'TEXT' entry in the
        db_refs dictionary). For all other Statements, the Agent name is
        given. Empty field if the Agent is None.
    *AG_A_LINKS*
        Groundings for the first agent given as a comma-separated list of
        identifiers.org links. Empty if the Agent is None.
    *AG_A_STR*
        String representation of the first agent, including additional
        agent context (e.g. modification, mutation, location, and bound
        conditions). Empty if the Agent is None.
    *AG_B_TEXT, AG_B_LINKS, AG_B_STR*
        As above for the second agent. Note that the Agent may be None (and
        these fields left empty) if the Statement consists only of a single
        Agent (e.g., SelfModification, ActiveForm, or Translocation statement).
    *PMID*
        PMID of the first entry in the evidence list for the Statement.
        Empty if the Statement has no evidence.
    *TEXT*
        Evidence text for the Statement. Empty if the Statement has no
        evidence.
    *IS_HYP*
        Whether the Statement represents a "hypothesis", as flagged by some
        reading systems and recorded in the `evidence.epistemics['hypothesis']`
        field.
    *IS_DIRECT*
        Whether the Statement represents a direct physical interaction,
        as recorded by the `evidence.epistemics['direct']` field.

    In addition, if the `add_curation_cols` flag is set when calling
    :py:meth:`TsvAssembler.make_model`, the following additional (empty)
    columns will be added, to be filled out by curators:

    *AG_A_IDS_CORRECT*
        Correctness of Agent A grounding.
    *AG_A_STATE_CORRECT*
        Correctness of Agent A context (e.g., modification, bound, and other
        conditions).
    *AG_B_IDS_CORRECT, AG_B_STATE_CORRECT*
        As above, for Agent B.
    *EVENT_CORRECT*
        Whether the event is supported by the evidence text if the entities
        (Agents A and B) are considered as placeholders (i.e.,
        ignoring the correctness of their grounding).
    *RES_CORRECT*
        For Modification statements, whether the amino acid residue indicated
        by the Statement is supported by the evidence.
    *POS_CORRECT*
        For Modification statements, whether the amino acid position indicated
        by the Statement is supported by the evidence.
    *SUBJ_ACT_CORRECT*
        For Activation/Inhibition Statements, whether the activity indicated
        for the subject (Agent A) is supported by the evidence.
    *OBJ_ACT_CORRECT*
        For Activation/Inhibition Statements, whether the activity indicated
        for the object (Agent B) is supported by the evidence.
    *HYP_CORRECT*
        Whether the Statement is correctly flagged as a hypothesis.
    *DIRECT_CORRECT*
        Whether the Statement is correctly flagged as direct.

    Parameters
    ----------
    statements : Optional[list[indra.statements.Statement]]
        A list of INDRA Statements to be assembled.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements to be assembled.
    """
    # Columns written for every Statement, in output order.
    _STMT_HEADER = ('INDEX', 'UUID', 'TYPE', 'STR',
                    'AG_A_TEXT', 'AG_A_LINKS', 'AG_A_STR',
                    'AG_B_TEXT', 'AG_B_LINKS', 'AG_B_STR',
                    'PMID', 'TEXT', 'IS_HYP', 'IS_DIRECT')
    # Extra columns appended (empty) when curation columns are requested.
    # The per-row padding is derived from this tuple so that the header and
    # the data rows cannot drift apart.
    _CURATION_HEADER = ('AG_A_IDS_CORRECT', 'AG_A_STATE_CORRECT',
                        'AG_B_IDS_CORRECT', 'AG_B_STATE_CORRECT',
                        'EVENT_CORRECT',
                        'RES_CORRECT', 'POS_CORRECT', 'SUBJ_ACT_CORRECT',
                        'OBJ_ACT_CORRECT', 'HYP_CORRECT', 'DIRECT_CORRECT')

    def __init__(self, statements=None):
        # A falsy argument (None or an empty list) yields a fresh list;
        # otherwise the given list is stored by reference, not copied.
        if not statements:
            self.statements = []
        else:
            self.statements = statements

    def add_statements(self, stmts):
        """Add Statements to the assembler's list of Statements.

        Parameters
        ----------
        stmts : list[indra.statements.Statement]
            The Statements to append to `self.statements`.
        """
        self.statements.extend(stmts)

    def make_model(self, output_file, add_curation_cols=False, up_only=False):
        """Export the statements into a tab-separated text file.

        Statements with more than two agents are skipped (and logged).
        Statements without any evidence are exported with empty PMID, TEXT,
        IS_HYP and IS_DIRECT fields.

        Parameters
        ----------
        output_file : str
            Name of the output file.
        add_curation_cols : bool
            Whether to add columns to facilitate statement curation. Default
            is False (no additional columns).
        up_only : bool
            Whether to include identifiers.org links *only* for the Uniprot
            grounding of an agent when one is available. Because most
            spreadsheets allow only a single hyperlink per cell, this can make
            it easier to link to Uniprot information pages for curation
            purposes. Default is False.
        """
        stmt_header = list(self._STMT_HEADER)
        if add_curation_cols:
            stmt_header = stmt_header + list(self._CURATION_HEADER)
        # One empty cell per curation column, matching the header width.
        curation_padding = [''] * len(self._CURATION_HEADER)
        rows = [stmt_header]

        for ix, stmt in enumerate(self.statements):
            agents = stmt.agent_list()
            # Complexes
            if len(agents) > 2:
                logger.info("Skipping statement with more than two members: %s",
                            stmt)
                continue
            # Pad with None so that single-agent Statements (e.g.,
            # self-modifications, ActiveForms) get an empty second agent and
            # an empty agent list does not break the unpacking.
            ag_a, ag_b = (list(agents) + [None, None])[:2]
            # Only the first evidence entry is exported; guard against
            # Statements that carry no evidence at all.
            if stmt.evidence:
                evidence = stmt.evidence[0]
                evidence_fields = [
                    evidence.pmid, evidence.text,
                    evidence.epistemics.get('hypothesis', ''),
                    evidence.epistemics.get('direct', '')]
            else:
                evidence_fields = ['', '', '', '']
            # Put together the data row
            row = [ix+1, stmt.uuid, stmt.__class__.__name__, str(stmt)] + \
                  _format_agent_entries(ag_a, up_only) + \
                  _format_agent_entries(ag_b, up_only) + \
                  evidence_fields
            if add_curation_cols:
                row = row + curation_padding
            rows.append(row)
        # Write to file
        write_unicode_csv(output_file, rows, delimiter='\t')


def _format_id(ns, id):
    """Return a (label, url) pair for a namespace/ID grounding.

    The label is "<ns>:<id>" with spaces replaced by underscores; the URL is
    whatever `get_identifiers_url` returns for the same namespace and ID.
    """
    display = ('%s:%s' % (ns, id)).replace(' ', '_')
    return (display, get_identifiers_url(ns, id))


def _format_agent_entries(agent, up_only):
    if agent is None:
        return ['', '', '']
    # Agent text/name
    agent_text = agent.db_refs.get('TEXT')
    if agent_text is None:
        agent_text = agent.name
    # Agent db_refs str
    db_refs = copy(agent.db_refs)
    if 'TEXT' in db_refs:
        db_refs.pop('TEXT')
    db_refs_str = ','.join(['%s|%s' % (k, v)
                            for k, v in db_refs.items()])
    # Agent links
    identifier_links = []
    if up_only and 'UP' in db_refs:
        up_label, up_url = _format_id('UP', db_refs['UP'])
        identifier_links = [up_url]
    else:
        for ns, id in db_refs.items():
            label, url = _format_id(ns, id)
            if url is None:
                identifier_links.append(label)
            else:
                identifier_links.append(url)
    links_str = ', '.join(identifier_links)
    return [agent_text, links_str, str(agent)]