Source code for indra.assemblers.tsv.assembler

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import logging
from copy import copy
from indra.databases import get_identifiers_url
from indra.statements import *
from indra.util import write_unicode_csv


logger = logging.getLogger(__name__)


class TsvAssembler(object):
    """Assembles Statements into a set of tabular files for export or curation.

    Currently designed for use with "raw" Statements, i.e., Statements with a
    single evidence entry. Exports Statements into a single tab-separated file
    with the following columns:

    *INDEX*
        A 1-indexed integer identifying the statement. It is the position of
        the Statement in the assembler's list, so skipped Statements leave
        gaps in the numbering.
    *UUID*
        The UUID of the Statement.
    *TYPE*
        Statement type, given by the name of the class in indra.statements.
    *STR*
        String representation of the Statement. Contains most relevant
        information for curation including any additional statement data
        beyond the Statement type and Agents.
    *AG_A_TEXT*
        For Statements extracted from text, the text in the sentence
        corresponding to the first agent (i.e., the 'TEXT' entry in the
        db_refs dictionary). For all other Statements, the Agent name is
        given. Empty field if the Agent is None.
    *AG_A_LINKS*
        Groundings for the first agent given as a comma-separated list of
        identifiers.org links. Empty if the Agent is None.
    *AG_A_STR*
        String representation of the first agent, including additional
        agent context (e.g. modification, mutation, location, and bound
        conditions). Empty if the Agent is None.
    *AG_B_TEXT, AG_B_LINKS, AG_B_STR*
        As above for the second agent. Note that the Agent may be None (and
        these fields left empty) if the Statement consists only of a single
        Agent (e.g., SelfModification, ActiveForm, or Translocation
        statement).
    *PMID*
        PMID of the first entry in the evidence list for the Statement.
        Empty if the Statement has no evidence.
    *TEXT*
        Evidence text for the Statement. Empty if the Statement has no
        evidence.
    *IS_HYP*
        Whether the Statement represents a "hypothesis", as flagged by some
        reading systems and recorded in the
        `evidence.epistemics['hypothesis']` field.
    *IS_DIRECT*
        Whether the Statement represents a direct physical interaction,
        as recorded by the `evidence.epistemics['direct']` field.

    In addition, if the `add_curation_cols` flag is set when calling
    :py:meth:`TsvAssembler.make_model`, the following additional (empty)
    columns will be added, to be filled out by curators:

    *AG_A_IDS_CORRECT*
        Correctness of Agent A grounding.
    *AG_A_STATE_CORRECT*
        Correctness of Agent A context (e.g., modification, bound, and other
        conditions).
    *AG_B_IDS_CORRECT, AG_B_STATE_CORRECT*
        As above, for Agent B.
    *EVENT_CORRECT*
        Whether the event is supported by the evidence text if the entities
        (Agents A and B) are considered as placeholders (i.e.,
        ignoring the correctness of their grounding).
    *RES_CORRECT*
        For Modification statements, whether the amino acid residue indicated
        by the Statement is supported by the evidence.
    *POS_CORRECT*
        For Modification statements, whether the amino acid position
        indicated by the Statement is supported by the evidence.
    *SUBJ_ACT_CORRECT*
        For Activation/Inhibition Statements, whether the activity indicated
        for the subject (Agent A) is supported by the evidence.
    *OBJ_ACT_CORRECT*
        For Activation/Inhibition Statements, whether the activity indicated
        for the object (Agent B) is supported by the evidence.
    *HYP_CORRECT*
        Whether the Statement is correctly flagged as a hypothesis.
    *DIRECT_CORRECT*
        Whether the Statement is correctly flagged as direct.

    Parameters
    ----------
    statements : Optional[list[indra.statements.Statement]]
        A list of INDRA Statements to be assembled.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements to be assembled.
    """

    # Columns that are always written to the output file.
    _STMT_HEADER = ['INDEX', 'UUID', 'TYPE', 'STR',
                    'AG_A_TEXT', 'AG_A_LINKS', 'AG_A_STR',
                    'AG_B_TEXT', 'AG_B_LINKS', 'AG_B_STR',
                    'PMID', 'TEXT', 'IS_HYP', 'IS_DIRECT']

    # Extra columns, left empty in every data row, for curators to fill out.
    _CURATION_HEADER = ['AG_A_IDS_CORRECT', 'AG_A_STATE_CORRECT',
                        'AG_B_IDS_CORRECT', 'AG_B_STATE_CORRECT',
                        'EVENT_CORRECT',
                        'RES_CORRECT', 'POS_CORRECT',
                        'SUBJ_ACT_CORRECT', 'OBJ_ACT_CORRECT',
                        'HYP_CORRECT', 'DIRECT_CORRECT']

    def __init__(self, statements=None):
        if not statements:
            self.statements = []
        else:
            self.statements = statements

    def add_statements(self, stmts):
        """Add Statements to the assembler's list of Statements.

        Parameters
        ----------
        stmts : list[indra.statements.Statement]
            Statements to append to the `statements` attribute.
        """
        self.statements.extend(stmts)

    def make_model(self, output_file, add_curation_cols=False, up_only=False):
        """Export the statements into a tab-separated text file.

        Statements with more than two agents are skipped (and logged).
        Statements without any evidence get empty PMID, TEXT, IS_HYP and
        IS_DIRECT fields.

        Parameters
        ----------
        output_file : str
            Name of the output file.
        add_curation_cols : bool
            Whether to add columns to facilitate statement curation. Default
            is False (no additional columns).
        up_only : bool
            Whether to include identifiers.org links *only* for the Uniprot
            grounding of an agent when one is available. Because most
            spreadsheets allow only a single hyperlink per cell, this can make
            it easier to link to Uniprot information pages for curation
            purposes. Default is False.
        """
        stmt_header = list(self._STMT_HEADER)
        if add_curation_cols:
            stmt_header = stmt_header + self._CURATION_HEADER
        rows = [stmt_header]

        for ix, stmt in enumerate(self.statements):
            agents = list(stmt.agent_list())
            # Complexes
            if len(agents) > 2:
                logger.info("Skipping statement with more than two members: "
                            "%s", stmt)
                continue
            # Self-modifications and ActiveForms have a single agent; pad
            # with None so that the missing agent yields empty fields.
            ag_a, ag_b = agents + [None] * (2 - len(agents))
            # Put together the data row
            row = [ix + 1, stmt.uuid, stmt.__class__.__name__, str(stmt)] + \
                _format_agent_entries(ag_a, up_only) + \
                _format_agent_entries(ag_b, up_only) + \
                self._format_evidence_entries(stmt)
            if add_curation_cols:
                # One empty cell per curation column
                row = row + [''] * len(self._CURATION_HEADER)
            rows.append(row)
        # Write to file
        write_unicode_csv(output_file, rows, delimiter='\t')

    @staticmethod
    def _format_evidence_entries(stmt):
        """Return the [PMID, TEXT, IS_HYP, IS_DIRECT] fields for a Statement.

        Only the first evidence entry is used. If the Statement has no
        evidence, all four fields are empty strings.
        """
        if not stmt.evidence:
            return ['', '', '', '']
        evidence = stmt.evidence[0]
        return [evidence.pmid, evidence.text,
                evidence.epistemics.get('hypothesis', ''),
                evidence.epistemics.get('direct', '')]
def _format_id(ns, id):
    """Format a namespace/ID pair for display and curation.

    Returns a tuple of the label ('NS:ID' with spaces replaced by
    underscores) and the URL given by `get_identifiers_url` for the pair.
    """
    label = ('%s:%s' % (ns, id)).replace(' ', '_')
    url = get_identifiers_url(ns, id)
    return (label, url)


def _format_agent_entries(agent, up_only):
    """Return the [TEXT, LINKS, STR] fields describing a single Agent.

    Parameters
    ----------
    agent : Agent or None
        The Agent to format. If None, three empty strings are returned.
    up_only : bool
        If True and the Agent has a 'UP' entry in its db_refs, only the link
        for that entry is included in the LINKS field.

    Returns
    -------
    list
        The agent text (the 'TEXT' db_refs entry, or the Agent name if there
        is none), a ', '-separated string of links for the Agent's
        groundings, and the string representation of the Agent.
    """
    if agent is None:
        return ['', '', '']
    # Agent text/name
    agent_text = agent.db_refs.get('TEXT')
    if agent_text is None:
        agent_text = agent.name
    # Groundings to link to; 'TEXT' is the raw entity text, not a grounding
    if up_only and 'UP' in agent.db_refs:
        groundings = [('UP', agent.db_refs['UP'])]
    else:
        groundings = [(ns, db_id) for ns, db_id in agent.db_refs.items()
                      if ns != 'TEXT']
    # Agent links; fall back to the plain label whenever no URL is available
    # so that the join below never receives None
    identifier_links = []
    for ns, db_id in groundings:
        label, url = _format_id(ns, db_id)
        identifier_links.append(label if url is None else url)
    links_str = ', '.join(identifier_links)
    return [agent_text, links_str, str(agent)]