# Source code for indra.preassembler

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import sys
import time
import logging
import itertools
import functools
import collections
import multiprocessing as mp
from copy import copy, deepcopy
try:
    import pygraphviz as pgv
except ImportError:
    pass
from indra.statements import *
from indra.databases import uniprot_client


# Module-level logger shared by the functions and classes in this file; it is
# registered under the literal name 'preassembler'.
logger = logging.getLogger('preassembler')


class Preassembler(object):
    """De-duplicates statements and arranges them in a specificity hierarchy.

    Parameters
    ----------
    hierarchies : dict[:py:class:`indra.preassembler.hierarchy_manager`]
        A dictionary of hierarchies with keys such as 'entity' (hierarchy of
        entities, primarily specifying relationships between genes and their
        families) and 'modification' pointing to HierarchyManagers
    stmts : list of :py:class:`indra.statements.Statement` or None
        A set of statements to perform pre-assembly on. If None, statements
        should be added using the :py:meth:`add_statements` method.

    Attributes
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        Starting set of statements for preassembly.
    unique_stmts : list of :py:class:`indra.statements.Statement`
        Statements resulting from combining duplicates.
    related_stmts : list of :py:class:`indra.statements.Statement`
        Top-level statements after building the refinement hierarchy.
    hierarchies : dict[:py:class:`indra.preassembler.hierarchy_manager`]
        A dictionary of hierarchies with keys such as 'entity' and
        'modification' pointing to HierarchyManagers
    """
    def __init__(self, hierarchies, stmts=None):
        self.hierarchies = hierarchies
        # Work on private copies so that later in-place evidence merging
        # never touches the caller's statement objects.
        if stmts:
            logger.debug("Deepcopying stmts in __init__")
            self.stmts = deepcopy(stmts)
        else:
            self.stmts = []
        # Both results are computed lazily; None means "not computed yet".
        self.unique_stmts = None
        self.related_stmts = None

    def add_statements(self, stmts):
        """Add to the current list of statements.

        The statements are deep-copied before being appended, so the objects
        passed in are not shared with the preassembler.

        Parameters
        ----------
        stmts : list of :py:class:`indra.statements.Statement`
            Statements to add to the current list.
        """
        self.stmts += deepcopy(stmts)

    def combine_duplicates(self):
        """Combine duplicates among `stmts` and save result in `unique_stmts`.

        A wrapper around the static method :py:meth:`combine_duplicate_stmts`.
        The result is cached: once `unique_stmts` is set, it is returned
        as-is and statements added afterwards are not taken into account.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            The unique statements stored in `unique_stmts`.
        """
        if self.unique_stmts is None:
            self.unique_stmts = self.combine_duplicate_stmts(self.stmts)
        return self.unique_stmts

    @staticmethod
    def combine_duplicate_stmts(stmts):
        """Combine evidence from duplicate Statements.

        Statements are deemed to be duplicates if they have the same key
        returned by the `matches_key()` method of the Statement class. This
        generally means that statements must be identical in terms of their
        arguments and can differ only in their associated `Evidence` objects.

        This function keeps the first instance (in input order) of each set
        of duplicate statements and merges the lists of Evidence from all of
        the other statements into it. Evidence whose own `matches_key()` is
        already present on the kept statement is not added again.

        Note that no copies are made: the kept statement objects are the ones
        passed in, and their `evidence` lists are extended in place.

        Parameters
        ----------
        stmts : list of :py:class:`indra.statements.Statement`
            Set of statements to de-duplicate.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            Unique statements with accumulated evidence across duplicates,
            ordered by their `matches_key()`.

        Examples
        --------
        De-duplicate and combine evidence for two statements differing only
        in their evidence lists:

        >>> map2k1 = Agent('MAP2K1')
        >>> mapk1 = Agent('MAPK1')
        >>> stmt1 = Phosphorylation(map2k1, mapk1, 'T', '185',
        ... evidence=[Evidence(text='evidence 1')])
        >>> stmt2 = Phosphorylation(map2k1, mapk1, 'T', '185',
        ... evidence=[Evidence(text='evidence 2')])
        >>> uniq_stmts = Preassembler.combine_duplicate_stmts([stmt1, stmt2])
        >>> uniq_stmts
        [Phosphorylation(MAP2K1(), MAPK1(), T, 185)]
        >>> sorted([e.text for e in uniq_stmts[0].evidence]) # doctest:+IGNORE_UNICODE
        ['evidence 1', 'evidence 2']
        """
        # Remove exact duplicates (as judged by the statements' own hashing
        # and equality) while preserving input order. A plain set() would
        # scramble the order and make the choice of "first" statement
        # arbitrary. The matches_key() is computed once per statement here
        # and reused for both sorting and grouping.
        seen = set()
        keyed = []
        for stmt in stmts:
            if stmt not in seen:
                seen.add(stmt)
                keyed.append((stmt.matches_key(), stmt))
        # list.sort() is stable, so within a group of matching statements
        # the original input order is retained.
        keyed.sort(key=lambda pair: pair[0])

        unique_stmts = []
        for _key, group in itertools.groupby(keyed, key=lambda pair: pair[0]):
            group_stmts = [stmt for _, stmt in group]
            # groupby never yields an empty group, so index 0 always exists.
            first_stmt = group_stmts[0]
            ev_keys = [ev.matches_key() for ev in first_stmt.evidence]
            # Fold the evidence of every later duplicate into the first one,
            # skipping evidence that is already represented.
            for stmt in group_stmts[1:]:
                for ev in stmt.evidence:
                    ev_key = ev.matches_key()
                    if ev_key not in ev_keys:
                        first_stmt.evidence.append(ev)
                        ev_keys.append(ev_key)
            # This should never be None or anything else
            assert isinstance(first_stmt, Statement)
            unique_stmts.append(first_stmt)
        return unique_stmts
def _set_supports_stmt_pairs(stmt_tuples, hierarchies=None, check_entities_match=False): ix_map = [] for stmt_tuple1, stmt_tuple2 in itertools.combinations(stmt_tuples, 2): stmt_ix1, stmt1 = stmt_tuple1 stmt_ix2, stmt2 = stmt_tuple2 if check_entities_match and not stmt1.entities_match(stmt2): continue if stmt1.refinement_of(stmt2, hierarchies): ix_map.append((stmt_ix1, stmt_ix2)) elif stmt2.refinement_of(stmt1, hierarchies): ix_map.append((stmt_ix2, stmt_ix1)) return ix_map
def render_stmt_graph(statements, agent_style=None):
    """Render the statement hierarchy as a pygraphviz graph.

    Parameters
    ----------
    statements : list of :py:class:`indra.statements.Statement`
        A list of top-level statements with associated supporting statements
        resulting from building a statement hierarchy with
        :py:meth:`combine_related`.
    agent_style : dict or None
        Dict of attributes specifying the visual properties of nodes. If None,
        the following default attributes are used::

            agent_style = {'color': 'lightgray', 'style': 'filled',
                           'fontname': 'arial'}

    Returns
    -------
    pygraphviz.AGraph or None
        Pygraphviz graph with nodes representing statements and edges pointing
        from supported statements to supported_by statements. None is returned
        (and an error is logged) if pygraphviz could not be imported.

    Examples
    --------
    Pattern for getting statements and rendering as a Graphviz graph:

    >>> from indra.preassembler.hierarchy_manager import hierarchies
    >>> braf = Agent('BRAF')
    >>> map2k1 = Agent('MAP2K1')
    >>> st1 = Phosphorylation(braf, map2k1)
    >>> st2 = Phosphorylation(braf, map2k1, residue='S')
    >>> pa = Preassembler(hierarchies, [st1, st2])
    >>> pa.combine_related() # doctest:+ELLIPSIS
    [Phosphorylation(BRAF(), MAP2K1(), S)]
    >>> graph = render_stmt_graph(pa.related_stmts)
    >>> graph.write('example_graph.dot') # To make the DOT file
    >>> graph.draw('example_graph.png', prog='dot') # To make an image

    Resulting graph:

    .. image:: /images/example_graph.png
        :align: center
        :alt: Example statement graph rendered by Graphviz
    """
    # Set the default agent formatting properties
    if agent_style is None:
        agent_style = {'color': 'lightgray',
                       'style': 'filled',
                       'fontname': 'arial'}
    # Sets to store all of the nodes and edges as we walk the hierarchy
    nodes = set()
    edges = set()
    # Walk the hierarchy iteratively, expanding each statement object only
    # once. A statement reachable through several parents contributes the
    # same edges every time, so skipping repeat visits leaves the result
    # unchanged while avoiding redundant work and unbounded recursion on
    # deep or cyclic `supported_by` links.
    visited_ids = set()
    pending = list(statements)
    while pending:
        stmt = pending.pop()
        if id(stmt) in visited_ids:
            continue
        visited_ids.add(id(stmt))
        nodes.add(stmt)
        for sby_stmt in stmt.supported_by:
            edges.add((str(stmt.matches_key()),
                       str(sby_stmt.matches_key())))
            pending.append(sby_stmt)
    # Add the nodes and edges to the graph. `pgv` is only bound when the
    # optional pygraphviz import at the top of the module succeeded, hence
    # the NameError check.
    try:
        graph = pgv.AGraph(name='statements', directed=True, rankdir='LR')
    except NameError:
        logger.error('Cannot generate graph because '
                     'pygraphviz could not be imported.')
        return None
    for node in nodes:
        graph.add_node(str(node.matches_key()), label=str(node),
                       **agent_style)
    graph.add_edges_from(edges)
    return graph
def flatten_stmts(stmts):
    """Return the full set of unique stmts in a pre-assembled stmt graph.

    The flattened list of statements returned by this function can be
    compared to the original set of unique statements to make sure no
    statements have been lost during the preassembly process.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        A list of top-level statements with associated supporting statements
        resulting from building a statement hierarchy with
        :py:meth:`combine_related`.

    Returns
    -------
    stmts : list of :py:class:`indra.statements.Statement`
        List of all statements contained in the hierarchical statement graph.
        The list is built from a set, so its order is not defined.

    Examples
    --------
    Calling :py:meth:`combine_related` on two statements results in one
    top-level statement; calling :py:func:`flatten_stmts` recovers both:

    >>> from indra.preassembler.hierarchy_manager import hierarchies
    >>> braf = Agent('BRAF')
    >>> map2k1 = Agent('MAP2K1')
    >>> st1 = Phosphorylation(braf, map2k1)
    >>> st2 = Phosphorylation(braf, map2k1, residue='S')
    >>> pa = Preassembler(hierarchies, [st1, st2])
    >>> pa.combine_related() # doctest:+ELLIPSIS
    [Phosphorylation(BRAF(), MAP2K1(), S)]
    >>> flattened = flatten_stmts(pa.related_stmts)
    >>> flattened.sort(key=lambda x: x.matches_key())
    >>> flattened
    [Phosphorylation(BRAF(), MAP2K1()), Phosphorylation(BRAF(), MAP2K1(), S)]
    """
    total_stmts = set()
    # Traverse iteratively and expand each statement object only once, so
    # that supporting statements shared by several parents are not walked
    # repeatedly and deep or cyclic hierarchies cannot exhaust the call
    # stack.
    visited_ids = set()
    pending = list(stmts)
    while pending:
        stmt = pending.pop()
        if id(stmt) in visited_ids:
            continue
        visited_ids.add(id(stmt))
        total_stmts.add(stmt)
        # An empty or missing (None) support list means there is nothing
        # further to descend into.
        if stmt.supported_by:
            pending.extend(stmt.supported_by)
    return list(total_stmts)
def _flatten_evidence_for_stmt(stmt): total_evidence = set(stmt.evidence) for supp_stmt in stmt.supported_by: child_evidence = _flatten_evidence_for_stmt(supp_stmt) total_evidence = total_evidence.union(child_evidence) return list(total_evidence)
[docs]def flatten_evidence(stmts): """Add evidence from *supporting* stmts to evidence for *supported* stmts. Parameters ---------- stmts : list of :py:class:`indra.statements.Statement` A list of top-level statements with associated supporting statements resulting from building a statement hierarchy with :py:meth:`combine_related`. Returns ------- stmts : list of :py:class:`indra.statements.Statement` Statement hierarchy identical to the one passed, but with the evidence lists for each statement now containing all of the evidence associated with the statements they are supported by. Examples -------- Flattening evidence adds the two pieces of evidence from the supporting statement to the evidence list of the top-level statement: >>> from indra.preassembler.hierarchy_manager import hierarchies >>> braf = Agent('BRAF') >>> map2k1 = Agent('MAP2K1') >>> st1 = Phosphorylation(braf, map2k1, ... evidence=[Evidence(text='foo'), Evidence(text='bar')]) >>> st2 = Phosphorylation(braf, map2k1, residue='S', ... evidence=[Evidence(text='baz'), Evidence(text='bak')]) >>> pa = Preassembler(hierarchies, [st1, st2]) >>> pa.combine_related() # doctest:+ELLIPSIS [Phosphorylation(BRAF(), MAP2K1(), S)] >>> [e.text for e in pa.related_stmts[0].evidence] # doctest:+IGNORE_UNICODE ['baz', 'bak'] >>> flattened = flatten_evidence(pa.related_stmts) >>> sorted([e.text for e in flattened[0].evidence]) # doctest:+IGNORE_UNICODE ['bak', 'bar', 'baz', 'foo'] """ # Copy all of the statements--these will be the ones where we update # the evidence lists copied_stmts = deepcopy(stmts) for stmt in stmts: total_evidence = _flatten_evidence_for_stmt(stmt) stmt.evidence = total_evidence return stmts