# Source code for indra.assemblers.cyjs.assembler

import json
import logging
import itertools
import collections
import numpy as np
from copy import deepcopy
from indra.statements import *
from indra.databases import context_client, get_identifiers_url
from indra.tools.expand_families import Expander
from indra.ontology.bio import bio_ontology
from indra.ontology.standardize import \
    standardize_db_refs

# Module-level family expander built on the bio ontology.
# NOTE(review): nothing in the visible part of this module references
# `expander`; presumably it is kept for external importers -- confirm
# before removing.
expander = Expander(ontology=bio_ontology)


# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class CyJSAssembler(object):
    """This class assembles a CytoscapeJS graph from a set of INDRA Statements.

    CytoscapeJS is a web-based network library for analysis and
    visualisation: http://js.cytoscape.org/

    Parameters
    ----------
    stmts : Optional[list[indra.statements.Statement]]
        A list of INDRA Statements to be assembled.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements to be assembled.
    """
    def __init__(self, stmts=None):
        # A non-empty list is stored by reference (not copied), so
        # add_statements() also extends the list the caller passed in.
        self.statements = stmts if stmts else []
        self._edges = []
        self._nodes = []
        # Maps agent name -> node id for nodes that were already created
        self._existing_nodes = {}
        # Single counter from which both node ids and edge ids are drawn
        self._id_counter = 0
        self._exp_colorscale = []
        self._mut_colorscale = []
        self._gene_names = []
        self._context = {}

    def add_statements(self, stmts):
        """Add INDRA Statements to the assembler's list of statements.

        Parameters
        ----------
        stmts : list[indra.statements.Statement]
            A list of :py:class:`indra.statements.Statement`
            to be added to the statement list of the assembler.
        """
        self.statements.extend(stmts)

    def make_model(self, *args, **kwargs):
        """Assemble a Cytoscape JS network from INDRA Statements.

        This method assembles a Cytoscape JS network from the set of INDRA
        Statements added to the assembler.

        Parameters
        ----------
        grouping : bool
            Keyword-only option read from ``kwargs``. If True, the nodes
            with identical incoming and outgoing edges are grouped and the
            corresponding edges are merged. Positional arguments and any
            other keyword arguments are accepted but ignored.

        Returns
        -------
        cyjs_str : str
            The json serialized Cytoscape JS model.
        """
        for stmt in self.statements:
            # Dispatch goes through the per-type handler names (rather than
            # straight to _add_binary_regulation) so that each handler can
            # still be overridden individually.
            if isinstance(stmt, RegulateActivity):
                self._add_regulate_activity(stmt)
            elif isinstance(stmt, RegulateAmount):
                self._add_regulate_amount(stmt)
            elif isinstance(stmt, Modification):
                self._add_modification(stmt)
            elif isinstance(stmt, SelfModification):
                self._add_selfmodification(stmt)
            elif isinstance(stmt, Gef):
                self._add_gef(stmt)
            elif isinstance(stmt, Gap):
                self._add_gap(stmt)
            elif isinstance(stmt, Complex):
                self._add_complex(stmt)
            else:
                logger.warning('Unhandled statement type: %s',
                               stmt.__class__.__name__)
        if kwargs.get('grouping'):
            # Nodes must be grouped first: edge grouping reads the 'parent'
            # entries that node grouping assigns.
            self._group_nodes()
            self._group_edges()
        return self.print_cyjs_graph()

    def get_gene_names(self):
        """Gather gene names of all nodes and node members.

        The result is stored in ``self._gene_names``; nothing is returned.
        A node with a non-empty 'members' dict contributes its member names
        instead of its own name, and nodes whose name starts with 'Group'
        are skipped.
        """
        gene_names = []
        for node in self._nodes:
            data = node['data']
            members = data.get('members')
            if members:
                gene_names.extend(members)
            elif not data['name'].startswith('Group'):
                gene_names.append(data['name'])
        self._gene_names = gene_names

    def set_CCLE_context(self, cell_types):
        """Set context of all nodes and node members from CCLE.

        Expression and mutation data are fetched through ``context_client``
        for the gene names collected by :py:meth:`get_gene_names` and stored
        under ``self._context['CCLE']`` with the keys 'bin_expression' and
        'mutation'.

        Parameters
        ----------
        cell_types : list
            The cell lines to query; each one becomes a key of the
            'mutation' dict.
        """
        self.get_gene_names()

        # Get expression and mutations from context client
        exp_values = \
            context_client.get_protein_expression(self._gene_names, cell_types)
        mut_values = \
            context_client.get_mutations(self._gene_names, cell_types)

        # Make a dict of presence (1) / absence (0) of mutations
        muts = {cell_line: {} for cell_line in cell_types}
        for cell_line, entries in mut_values.items():
            if entries is not None:
                for gene, mutations in entries.items():
                    muts[cell_line][gene] = 1 if mutations else 0

        binned_exp = self._bin_expression(exp_values)

        self._context['CCLE'] = {'bin_expression': binned_exp,
                                 'mutation': muts}

    @staticmethod
    def _bin_expression(expression_dict):
        """Bin expression values on a log10 scale into 3 to 9 bins.

        Only 3-9 bins are produced because colorbrewer color schemes only
        cover that range.

        Parameters
        ----------
        expression_dict : dict
            Expression values keyed by cell line and then by gene name.
            A gene's value may be None, and a cell line may map to None
            instead of a dict.

        Returns
        -------
        binned_dict : dict
            Keyed by the number of bins (3 through 9). Each value has the
            same layout as ``expression_dict`` with every expression value
            replaced by its bin index. The index ``n_bins`` is reserved for
            None values; a cell line that maps to None stays None.
        """
        observed = [val
                    for genes in expression_dict.values()
                    if genes is not None
                    for val in genes.values()
                    if val is not None]
        log_values = np.log10(np.asarray(observed, dtype=float))

        def bin_index(val, thresholds, n_bins):
            # The last index (n_bins) is reserved for missing values
            if val is None:
                return n_bins
            log_val = np.log10(val)
            for idx, thr in enumerate(thresholds):
                if log_val <= thr:
                    return idx
            # Only reachable through floating point noise at the top edge;
            # clamp to the highest real bin instead of leaving a raw value.
            return n_bins - 1

        binned_dict = {}
        for n_bins in range(3, 10):
            # Upper edges of the equal-width histogram bins
            thresholds = np.histogram(log_values, n_bins)[1][1:]
            binned_dict[n_bins] = {
                line: (None if genes is None else
                       {gene: bin_index(val, thresholds, n_bins)
                        for gene, val in genes.items()})
                for line, genes in expression_dict.items()}
        return binned_dict

    def print_cyjs_graph(self):
        """Return the assembled Cytoscape JS network as a json string.

        Returns
        -------
        cyjs_str : str
            A json string representation of the Cytoscape JS network.
        """
        cyjs_dict = {'edges': self._edges, 'nodes': self._nodes}
        return json.dumps(cyjs_dict, indent=1, sort_keys=True)

    def print_cyjs_context(self):
        """Return a list of node names and their respective context.

        Returns
        -------
        cyjs_str_context : str
            A json string of the context dictionary. e.g. -
            {'CCLE' : {'bin_expression' : {'cell_line1' : {'gene1':'val1'} },
            'mutation' : {'cell_line1' : {'gene1':'val1'} }
            }}
        """
        return json.dumps(self._context, indent=1, sort_keys=True)

    def save_json(self, fname_prefix='model'):
        """Save the assembled Cytoscape JS network in a json file.

        This method saves two files based on the file name prefix given.
        It saves one json file with the graph itself, and another json
        file with the context.

        Parameters
        ----------
        fname_prefix : Optional[str]
            The prefix of the files to save the Cytoscape JS network and
            context to.
            Default: model
        """
        # outputs the graph
        with open(fname_prefix + '.json', 'wb') as fh:
            fh.write(self.print_cyjs_graph().encode('utf-8'))
        # outputs the context of graph nodes
        with open(fname_prefix + '_context.json', 'wb') as fh:
            fh.write(self.print_cyjs_context().encode('utf-8'))

    def save_model(self, fname='model.js'):
        """Save the assembled Cytoscape JS network in a js file.

        The file defines three JavaScript variables: ``exp_colorscale``,
        ``mut_colorscale`` and ``model_elements``.

        Parameters
        ----------
        fname : Optional[str]
            The name of the file to save the Cytoscape JS network to.
            Default: model.js
        """
        lines = [
            'var exp_colorscale = %s;\n' % json.dumps(self._exp_colorscale),
            'var mut_colorscale = %s;\n' % json.dumps(self._mut_colorscale),
            'var model_elements = %s;\n' % self.print_cyjs_graph(),
        ]
        with open(fname, 'wb') as fh:
            fh.write(''.join(lines).encode('utf-8'))

    def _add_binary_regulation(self, stmt):
        """Add a source node, a target node and an edge for a statement.

        The statement is skipped when either of its two agents is None.
        """
        subj, obj = stmt.agent_list()
        if subj is None or obj is None:
            return
        edge_type, edge_polarity = _get_stmt_type(stmt)
        source_id = self._add_node(subj, uuid=stmt.uuid)
        target_id = self._add_node(obj, uuid=stmt.uuid)
        self._add_edge(edge_type, source_id, target_id, edge_polarity,
                       stmt.uuid)

    _add_regulate_activity = _add_binary_regulation
    _add_regulate_amount = _add_binary_regulation
    _add_modification = _add_binary_regulation
    _add_gef = _add_binary_regulation
    _add_gap = _add_binary_regulation

    def _add_selfmodification(self, stmt):
        """Add the enzyme node and a self-loop edge for a SelfModification."""
        edge_type, edge_polarity = _get_stmt_type(stmt)
        source_id = self._add_node(stmt.enz, uuid=stmt.uuid)
        self._add_edge(edge_type, source_id, source_id, edge_polarity,
                       stmt.uuid)

    def _add_complex(self, stmt):
        """Add nodes for all members and an edge between each member pair."""
        edge_type, edge_polarity = _get_stmt_type(stmt)
        for m1, m2 in itertools.combinations(stmt.members, 2):
            m1_id = self._add_node(m1, uuid=stmt.uuid)
            m2_id = self._add_node(m2, uuid=stmt.uuid)
            self._add_edge(edge_type, m1_id, m2_id, edge_polarity,
                           stmt.uuid)

    def _get_edge_dict(self):
        """Return a dict of edges.

        Keyed by tuples of (i, source, target, polarity), with the id of
        the edge as the value. If several edges share a key, the id of the
        last one wins.
        """
        edge_dict = {}
        for e in self._edges:
            data = e['data']
            key = (data['i'], data['source'], data['target'],
                   data['polarity'])
            edge_dict[key] = data['id']
        return edge_dict

    def _add_edge(self, edge_type, source, target, edge_polarity, uuid):
        """Add an edge, or extend the uuid list of an identical edge.

        Two edges are identical when their type, source, target and
        polarity all match.

        Parameters
        ----------
        edge_type : str
            Stored under the 'i' key of the edge data.
        source, target : int
            Ids of the source and target nodes.
        edge_polarity : str
            Stored under the 'polarity' key of the edge data.
        uuid : str or list[str]
            One statement uuid, or a list of them, supporting the edge.
            Uuids already listed on the edge are not added again.
        """
        key = (edge_type, source, target, edge_polarity)
        edge = None
        # Scan from the end so that the last matching edge is picked up
        for existing in reversed(self._edges):
            data = existing['data']
            if (data['i'], data['source'], data['target'],
                    data['polarity']) == key:
                edge = existing
                break
        if edge is None:
            edge = {'data': {'i': edge_type,
                             'source': source, 'target': target,
                             'polarity': edge_polarity,
                             'id': self._get_new_id()}}
            self._edges.append(edge)
        new_uuids = uuid if isinstance(uuid, list) else [uuid]
        uuid_list = edge['data'].setdefault('uuid_list', [])
        for new_uuid in new_uuids:
            if new_uuid not in uuid_list:
                uuid_list.append(new_uuid)

    def _add_node(self, agent, uuid=None):
        """Add a node for an agent, or register the uuid on its node.

        Nodes are identified by the agent's name. For an agent with an
        'FPLX' entry among its db_refs the HGNC children from the ontology
        are listed under the node's 'members'.

        Returns
        -------
        node_id : int
            The id of the new or already existing node.
        """
        node_key = agent.name
        node_id = self._existing_nodes.get(node_key)
        # if the node already exists we do not want to add it again
        # we must however add its uuid
        if node_id is not None:
            node = next(n for n in self._nodes
                        if n['data']['id'] == node_id)
            uuid_list = node['data']['uuid_list']
            if uuid not in uuid_list:
                uuid_list.append(uuid)
            return node_id
        db_refs = _get_db_refs(agent)
        node_id = self._get_new_id()
        self._existing_nodes[node_key] = node_id
        node_name = agent.name.replace('_', ' ')
        if 'FPLX' in db_refs:
            expanded_families = bio_ontology.get_children(
                *agent.get_grounding(), ns_filter={'HGNC'})
        else:
            expanded_families = []
        members = {}
        for member in expanded_families:
            member_db_refs = standardize_db_refs({member[0]: member[1]})
            gene_name = bio_ontology.get_name(*member)
            member_urls = {}
            for dbns, dbid in member_db_refs.items():
                url = get_identifiers_url(dbns, dbid)
                if url:
                    member_urls[dbns] = url
            members[gene_name] = {'db_refs': member_urls}
        node = {'data': {'id': node_id, 'name': node_name,
                         'db_refs': db_refs, 'parent': '',
                         'members': members, 'uuid_list': [uuid]}}
        self._nodes.append(node)
        return node_id

    def _get_new_id(self):
        """Return the next unused id and advance the counter."""
        ret = self._id_counter
        self._id_counter += 1
        return ret

    def _get_node_key(self, node_dict_item):
        """Return a tuple of sorted sources and targets given a node dict."""
        s = tuple(sorted(node_dict_item['sources']))
        t = tuple(sorted(node_dict_item['targets']))
        return (s, t)

    def _get_node_groups(self):
        """Return a list of node id lists that are topologically identical.

        First construct a node_dict which is keyed to the node id and
        has a value which is a dict with keys 'sources' and 'targets'.
        The 'sources' and 'targets' each contain a list of tuples
        (i, polarity, other node id), one per edge of the node. node_dict
        is then processed by _get_node_key() which returns a tuple of (s,t)
        where s,t are sorted tuples of those edge entries. (s,t) is
        then used as a key in node_key_dict where the values are the node
        ids. node_groups is restricted to groups greater than 1 node.
        """
        node_dict = {node['data']['id']: {'sources': [], 'targets': []}
                     for node in self._nodes}
        for edge in self._edges:
            data = edge['data']
            # Add edge as a source for its target node
            node_dict[data['target']]['sources'].append(
                (data['i'], data['polarity'], data['source']))
            # Add edge as target for its source node
            node_dict[data['source']]['targets'].append(
                (data['i'], data['polarity'], data['target']))
        # Make a dictionary of nodes based on source/target as a key
        node_key_dict = collections.defaultdict(list)
        for node_id, node_d in node_dict.items():
            node_key_dict[self._get_node_key(node_d)].append(node_id)
        # Constrain the groups to ones that have more than 1 member
        return [g for g in node_key_dict.values() if len(g) > 1]

    def _group_edges(self):
        """Group all edges that are topologically identical.

        This means that (i, source, target, polarity) are the same, then sets
        edges on parent (i.e. - group) nodes to 'Virtual' and creates a new
        edge to represent all of them. The new edge keeps the original edge
        type and carries the merged uuids in first-seen order.
        """
        nodes_by_id = {}
        for node in self._nodes:
            nodes_by_id.setdefault(node['data']['id'], node)
        # key -> (group edge, merged uuid list), in first-seen order
        grouped = {}
        for e in self._edges:
            # Copy before the edge is marked 'Virtual' so that the group
            # edge keeps the original edge type
            new_edge = deepcopy(e)
            new_edge['data'].pop('id', None)
            uuid_list = new_edge['data'].pop('uuid_list', [])
            source_node = nodes_by_id[e['data']['source']]
            target_node = nodes_by_id[e['data']['target']]
            # If the source node is in a group, we change the source of this
            # edge to the group
            if source_node['data']['parent'] != '':
                new_edge['data']['source'] = source_node['data']['parent']
                e['data']['i'] = 'Virtual'
            # If the target node is in a group, we change the target of this
            # edge to the group
            if target_node['data']['parent'] != '':
                new_edge['data']['target'] = target_node['data']['parent']
                e['data']['i'] = 'Virtual'
            if e['data']['i'] != 'Virtual':
                continue
            data = new_edge['data']
            key = (data['i'], data['source'], data['target'],
                   data['polarity'])
            if key in grouped:
                merged = grouped[key][1]
                # De-duplicate while keeping a deterministic order
                merged[:] = dict.fromkeys(merged + uuid_list)
            else:
                grouped[key] = (new_edge, uuid_list)
        for new_edge, uuid_list in grouped.values():
            new_edge['data']['id'] = self._get_new_id()
            new_edge['data']['uuid_list'] = uuid_list
            self._edges.append(new_edge)

    def _group_nodes(self):
        """Create a parent node for each group of identical nodes.

        Every group returned by _get_node_groups() gets a new node named
        'Group' followed by the list of member ids. The members' 'parent'
        is pointed at the new node, which collects the members' uuids
        without duplicates, in first-seen order.
        """
        for group in self._get_node_groups():
            group_ids = set(group)
            member_nodes = [n for n in self._nodes
                            if n['data']['id'] in group_ids]
            uuids = list(dict.fromkeys(
                u for m_node in member_nodes
                for u in m_node['data']['uuid_list']))
            # Make new group node
            group_id = self._get_new_id()
            new_group_node = {'data': {'id': group_id,
                                       'name': 'Group' + str(group),
                                       'parent': '', 'uuid_list': uuids}}
            # Point the member nodes to their parent
            for m_node in member_nodes:
                m_node['data']['parent'] = group_id
            self._nodes.append(new_group_node)


def _get_db_refs(agent):
    cyjs_db_refs = {}
    for db_name, db_ids in agent.db_refs.items():
        if isinstance(db_ids, int):
            db_id = str(db_ids)
        elif isinstance(db_ids, str):
            db_id = db_ids
        else:
            db_id = db_ids[0]
        if db_name == 'TEXT':
            url = db_id
        else:
            url = get_identifiers_url(db_name, db_id)
        if not url:
            continue
        db_name_map = {
            'UP': 'UniProt', 'PUBCHEM': 'PubChem',
            'IP': 'InterPro', 'NXPFA': 'NextProtFamily',
            'PF': 'Pfam', 'CHEBI': 'ChEBI'}
        name = db_name_map.get(db_name)
        if not name:
            name = db_name
        cyjs_db_refs[name] = url
    return cyjs_db_refs


def _get_stmt_type(stmt):
    """Return the edge type and edge polarity for a statement.

    Parameters
    ----------
    stmt : indra.statements.Statement
        The statement to classify.

    Returns
    -------
    edge_type : str
        The statement's own class name for modifications and for
        statement types not listed here, otherwise the name of the
        matched statement type.
    edge_polarity : str
        One of 'positive', 'negative' or 'none'.
    """
    if isinstance(stmt, AddModification):
        return stmt.__class__.__name__, 'positive'
    if isinstance(stmt, RemoveModification):
        return stmt.__class__.__name__, 'negative'
    if isinstance(stmt, SelfModification):
        return 'SelfModification', 'positive'
    if isinstance(stmt, Complex):
        return 'Complex', 'none'
    if isinstance(stmt, Activation):
        return 'Activation', 'positive'
    if isinstance(stmt, Inhibition):
        return 'Inhibition', 'negative'
    if isinstance(stmt, DecreaseAmount):
        return 'DecreaseAmount', 'negative'
    if isinstance(stmt, IncreaseAmount):
        return 'IncreaseAmount', 'positive'
    if isinstance(stmt, Gef):
        return 'Gef', 'positive'
    if isinstance(stmt, Gap):
        return 'Gap', 'negative'
    # Fall back to the class name. Calling __class__.__str__() without an
    # instance, as was done here before, raises a TypeError.
    return stmt.__class__.__name__, 'none'