Source code for indra.assemblers.cyjs.assembler

import json
import logging
import itertools
import collections
import numpy as np
from copy import deepcopy
from indra.statements import *
from indra.databases import context_client, get_identifiers_url
from import Expander
from import bio_ontology
from indra.ontology.standardize import \

# Module-level Expander instance built on the bio ontology. It is not
# referenced by any other code visible in this file.
expander = Expander(ontology=bio_ontology)

# Logger named after this module; make_model() uses it to warn about
# Statement types it does not handle.
logger = logging.getLogger(__name__)

class CyJSAssembler(object):
    """Assemble a CytoscapeJS graph from a set of INDRA Statements.

    CytoscapeJS is a web-based network library for analysis and
    visualisation.

    Parameters
    ----------
    stmts : Optional[list[indra.statements.Statement]]
        A list of INDRA Statements to be assembled.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements to be assembled.
    """

    def __init__(self, stmts=None):
        # A non-empty list is stored as-is (not copied), so add_statements()
        # appends to the caller's list.
        self.statements = stmts if stmts else []
        self._edges = []
        self._nodes = []
        # Maps node key (agent name) -> node id, used to de-duplicate nodes.
        self._existing_nodes = {}
        # Nodes and edges draw their ids from this single shared counter.
        self._id_counter = 0
        self._exp_colorscale = []
        self._mut_colorscale = []
        self._gene_names = []
        self._context = {}

    def add_statements(self, stmts):
        """Add INDRA Statements to the assembler's list of statements.

        Parameters
        ----------
        stmts : list[indra.statements.Statement]
            A list of :py:class:`indra.statements.Statement`
            to be added to the statement list of the assembler.
        """
        self.statements.extend(stmts)

    def make_model(self, *args, **kwargs):
        """Assemble a Cytoscape JS network from INDRA Statements.

        This method assembles a Cytoscape JS network from the set of INDRA
        Statements added to the assembler.

        Parameters
        ----------
        grouping : bool
            Keyword argument. If True, the nodes with identical incoming and
            outgoing edges are grouped and the corresponding edges are
            merged.

        Returns
        -------
        cyjs_str : str
            The json serialized Cytoscape JS model.
        """
        # The order of the isinstance checks is significant and is kept
        # exactly as in the original dispatch chain.
        for stmt in self.statements:
            if isinstance(stmt, RegulateActivity):
                self._add_regulate_activity(stmt)
            elif isinstance(stmt, RegulateAmount):
                self._add_regulate_amount(stmt)
            elif isinstance(stmt, Modification):
                self._add_modification(stmt)
            elif isinstance(stmt, SelfModification):
                self._add_selfmodification(stmt)
            elif isinstance(stmt, Gef):
                self._add_gef(stmt)
            elif isinstance(stmt, Gap):
                self._add_gap(stmt)
            elif isinstance(stmt, Complex):
                self._add_complex(stmt)
            else:
                logger.warning('Unhandled statement type: %s',
                               stmt.__class__.__name__)
        if kwargs.get('grouping'):
            self._group_nodes()
            self._group_edges()
        return self.print_cyjs_graph()

    def get_gene_names(self):
        """Gather gene names of all nodes and node members.

        The result is stored in ``self._gene_names``; nothing is returned.
        Nodes with members contribute their member names; nodes without
        members contribute their own name unless it starts with 'Group'.
        """
        gene_names = []
        for node in self._nodes:
            members = node['data'].get('members')
            if members:
                gene_names += list(members.keys())
            elif not node['data']['name'].startswith('Group'):
                gene_names.append(node['data']['name'])
        self._gene_names = gene_names

    def set_CCLE_context(self, cell_types):
        """Set context of all nodes and node members from CCLE.

        Expression and mutation data are fetched through ``context_client``
        for the current gene names and stored under ``self._context['CCLE']``
        as ``{'bin_expression': ..., 'mutation': ...}``.

        Parameters
        ----------
        cell_types : list
            Cell line identifiers passed on to the context client.
        """
        self.get_gene_names()

        # Get expression and mutations from context client
        exp_values = context_client.get_protein_expression(self._gene_names,
                                                           cell_types)
        mut_values = context_client.get_mutations(self._gene_names,
                                                  cell_types)

        # Make a dict of presence/absence of mutations
        muts = {cell_line: {} for cell_line in cell_types}
        for cell_line, entries in mut_values.items():
            if entries is not None:
                for gene, mutations in entries.items():
                    muts[cell_line][gene] = 1 if mutations else 0

        binned_exp = self._bin_expression(exp_values)
        self._context['CCLE'] = {'bin_expression': binned_exp,
                                 'mutation': muts}

    @staticmethod
    def _bin_expression(expression_dict):
        """Bin log10 expression values into 3 to 9 bins.

        Only 3-9 bins are produced because colorbrewer color schemes only
        cover that range.

        Parameters
        ----------
        expression_dict : dict
            Nested dict ``{cell_line: {gene: value or None}}``.

        Returns
        -------
        dict
            ``{n_bins: {cell_line: {gene: bin_index}}}`` for n_bins in 3..9.
            Genes whose value is None get the extra index ``n_bins``.
        """
        # NOTE(review): a zero or negative expression value makes log10
        # non-finite, which np.histogram presumably rejects - confirm inputs.
        log_values = np.log10([val
                               for genes in expression_dict.values()
                               for val in genes.values()
                               if val is not None])
        binned_dict = {}
        for n_bins in range(3, 10):
            # Upper edge of every bin; the lowest edge is dropped.
            thresholds = np.histogram(log_values, n_bins)[1][1:]
            binned = {}
            for line, genes in expression_dict.items():
                binned[line] = {}
                for gene, raw in genes.items():
                    if raw is None:
                        # last bin is reserved for None
                        binned[line][gene] = n_bins
                        continue
                    val = np.log10(raw)
                    for thr_idx, thr in enumerate(thresholds):
                        if val <= thr:
                            binned[line][gene] = thr_idx
                            break
                    else:
                        # No threshold matched (e.g. float rounding at the
                        # top edge): clamp to the highest real bin instead
                        # of leaking the raw, un-binned value.
                        binned[line][gene] = n_bins - 1
            binned_dict[n_bins] = binned
        return binned_dict

    def print_cyjs_graph(self):
        """Return the assembled Cytoscape JS network as a json string.

        Returns
        -------
        cyjs_str : str
            A json string representation of the Cytoscape JS network.
        """
        cyjs_dict = {'edges': self._edges, 'nodes': self._nodes}
        return json.dumps(cyjs_dict, indent=1, sort_keys=True)

    def print_cyjs_context(self):
        """Return a list of node names and their respective context.

        Returns
        -------
        cyjs_str_context : str
            A json string of the context dictionary. e.g. -
            {'CCLE' : {'bin_expression' : {'cell_line1' : {'gene1':'val1'} },
                       'mutation' : {'cell_line1' : {'gene1':'val1'} }
                      }}
        """
        return json.dumps(self._context, indent=1, sort_keys=True)

    def save_json(self, fname_prefix='model'):
        """Save the assembled Cytoscape JS network in a json file.

        This method saves two files based on the file name prefix given.
        It saves one json file with the graph itself, and another json
        file with the context.

        Parameters
        ----------
        fname_prefix : Optional[str]
            The prefix of the files to save the Cytoscape JS network and
            context to.
            Default: model
        """
        # outputs the graph
        with open(fname_prefix + '.json', 'wb') as fh:
            fh.write(self.print_cyjs_graph().encode('utf-8'))
        # outputs the context of graph nodes
        with open(fname_prefix + '_context.json', 'wb') as fh:
            fh.write(self.print_cyjs_context().encode('utf-8'))

    def save_model(self, fname='model.js'):
        """Save the assembled Cytoscape JS network in a js file.

        Parameters
        ----------
        fname : Optional[str]
            The name of the file to save the Cytoscape JS network to.
            Default: model.js
        """
        exp_colorscale_str = json.dumps(self._exp_colorscale)
        mut_colorscale_str = json.dumps(self._mut_colorscale)
        model_str = self.print_cyjs_graph()
        s = ('var exp_colorscale = %s;\n' % exp_colorscale_str +
             'var mut_colorscale = %s;\n' % mut_colorscale_str +
             'var model_elements = %s;\n' % model_str)
        with open(fname, 'wb') as fh:
            fh.write(s.encode('utf-8'))

    def _add_binary_regulation(self, stmt):
        """Add subject and object nodes and the edge between them."""
        subj, obj = stmt.agent_list()
        if subj is None:
            return
        edge_type, edge_polarity = _get_stmt_type(stmt)
        source_id = self._add_node(subj, uuid=stmt.uuid)
        target_id = self._add_node(obj, uuid=stmt.uuid)
        self._add_edge(edge_type, source_id, target_id, edge_polarity,
                       stmt.uuid)

    # All binary statement types share the same assembly logic.
    _add_regulate_activity = _add_binary_regulation
    _add_regulate_amount = _add_binary_regulation
    _add_modification = _add_binary_regulation
    _add_gef = _add_binary_regulation
    _add_gap = _add_binary_regulation

    def _add_selfmodification(self, stmt):
        """Add the enzyme node and a self-loop edge on it."""
        edge_type, edge_polarity = _get_stmt_type(stmt)
        source_id = self._add_node(stmt.enz, uuid=stmt.uuid)
        self._add_edge(edge_type, source_id, source_id, edge_polarity,
                       stmt.uuid)

    def _add_complex(self, stmt):
        """Add one edge for every pair of complex members."""
        edge_type, edge_polarity = _get_stmt_type(stmt)
        for m1, m2 in itertools.combinations(stmt.members, 2):
            m1_id = self._add_node(m1, uuid=stmt.uuid)
            m2_id = self._add_node(m2, uuid=stmt.uuid)
            self._add_edge(edge_type, m1_id, m2_id, edge_polarity,
                           stmt.uuid)

    def _get_edge_dict(self):
        """Return a dict of edges.

        Keyed by tuples of (i, source, target, polarity). Each value is the
        id of the edge with that key; if several edges share a key, the id
        of the last one in ``self._edges`` is kept.
        """
        # The defaultdict type is kept so that the missing-key behaviour of
        # the returned object is unchanged for any other caller.
        edge_dict = collections.defaultdict(list)
        for e in self._edges:
            data = e['data']
            key = (data['i'], data['source'], data['target'],
                   data['polarity'])
            edge_dict[key] = data['id']
        return edge_dict

    def _add_edge(self, edge_type, source, target, edge_polarity, uuid):
        """Add an edge, or extend the uuid list of an identical one.

        Two edges are identical when type, source, target and polarity all
        match. ``uuid`` may be a single value or a list of values.
        """
        edge_dict = self._get_edge_dict()
        key = (edge_type, source, target, edge_polarity)
        if key in edge_dict:
            existing_id = edge_dict[key]
            edge = [e for e in self._edges
                    if e['data']['id'] == existing_id][0]
        else:
            edge = {'data': {'i': edge_type, 'source': source,
                             'target': target, 'polarity': edge_polarity,
                             'id': self._get_new_id()}}
            self._edges.append(edge)
        if not isinstance(uuid, list):
            uuid = [uuid]
        edge['data']['uuid_list'] = edge['data'].get('uuid_list', [])
        edge['data']['uuid_list'] += uuid
        return

    def _add_node(self, agent, uuid=None):
        """Add a node for an agent and return its id.

        If a node for the agent already exists, only ``uuid`` is added to
        that node's uuid list and the existing id is returned.
        """
        # NOTE(review): the right-hand side of this assignment (and of
        # node_name below) was missing in the extracted source; agent.name
        # is the reconstruction - confirm against the upstream file.
        node_key = agent.name
        node_id = self._existing_nodes.get(node_key)
        # if the node already exists we do not want to add it again
        # we must however add its uuid
        if node_id is not None:
            # fetch the appropriate node
            n = [x for x in self._nodes if x['data']['id'] == node_id][0]
            uuid_list = n['data']['uuid_list']
            if uuid not in uuid_list:
                uuid_list.append(uuid)
            return node_id
        db_refs = _get_db_refs(agent)
        node_id = self._get_new_id()
        self._existing_nodes[node_key] = node_id
        node_name = agent.name
        node_name = node_name.replace('_', ' ')
        if 'FPLX' in db_refs:
            expanded_families = bio_ontology.get_children(
                *agent.get_grounding(), ns_filter={'HGNC'})
        else:
            expanded_families = []
        members = {}
        for member in expanded_families:
            member_db_refs = {member[0]: member[1]}
            member_db_refs = standardize_db_refs(member_db_refs)
            gene_name = bio_ontology.get_name(*member)
            members[gene_name] = {'db_refs': {}}
            for dbns, dbid in member_db_refs.items():
                url = get_identifiers_url(dbns, dbid)
                if url:
                    members[gene_name]['db_refs'][dbns] = url
        node = {'data': {'id': node_id, 'name': node_name,
                         'db_refs': db_refs, 'parent': '',
                         'members': members, 'uuid_list': [uuid]}}
        self._nodes.append(node)
        return node_id

    def _get_new_id(self):
        """Return the next unused id and advance the counter."""
        ret = self._id_counter
        self._id_counter += 1
        return ret

    def _get_node_key(self, node_dict_item):
        """Return a tuple of sorted sources and targets given a node dict."""
        s = tuple(sorted(node_dict_item['sources']))
        t = tuple(sorted(node_dict_item['targets']))
        return (s, t)

    def _get_node_groups(self):
        """Return a list of node id lists that are topologically identical.

        First construct a node_dict which is keyed to the node id and has a
        value which is a dict with keys 'sources' and 'targets'. The
        'sources' and 'targets' each contain a list of tuples
        (i, polarity, other_node_id), one per edge of the node.

        node_dict is then processed by _get_node_key() which returns a tuple
        of (s, t) where s, t are the sorted source and target tuples.
        (s, t) is then used as a key in node_key_dict where the values are
        the node ids. node_groups is restricted to groups greater than
        1 node.
        """
        node_dict = {node['data']['id']: {'sources': [], 'targets': []}
                     for node in self._nodes}
        for edge in self._edges:
            data = edge['data']
            # Add edge as a source for its target node
            node_dict[data['target']]['sources'].append(
                (data['i'], data['polarity'], data['source']))
            # Add edge as target for its source node
            node_dict[data['source']]['targets'].append(
                (data['i'], data['polarity'], data['target']))
        # Make a dictionary of nodes based on source/target as a key
        node_key_dict = collections.defaultdict(list)
        for node_id, node_d in node_dict.items():
            node_key_dict[self._get_node_key(node_d)].append(node_id)
        # Constrain the groups to ones that have more than 1 member
        return [g for g in node_key_dict.values() if len(g) > 1]

    def _group_edges(self):
        """Group all edges that are topologically identical.

        This means that (i, source, target, polarity) are the same, then
        sets edges on parent (i.e. - group) nodes to 'Virtual' and creates
        a new edge to represent all of them.
        """
        # One id -> node lookup instead of a linear scan per edge; the first
        # node wins for a given id, as in a first-match scan.
        node_by_id = {}
        for node in self._nodes:
            node_by_id.setdefault(node['data']['id'], node)

        # Parallel lists: merged group edges and their uuid lists.
        group_edges = []
        uuid_lists = []
        for e in self._edges:
            new_edge = deepcopy(e)
            new_edge['data'].pop('id', None)
            uuid_list = new_edge['data'].pop('uuid_list', [])
            # Check if edge source or target are contained in a parent
            # If source or target in parent edit edge
            # Nodes may only point within their container
            source_parent = node_by_id[e['data']['source']]['data']['parent']
            target_parent = node_by_id[e['data']['target']]['data']['parent']
            # If the source node is in a group, we change the source of this
            # edge to the group
            if source_parent != '':
                new_edge['data']['source'] = source_parent
                e['data']['i'] = 'Virtual'
            # If the target node is in a group, we change the target of this
            # edge to the group
            if target_parent != '':
                new_edge['data']['target'] = target_parent
                e['data']['i'] = 'Virtual'
            if e['data']['i'] == 'Virtual':
                if new_edge not in group_edges:
                    group_edges.append(new_edge)
                    uuid_lists.append(uuid_list)
                else:
                    idx = group_edges.index(new_edge)
                    uuid_lists[idx] = list(set(uuid_lists[idx] + uuid_list))
        # New edges are appended only after the scan so that the list being
        # iterated is not modified.
        for edge, uuids in zip(group_edges, uuid_lists):
            edge['data']['id'] = self._get_new_id()
            edge['data']['uuid_list'] = uuids
            self._edges.append(edge)

    def _group_nodes(self):
        """Create a parent 'Group' node for each set of equivalent nodes."""
        node_groups = self._get_node_groups()
        for group in node_groups:
            # Make new group node
            new_group_node = {'data': {'id': self._get_new_id(),
                                       'name': 'Group' + str(group),
                                       'parent': '',
                                       'uuid_list': []}}
            group_uuids = []
            for node in self._nodes:
                if node['data']['id'] in group:
                    group_uuids += node['data']['uuid_list']
                    # Point the node to its parent
                    node['data']['parent'] = new_group_node['data']['id']
            new_group_node['data']['uuid_list'] = list(set(group_uuids))
            self._nodes.append(new_group_node)
def _get_db_refs(agent):
    """Return a dict of display database names to URLs for an agent.

    Parameters
    ----------
    agent : object
        Any object with a ``db_refs`` dict. Each value may be an int, a str,
        or a sequence whose first element is used as the identifier.

    Returns
    -------
    dict
        Maps a display name (e.g. 'UniProt' for 'UP'; the raw namespace when
        no display name is defined) to the identifier's URL. For the 'TEXT'
        namespace the value is the text itself rather than a URL. Entries
        for which no URL is found are left out.
    """
    # Built once per call rather than once per db_refs entry.
    db_name_map = {
        'UP': 'UniProt', 'PUBCHEM': 'PubChem',
        'IP': 'InterPro', 'NXPFA': 'NextProtFamily',
        'PF': 'Pfam', 'CHEBI': 'ChEBI'}
    cyjs_db_refs = {}
    for db_name, db_ids in agent.db_refs.items():
        if isinstance(db_ids, int):
            db_id = str(db_ids)
        elif isinstance(db_ids, str):
            db_id = db_ids
        elif db_ids:
            db_id = db_ids[0]
        else:
            # None or an empty sequence carries no identifier; skip it
            # instead of failing on the [0] lookup.
            continue
        if db_name == 'TEXT':
            url = db_id
        else:
            url = get_identifiers_url(db_name, db_id)
        if not url:
            continue
        cyjs_db_refs[db_name_map.get(db_name) or db_name] = url
    return cyjs_db_refs


def _get_stmt_type(stmt):
    """Return the (edge_type, edge_polarity) pair for a Statement.

    Polarity is one of 'positive', 'negative' or 'none'. Statement types
    that are not recognised get their class name as edge type and 'none'
    as polarity.
    """
    # Checked in order, first match wins. An edge type of None means
    # "use the statement's own class name".
    type_table = (
        (AddModification, None, 'positive'),
        (RemoveModification, None, 'negative'),
        (SelfModification, 'SelfModification', 'positive'),
        (Complex, 'Complex', 'none'),
        (Activation, 'Activation', 'positive'),
        (Inhibition, 'Inhibition', 'negative'),
        (DecreaseAmount, 'DecreaseAmount', 'negative'),
        (IncreaseAmount, 'IncreaseAmount', 'positive'),
        (Gef, 'Gef', 'positive'),
        (Gap, 'Gap', 'negative'),
    )
    for stmt_class, edge_type, edge_polarity in type_table:
        if isinstance(stmt, stmt_class):
            if edge_type is None:
                edge_type = stmt.__class__.__name__
            return edge_type, edge_polarity
    # The original fallback called stmt.__class__.__str__() with no
    # argument, which raises TypeError; the class name is used instead.
    return stmt.__class__.__name__, 'none'