Source code for indra.tools.assemble_corpus

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import sys
try:
    # Python 2
    import cPickle as pickle
except ImportError:
    # Python 3
    import pickle
import logging
from copy import deepcopy, copy
from indra.statements import *
from indra.belief import BeliefEngine
from indra.util import read_unicode_csv
from indra.databases import uniprot_client
from indra.mechlinker import MechLinker
from indra.preassembler import Preassembler
from indra.tools.expand_families import Expander
from indra.preassembler.hierarchy_manager import hierarchies
from indra.preassembler.grounding_mapper import GroundingMapper
from indra.preassembler.grounding_mapper import gm as grounding_map
from indra.preassembler.sitemapper import SiteMapper, default_site_map

logger = logging.getLogger('assemble_corpus')
indra_logger = logging.getLogger('indra')
indra_logger.setLevel(logging.DEBUG)

def _filter(kwargs, arg_list):
    return dict(filter(lambda x: x[0] in arg_list, kwargs.items()))

def dump_statements(stmts, fname):
    """Dump a list of statements into a pickle file.

    Parameters
    ----------
    stmts : list[indra.statements.Statement]
        A list of statements to dump into the pickle file.
    fname : str
        The name of the pickle file to dump statements into.
    """
    if sys.version_info[0] < 3:
        logger.warning('Files pickled in Python 2 may be incompatible with '
                       'Python 3')
    logger.info('Dumping %d statements into %s...' % (len(stmts), fname))
    with open(fname, 'wb') as fh:
        pickle.dump(stmts, fh, protocol=2)

def load_statements(fname, as_dict=False):
    """Load statements from a pickle file.

    Parameters
    ----------
    fname : str
        The name of the pickle file to load statements from.
    as_dict : Optional[bool]
        If True and the pickle file contains a dictionary of statements, it
        is returned as a dictionary. If False, the statements are always
        returned in a list. Default: False

    Returns
    -------
    stmts : list
        A list or dict of statements that were loaded.
    """
    logger.info('Loading %s...' % fname)
    with open(fname, 'rb') as fh:
        if sys.version_info[0] < 3:
            # Encoding argument not available in pickle for Python 2
            stmts = pickle.load(fh)
        else:
            # Encoding argument specified here to enable compatibility with
            # pickle files created with Python 2
            stmts = pickle.load(fh, encoding='latin1')
    if isinstance(stmts, dict):
        if as_dict:
            return stmts
        st = []
        for pmid, st_list in stmts.items():
            st += st_list
        stmts = st
    logger.info('Loaded %d statements' % len(stmts))
    return stmts

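# A minimal round-trip sketch for the two helpers above (the file names are
# hypothetical, not shipped with INDRA):
#
#     stmts = load_statements('raw_stmts.pkl')
#     dump_statements(stmts, 'raw_stmts_copy.pkl')
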
def map_grounding(stmts_in, **kwargs):
    """Map grounding using the GroundingMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_rename : Optional[bool]
        If True, Agents are renamed based on their mapped grounding.
        Default: True
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping grounding on %d statements...' % len(stmts_in))
    do_rename = kwargs.get('do_rename')
    if do_rename is None:
        do_rename = True
    gm = GroundingMapper(grounding_map)
    stmts_out = gm.map_agents(stmts_in, do_rename=do_rename)
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

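# Illustrative call, assuming `stmts` was loaded as above; 'grounded_stmts.pkl'
# is a made-up cache file name:
#
#     stmts = map_grounding(stmts, do_rename=True, save='grounded_stmts.pkl')
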
def map_sequence(stmts_in, **kwargs):
    """Map sequences using the SiteMapper.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. If True, checks the reference
        sequence for a known modification at 1 site position greater
        than the given one; if there exists such a site, creates the
        mapping. Default is True.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences (based on PhosphoSitePlus data). If a
        mouse/rat site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein (based on PhosphoSitePlus
        data). If a site is found that is linked to a site in the human
        reference sequence, a mapping is created. Default is True.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of mapped statements.
    """
    logger.info('Mapping sites on %d statements...' % len(stmts_in))
    kwarg_list = ['do_methionine_offset', 'do_orthology_mapping',
                  'do_isoform_mapping']
    sm = SiteMapper(default_site_map)
    valid, mapped = sm.map_sites(stmts_in, **_filter(kwargs, kwarg_list))
    correctly_mapped_stmts = []
    for ms in mapped:
        if all([mm[1] is not None for mm in ms.mapped_mods]):
            correctly_mapped_stmts.append(ms.mapped_stmt)
    stmts_out = valid + correctly_mapped_stmts
    logger.info('%d statements with valid sites' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

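# Sketch showing how the mapping heuristics can be toggled via the documented
# keyword arguments (the input statement list is assumed to exist):
#
#     stmts = map_sequence(stmts, do_methionine_offset=False,
#                          do_orthology_mapping=True,
#                          save='sequence_valid_stmts.pkl')
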
def run_preassembly(stmts_in, **kwargs):
    """Run preassembly on a list of statements.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to preassemble.
    return_toplevel : Optional[bool]
        If True, only the top-level statements are returned. If False,
        all statements are returned irrespective of level of specificity.
        Default: True
    poolsize : Optional[int]
        The number of worker processes to use to parallelize the
        comparisons performed by the function. If None (default), no
        parallelization is performed. NOTE: Parallelization is only
        available on Python 3.4 and above.
    size_cutoff : Optional[int]
        Groups with size_cutoff or more statements are sent to worker
        processes, while smaller groups are compared in the parent process.
        Default value is 100. Not relevant when parallelization is not
        used.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.
    save_unique : Optional[str]
        The name of a pickle file to save the unique statements into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of preassembled top-level statements.
    """
    dump_pkl_unique = kwargs.get('save_unique')
    be = BeliefEngine()
    pa = Preassembler(hierarchies, stmts_in)
    run_preassembly_duplicate(pa, be, save=dump_pkl_unique)

    dump_pkl = kwargs.get('save')
    return_toplevel = kwargs.get('return_toplevel', True)
    poolsize = kwargs.get('poolsize', None)
    size_cutoff = kwargs.get('size_cutoff', 100)
    options = {'save': dump_pkl, 'return_toplevel': return_toplevel,
               'poolsize': poolsize, 'size_cutoff': size_cutoff}
    stmts_out = run_preassembly_related(pa, be, **options)
    return stmts_out

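# Example invocation (a sketch; both pickle file names are hypothetical):
#
#     top_stmts = run_preassembly(stmts, return_toplevel=True,
#                                 save='top_stmts.pkl',
#                                 save_unique='unique_stmts.pkl')
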
def run_preassembly_duplicate(preassembler, beliefengine, **kwargs):
    """Run deduplication stage of preassembly on a list of statements.

    Parameters
    ----------
    preassembler : indra.preassembler.Preassembler
        A Preassembler instance.
    beliefengine : indra.belief.BeliefEngine
        A BeliefEngine instance.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of unique statements.
    """
    logger.info('Combining duplicates on %d statements...' %
                len(preassembler.stmts))
    dump_pkl = kwargs.get('save')
    stmts_out = preassembler.combine_duplicates()
    beliefengine.set_prior_probs(stmts_out)
    logger.info('%d unique statements' % len(stmts_out))
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_by_type(stmts_in, stmt_type, **kwargs):
    """Filter to a given statement type.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    stmt_type : indra.statements.Statement
        The class of the statement type to filter for.
        Example: indra.statements.Modification
    invert : Optional[bool]
        If True, the statements that are not of the given type
        are returned. Default: False
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    invert = kwargs.get('invert', False)
    logger.info('Filtering %d statements for type %s%s...' %
                (len(stmts_in), 'not ' if invert else '',
                 stmt_type.__name__))
    if not invert:
        stmts_out = [st for st in stmts_in if isinstance(st, stmt_type)]
    else:
        stmts_out = [st for st in stmts_in if not isinstance(st, stmt_type)]
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

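# Example: keep only Phosphorylations, or everything except Modifications
# (the statement classes come from indra.statements, imported above):
#
#     phos_stmts = filter_by_type(stmts, Phosphorylation)
#     non_mod_stmts = filter_by_type(stmts, Modification, invert=True)
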
def filter_grounded_only(stmts_in, **kwargs):
    """Filter to statements that have grounded agents.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements for grounded agents...' %
                len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        grounded = True
        for agent in st.agent_list():
            if agent is not None:
                if (not agent.db_refs) or \
                   ((len(agent.db_refs) == 1) and agent.db_refs.get('TEXT')):
                    grounded = False
                    break
        if grounded:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_genes_only(stmts_in, **kwargs):
    """Filter to statements containing genes only.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    specific_only : Optional[bool]
        If True, only elementary genes/proteins will be kept and families
        will be filtered out. If False, families are also included in the
        output. Default: False
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    specific_only = kwargs.get('specific_only')
    logger.info('Filtering %d statements for ones containing genes only...'
                % len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        genes_only = True
        for agent in st.agent_list():
            if agent is not None:
                if not specific_only:
                    if not (agent.db_refs.get('HGNC') or
                            agent.db_refs.get('UP') or
                            agent.db_refs.get('BE')):
                        genes_only = False
                        break
                else:
                    if not (agent.db_refs.get('HGNC') or
                            agent.db_refs.get('UP')):
                        genes_only = False
                        break
        if genes_only:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_belief(stmts_in, belief_cutoff, **kwargs):
    """Filter to statements with belief above a given cutoff.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    belief_cutoff : float
        Only statements with belief above the belief_cutoff will be
        returned. Here 0 < belief_cutoff < 1.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    dump_pkl = kwargs.get('save')
    logger.info('Filtering %d statements to above %f belief' %
                (len(stmts_in), belief_cutoff))
    # The first round of filtering is in the top-level list
    stmts_out = []
    for stmt in stmts_in:
        if stmt.belief >= belief_cutoff:
            stmts_out.append(stmt)
        else:
            continue
        # Now we eliminate supports/supported-by below the cutoff
        supp_by = []
        supp = []
        for st in stmt.supports:
            if st.belief >= belief_cutoff:
                supp.append(st)
        for st in stmt.supported_by:
            if st.belief >= belief_cutoff:
                supp_by.append(st)
        stmt.supports = supp
        stmt.supported_by = supp_by
    logger.info('%d statements after filter...' % len(stmts_out))
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

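# Example with an illustrative cutoff; any value strictly between 0 and 1
# is valid:
#
#     high_belief_stmts = filter_belief(stmts, 0.95)
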
def filter_gene_list(stmts_in, gene_list, policy, allow_families=False,
                     **kwargs):
    """Return statements that contain genes given in a list.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    gene_list : list[str]
        A list of gene symbols to filter for.
    policy : str
        The policy to apply when filtering for the list of genes.
        "one": keep statements that contain at least one of the list of
        genes and possibly others not in the list
        "all": keep statements that only contain genes given in the list
    allow_families : Optional[bool]
        Will include statements involving Bioentities families containing
        one of the genes in the gene list. Default: False
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    if policy not in ('one', 'all'):
        logger.error('Policy %s is invalid, not applying filter.' % policy)
    else:
        genes_str = ', '.join(gene_list)
        logger.info('Filtering %d statements for ones containing "%s" of: '
                    '%s...' % (len(stmts_in), policy, genes_str))

    # If we're allowing families, make a list of all Bioentities IDs that
    # contain members of the gene list, and add them to the filter list
    filter_list = copy(gene_list)
    if allow_families:
        for hgnc_name in gene_list:
            gene_uri = hierarchies['entity'].get_uri('HGNC', hgnc_name)
            parents = hierarchies['entity'].get_parents(gene_uri)
            for par_uri in parents:
                ns, id = hierarchies['entity'].ns_id_from_uri(par_uri)
                filter_list.append(id)
    stmts_out = []
    if policy == 'one':
        for st in stmts_in:
            found_gene = False
            for agent in st.agent_list():
                if agent is not None:
                    if agent.name in filter_list:
                        found_gene = True
                        break
            if found_gene:
                stmts_out.append(st)
    elif policy == 'all':
        for st in stmts_in:
            found_genes = True
            for agent in st.agent_list():
                if agent is not None:
                    if agent.name not in filter_list:
                        found_genes = False
                        break
            if found_genes:
                stmts_out.append(st)
    else:
        stmts_out = stmts_in
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

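# Example (gene symbols are illustrative); with allow_families=True,
# statements about Bioentities families that contain BRAF or MAP2K1 are
# also retained:
#
#     stmts = filter_gene_list(stmts, ['BRAF', 'MAP2K1'], 'one',
#                              allow_families=True)
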
def filter_human_only(stmts_in, **kwargs):
    """Filter out statements that are not grounded to human genes.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    dump_pkl = kwargs.get('save')
    logger.info('Filtering %d statements for human genes only...' %
                len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        human_genes = True
        for agent in st.agent_list():
            if agent is not None:
                upid = agent.db_refs.get('UP')
                if upid and not uniprot_client.is_human(upid):
                    human_genes = False
                    break
        if human_genes:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_direct(stmts_in, **kwargs):
    """Filter to statements that are direct interactions.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    def get_is_direct(stmt):
        """Returns true if there is evidence that the statement is a direct
        interaction.

        If any of the evidences associated with the statement
        indicates a direct interaction then we assume the interaction
        is direct. If there is no evidence for the interaction being indirect
        then we default to direct.
        """
        any_indirect = False
        for ev in stmt.evidence:
            if ev.epistemics.get('direct') is True:
                return True
            elif ev.epistemics.get('direct') is False:
                # This guarantees that we have seen at least
                # some evidence that the statement is indirect
                any_indirect = True
        if any_indirect:
            return False
        return True
    logger.info('Filtering %d statements to direct ones...' % len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        if get_is_direct(st):
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_no_hypothesis(stmts_in, **kwargs):
    """Filter to statements that are not marked as hypothesis in epistemics.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to no hypothesis...' % len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        all_hypotheses = True
        ev = None
        for ev in st.evidence:
            if not ev.epistemics.get('hypothesis', False):
                all_hypotheses = False
                break
        if ev is None:
            all_hypotheses = False
        if not all_hypotheses:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_evidence_source(stmts_in, source_apis, policy='one', **kwargs):
    """Filter to statements that have evidence from a given set of sources.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    source_apis : list[str]
        A list of sources to filter for. Examples: biopax, bel, reach
    policy : Optional[str]
        If 'one', a statement that has evidence from any of the sources is
        kept. If 'all', only those statements are kept which have evidence
        from all the input sources specified in source_apis.
        If 'none', only those statements are kept that don't have evidence
        from any of the sources specified in source_apis.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to evidence source "%s" of: %s...' %
                (len(stmts_in), policy, ', '.join(source_apis)))
    stmts_out = []
    for st in stmts_in:
        sources = set([ev.source_api for ev in st.evidence])
        if policy == 'one':
            if sources.intersection(source_apis):
                stmts_out.append(st)
        if policy == 'all':
            if sources.intersection(source_apis) == set(source_apis):
                stmts_out.append(st)
        if policy == 'none':
            if not sources.intersection(source_apis):
                stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

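# Sketch of the three policies, using source names from the docstring:
#
#     reach_any = filter_evidence_source(stmts, ['reach'], policy='one')
#     reach_and_bel = filter_evidence_source(stmts, ['reach', 'bel'],
#                                            policy='all')
#     no_biopax = filter_evidence_source(stmts, ['biopax'], policy='none')
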
def filter_top_level(stmts_in, **kwargs):
    """Filter to statements that are at the top-level of the hierarchy.

    Here top-level statements correspond to most specific ones.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements for top-level...' % len(stmts_in))
    stmts_out = [st for st in stmts_in if not st.supports]
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_inconsequential_mods(stmts_in, whitelist=None, **kwargs):
    """Filter out Modifications that modify inconsequential sites.

    Inconsequential here means that the site is not mentioned / tested
    in any other statement. In some cases specific sites should be
    preserved, for instance, to be used as readouts in a model.
    In this case, the given sites can be passed in a whitelist.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    whitelist : Optional[dict]
        A whitelist containing agent modification sites whose
        modifications should be preserved even if no other statement
        refers to them. The whitelist parameter is a dictionary in which
        the key is a gene name and the value is a list of tuples of
        (modification_type, residue, position). Example:
        whitelist = {'MAP2K1': [('phosphorylation', 'S', '222')]}
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    if whitelist is None:
        whitelist = {}
    logger.info('Filtering %d statements to remove' % len(stmts_in) +
                ' inconsequential modifications...')
    states_used = whitelist
    for stmt in stmts_in:
        for agent in stmt.agent_list():
            if agent is not None:
                if agent.mods:
                    for mc in agent.mods:
                        mod = (mc.mod_type, mc.residue, mc.position)
                        try:
                            states_used[agent.name].append(mod)
                        except KeyError:
                            states_used[agent.name] = [mod]
    for k, v in states_used.items():
        states_used[k] = list(set(v))
    stmts_out = []
    for stmt in stmts_in:
        skip = False
        if isinstance(stmt, Modification):
            mod_type = modclass_to_modtype[stmt.__class__]
            if isinstance(stmt, RemoveModification):
                mod_type = modtype_to_inverse[mod_type]
            mod = (mod_type, stmt.residue, stmt.position)
            used = states_used.get(stmt.sub.name, [])
            if mod not in used:
                skip = True
        if not skip:
            stmts_out.append(stmt)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

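# Example using the whitelist format from the docstring to preserve a
# specific site as a model readout:
#
#     whitelist = {'MAP2K1': [('phosphorylation', 'S', '222')]}
#     stmts = filter_inconsequential_mods(stmts, whitelist=whitelist)
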
def filter_inconsequential_acts(stmts_in, whitelist=None, **kwargs):
    """Filter out Activations that regulate inconsequential activities.

    Inconsequential here means that the activity is not mentioned / tested
    in any other statement. In some cases specific activity types should be
    preserved, for instance, to be used as readouts in a model.
    In this case, the given activities can be passed in a whitelist.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    whitelist : Optional[dict]
        A whitelist containing agent activity types which should be
        preserved even if no other statement refers to them. The whitelist
        parameter is a dictionary in which the key is a gene name and the
        value is a list of activity types. Example:
        whitelist = {'MAP2K1': ['kinase']}
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    if whitelist is None:
        whitelist = {}
    logger.info('Filtering %d statements to remove' % len(stmts_in) +
                ' inconsequential activations...')
    states_used = whitelist
    for stmt in stmts_in:
        for agent in stmt.agent_list():
            if agent is not None:
                if agent.activity:
                    act = agent.activity.activity_type
                    try:
                        states_used[agent.name].append(act)
                    except KeyError:
                        states_used[agent.name] = [act]
    for k, v in states_used.items():
        states_used[k] = list(set(v))
    stmts_out = []
    for stmt in stmts_in:
        skip = False
        if isinstance(stmt, RegulateActivity):
            used = states_used.get(stmt.obj.name, [])
            if stmt.obj_activity not in used:
                skip = True
        if not skip:
            stmts_out.append(stmt)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_mutation_status(stmts_in, mutations, deletions, **kwargs):
    """Filter statements based on existing mutations/deletions.

    This filter helps to contextualize a set of statements to a given
    cell type. Given a list of deleted genes, it removes statements that
    refer to these genes. It also takes a list of mutations and removes
    statements that refer to mutations not relevant for the given context.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    mutations : dict
        A dictionary whose keys are gene names, and the values are lists of
        tuples of the form (residue_from, position, residue_to).
        Example: mutations = {'BRAF': [('V', '600', 'E')]}
    deletions : list
        A list of gene names that are deleted.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements for mutation status...' %
                len(stmts_in))
    stmts_out = []
    for stmt in stmts_in:
        skip = False
        for agent in stmt.agent_list():
            if agent is not None and agent.name in deletions:
                skip = True
                break
            if agent is not None and agent.mutations:
                muts = mutations.get(agent.name, [])
                for mut in agent.mutations:
                    mut_tup = (mut.residue_from, mut.position,
                               mut.residue_to)
                    if mut_tup not in muts:
                        skip = True
                if skip:
                    break
        if not skip:
            stmts_out.append(stmt)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

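# Example contextualizing statements to a hypothetical cell line carrying
# BRAF V600E with PTEN deleted (formats follow the docstring):
#
#     mutations = {'BRAF': [('V', '600', 'E')]}
#     deletions = ['PTEN']
#     stmts = filter_mutation_status(stmts, mutations, deletions)
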
def filter_enzyme_kinase(stmts_in, **kwargs):
    """Filter Phosphorylations to ones where the enzyme is a known kinase.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'phosphorylation by non-kinases...')
    path = os.path.dirname(os.path.abspath(__file__))
    kinase_table = read_unicode_csv(path + '/../resources/kinases.tsv',
                                    delimiter='\t')
    gene_names = [lin[1] for lin in list(kinase_table)[1:]]
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, Phosphorylation):
            if st.enz is not None:
                if st.enz.name in gene_names:
                    stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_mod_nokinase(stmts_in, **kwargs):
    """Filter non-phospho Modifications to ones with a non-kinase enzyme.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'non-phospho modifications by kinases...')
    path = os.path.dirname(os.path.abspath(__file__))
    kinase_table = read_unicode_csv(path + '/../resources/kinases.tsv',
                                    delimiter='\t')
    gene_names = [lin[1] for lin in list(kinase_table)[1:]]
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, Modification) and not \
                isinstance(st, Phosphorylation):
            if st.enz is not None:
                if st.enz.name not in gene_names:
                    stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_transcription_factor(stmts_in, **kwargs):
    """Filter out RegulateAmounts where subject is not a transcription factor.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'amount regulations by non-transcription-factors...')
    path = os.path.dirname(os.path.abspath(__file__))
    tf_table = \
        read_unicode_csv(path + '/../resources/transcription_factors.csv')
    gene_names = [lin[1] for lin in list(tf_table)[1:]]
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, RegulateAmount):
            if st.subj is not None:
                if st.subj.name in gene_names:
                    stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def filter_uuid_list(stmts_in, uuids, **kwargs):
    """Filter to Statements corresponding to given UUIDs.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    uuids : list[str]
        A list of UUIDs to filter for.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements for %d UUID%s...' %
                (len(stmts_in), len(uuids), 's' if len(uuids) > 1 else ''))
    stmts_out = []
    for st in stmts_in:
        if st.uuid in uuids:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def expand_families(stmts_in, **kwargs):
    """Expand Bioentities Agents to individual genes.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to expand.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of expanded statements.
    """
    logger.info('Expanding families on %d statements...' % len(stmts_in))
    expander = Expander(hierarchies)
    stmts_out = expander.expand_families(stmts_in)
    logger.info('%d statements after expanding families...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def reduce_activities(stmts_in, **kwargs):
    """Reduce the activity types in a list of statements.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to reduce activity types in.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of reduced activity statements.
    """
    logger.info('Reducing activities on %d statements...' % len(stmts_in))
    stmts_out = [deepcopy(st) for st in stmts_in]
    ml = MechLinker(stmts_out)
    ml.gather_explicit_activities()
    ml.reduce_activities()
    stmts_out = ml.statements
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def strip_agent_context(stmts_in, **kwargs):
    """Strip any context on agents within each statement.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements whose agent context should be stripped.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of stripped statements.
    """
    logger.info('Stripping agent context on %d statements...' % len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        new_st = deepcopy(st)
        for agent in new_st.agent_list():
            if agent is None:
                continue
            agent.mods = []
            agent.mutations = []
            agent.activity = None
            agent.location = None
            agent.bound_conditions = []
        stmts_out.append(new_st)
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out

def dump_stmt_strings(stmts, fname):
    """Save printed statements in a file.

    Parameters
    ----------
    stmts : list[indra.statements.Statement]
        A list of statements to save in a text file.
    fname : str
        The name of a text file to save the printed statements into.
    """
    with open(fname, 'wb') as fh:
        for st in stmts:
            fh.write(('%s\n' % st).encode('utf-8'))

if __name__ == '__main__':
    if len(sys.argv) < 3:
        logger.error('Usage: assemble_corpus.py <pickle_file> '
                     '<output_folder>')
        sys.exit()
    stmts_fname = sys.argv[1]
    out_folder = sys.argv[2]

    stmts = load_statements(stmts_fname)
    logger.info('All statements: %d' % len(stmts))

    cache_pkl = os.path.join(out_folder, 'mapped_stmts.pkl')
    options = {'save': cache_pkl, 'do_rename': True}
    stmts = map_grounding(stmts, **options)

    cache_pkl = os.path.join(out_folder, 'sequence_valid_stmts.pkl')
    options = {'save': cache_pkl}
    mapped_stmts = map_sequence(stmts, **options)

    be = BeliefEngine()
    pa = Preassembler(hierarchies, mapped_stmts)

    cache_pkl = os.path.join(out_folder, 'unique_stmts.pkl')
    options = {'save': cache_pkl}
    unique_stmts = run_preassembly_duplicate(pa, be, **options)

    cache_pkl = os.path.join(out_folder, 'top_stmts.pkl')
    options = {'save': cache_pkl}
    stmts = run_preassembly_related(pa, be, **options)