from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import sys
try:
# Python 2
import cPickle as pickle
except ImportError:
# Python 3
import pickle
import logging
from copy import deepcopy, copy
from indra.statements import *
from indra.belief import BeliefEngine
from indra.util import read_unicode_csv
from indra.databases import uniprot_client
from indra.mechlinker import MechLinker
from indra.preassembler import Preassembler
from indra.tools.expand_families import Expander
from indra.preassembler.hierarchy_manager import hierarchies
from indra.preassembler.grounding_mapper import GroundingMapper
from indra.preassembler.grounding_mapper import gm as grounding_map
from indra.preassembler.sitemapper import SiteMapper, default_site_map
# Logger used by all of the pipeline functions in this module
logger = logging.getLogger('assemble_corpus')
# Logger.setLevel() returns None, so it must not be chained onto getLogger():
# fetch the package-level logger first, then raise its verbosity to DEBUG.
indra_logger = logging.getLogger('indra')
indra_logger.setLevel(logging.DEBUG)
def _filter(kwargs, arg_list):
    """Return the entries of kwargs whose keys are listed in arg_list.

    Parameters
    ----------
    kwargs : dict
        The keyword arguments to select from.
    arg_list : list[str]
        The names of the arguments that should be kept.

    Returns
    -------
    dict
        A new dictionary holding only the key/value pairs of kwargs whose
        key is contained in arg_list.
    """
    return dict((name, value) for name, value in kwargs.items()
                if name in arg_list)
def dump_statements(stmts, fname):
    """Pickle a collection of statements into a file.

    Parameters
    ----------
    stmts : list
        The statements to serialize. Only its length (for logging) and its
        picklability are relied upon here.
    fname : str
        The name of the pickle file to dump statements into.
    """
    running_py2 = sys.version_info[0] < 3
    if running_py2:
        logger.warning('Files pickled in Python 2 may be incompatible with '
                       'Python 3')
    message = 'Dumping %d statements into %s...' % (len(stmts), fname)
    logger.info(message)
    # Protocol 2 is used so the file stays readable from Python 2 as well
    with open(fname, 'wb') as pkl_file:
        pickle.dump(stmts, pkl_file, protocol=2)
def load_statements(fname, as_dict=False):
    """Read statements back from a pickle file.

    Parameters
    ----------
    fname : str
        The name of the pickle file to load statements from.
    as_dict : Optional[bool]
        If True and the pickle file contains a dictionary of statements, the
        dictionary is returned unchanged. If False, a dictionary is flattened
        by concatenating all of its values into one list. Default: False

    Returns
    -------
    stmts : list or dict
        The statements that were loaded.
    """
    logger.info('Loading %s...' % fname)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fname, 'rb') as pkl_file:
        if sys.version_info[0] >= 3:
            # latin1 decoding keeps pickles written by Python 2 loadable
            loaded = pickle.load(pkl_file, encoding='latin1')
        else:
            # Python 2's pickle.load has no encoding argument
            loaded = pickle.load(pkl_file)

    if isinstance(loaded, dict):
        if as_dict:
            return loaded
        # Flatten the per-key statement lists into a single list
        loaded = [st for st_list in loaded.values() for st in st_list]
    logger.info('Loaded %d statements' % len(loaded))
    return loaded
def map_grounding(stmts_in, **kwargs):
    """Run the GroundingMapper over a list of statements.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_rename : Optional[bool]
        Passed on to GroundingMapper.map_agents. If missing or None, True
        is used.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The statements returned by GroundingMapper.map_agents.
    """
    logger.info('Mapping grounding on %d statements...' % len(stmts_in))
    rename = kwargs.get('do_rename')
    # An explicit None is treated exactly like the option being absent
    rename = True if rename is None else rename
    mapper = GroundingMapper(grounding_map)
    stmts_out = mapper.map_agents(stmts_in, do_rename=rename)
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def map_sequence(stmts_in, **kwargs):
    """Validate and map modification sites using the SiteMapper.

    Statements reported as valid by SiteMapper.map_sites are kept as they
    are. Of the mapped ones, only those for which every entry in
    mapped_mods has a non-None second element are kept, and their
    mapped_stmt is what goes into the output.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to map.
    do_methionine_offset : boolean
        Whether to check for off-by-one errors in site position (possibly)
        attributable to site numbering from mature proteins after
        cleavage of the initial methionine. Only forwarded to
        SiteMapper.map_sites if given; otherwise that method's own default
        applies.
    do_orthology_mapping : boolean
        Whether to check sequence positions for known modification sites
        in mouse or rat sequences. Only forwarded to SiteMapper.map_sites
        if given; otherwise that method's own default applies.
    do_isoform_mapping : boolean
        Whether to check sequence positions for known modifications
        in other human isoforms of the protein. Only forwarded to
        SiteMapper.map_sites if given; otherwise that method's own default
        applies.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The valid statements followed by the successfully mapped ones.
    """
    logger.info('Mapping sites on %d statements...' % len(stmts_in))
    site_option_names = ['do_methionine_offset', 'do_orthology_mapping',
                         'do_isoform_mapping']
    mapper = SiteMapper(default_site_map)
    site_options = _filter(kwargs, site_option_names)
    valid, mapped = mapper.map_sites(stmts_in, **site_options)
    # Keep a mapped statement only if each of its sites got a mapping
    fully_mapped = [ms.mapped_stmt for ms in mapped
                    if all(mm[1] is not None for mm in ms.mapped_mods)]
    stmts_out = valid + fully_mapped
    logger.info('%d statements with valid sites' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def run_preassembly(stmts_in, **kwargs):
    """Run both preassembly stages on a list of statements.

    A fresh BeliefEngine and Preassembler are created, the duplicate
    combination stage is run first and then the related-statement stage,
    whose result is returned.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to preassemble.
    return_toplevel : Optional[bool]
        Forwarded to run_preassembly_related. Default: True
    poolsize : Optional[int]
        Forwarded to run_preassembly_related. Default: None
    size_cutoff : Optional[int]
        Forwarded to run_preassembly_related. Default: 100
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.
    save_unique : Optional[str]
        The name of a pickle file to save the unique statements into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The statements returned by run_preassembly_related.
    """
    belief_engine = BeliefEngine()
    preassembler = Preassembler(hierarchies, stmts_in)
    # Stage 1: combine duplicates (optionally pickling the unique statements)
    run_preassembly_duplicate(preassembler, belief_engine,
                              save=kwargs.get('save_unique'))
    # Stage 2: relate the statements, forwarding the remaining options
    related_options = {
        'save': kwargs.get('save'),
        'return_toplevel': kwargs.get('return_toplevel', True),
        'poolsize': kwargs.get('poolsize', None),
        'size_cutoff': kwargs.get('size_cutoff', 100),
    }
    return run_preassembly_related(preassembler, belief_engine,
                                   **related_options)
def run_preassembly_duplicate(preassembler, beliefengine, **kwargs):
    """Run the deduplication stage of preassembly.

    Parameters
    ----------
    preassembler : indra.preassembler.Preassembler
        A Preassembler instance whose combine_duplicates method is called.
    beliefengine : indra.belief.BeliefEngine
        A BeliefEngine instance whose set_prior_probs method is called on
        the unique statements.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of unique statements.
    """
    n_input = len(preassembler.stmts)
    logger.info('Combining duplicates on %d statements...' % n_input)
    save_path = kwargs.get('save')
    unique_stmts = preassembler.combine_duplicates()
    beliefengine.set_prior_probs(unique_stmts)
    logger.info('%d unique statements' % len(unique_stmts))
    if save_path:
        dump_statements(unique_stmts, save_path)
    return unique_stmts
def filter_by_type(stmts_in, stmt_type, **kwargs):
    """Keep only statements that are (or are not) of a given type.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    stmt_type : indra.statements.Statement
        The class of the statement type to filter for; membership is
        decided with isinstance, so subclasses match too.
        Example: indra.statements.Modification
    invert : Optional[bool]
        If True, the statements that are not of the given type
        are returned. Default: False
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    invert = kwargs.get('invert', False)
    negation = 'not ' if invert else ''
    logger.info('Filtering %d statements for type %s%s...' %
                (len(stmts_in), negation, stmt_type.__name__))
    # A statement is kept when its type match disagrees with the invert flag
    inverted = bool(invert)
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, stmt_type) != inverted:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_grounded_only(stmts_in, **kwargs):
    """Keep only statements in which every agent is grounded.

    An agent counts as ungrounded if its db_refs is empty, or if db_refs
    has exactly one entry and that entry is a non-empty 'TEXT' value.
    None agents are ignored.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    def is_ungrounded(agent):
        refs = agent.db_refs
        if not refs:
            return True
        return len(refs) == 1 and bool(refs.get('TEXT'))

    logger.info('Filtering %d statements for grounded agents...' %
                len(stmts_in))
    stmts_out = [
        st for st in stmts_in
        if not any(agent is not None and is_ungrounded(agent)
                   for agent in st.agent_list())
    ]
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_genes_only(stmts_in, **kwargs):
    """Keep only statements whose agents are all genes.

    An agent is accepted if its db_refs has a truthy 'HGNC' or 'UP' entry,
    or, unless specific_only is set, a truthy 'BE' entry. None agents are
    ignored.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    specific_only : Optional[bool]
        If True, only elementary genes/proteins will be kept and families
        will be filtered out. If False, families are also included in the
        output. Default: False
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    specific_only = kwargs.get('specific_only')
    logger.info('Filtering %d statements for ones containing genes only...' %
                len(stmts_in))
    # Namespaces whose presence in db_refs qualifies an agent as a gene
    if specific_only:
        gene_namespaces = ('HGNC', 'UP')
    else:
        gene_namespaces = ('HGNC', 'UP', 'BE')

    def is_gene(agent):
        return any(agent.db_refs.get(ns) for ns in gene_namespaces)

    stmts_out = [
        st for st in stmts_in
        if all(agent is None or is_gene(agent) for agent in st.agent_list())
    ]
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_belief(stmts_in, belief_cutoff, **kwargs):
    """Keep statements whose belief reaches a given cutoff.

    Note that each statement that is kept is modified in place: its
    supports and supported_by lists are replaced by new lists holding only
    the entries whose belief also reaches the cutoff.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    belief_cutoff : float
        Only statements with belief greater than or equal to belief_cutoff
        will be returned.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    save_path = kwargs.get('save')
    logger.info('Filtering %d statements to above %f belief' %
                (len(stmts_in), belief_cutoff))

    def confident(candidates):
        return [st for st in candidates if st.belief >= belief_cutoff]

    stmts_out = []
    for stmt in stmts_in:
        if not stmt.belief >= belief_cutoff:
            continue
        stmts_out.append(stmt)
        # Prune low-belief statements from both support relations; both
        # lists are computed before either attribute is reassigned.
        kept_supports = confident(stmt.supports)
        kept_supported_by = confident(stmt.supported_by)
        stmt.supports = kept_supports
        stmt.supported_by = kept_supported_by
    logger.info('%d statements after filter...' % len(stmts_out))
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_gene_list(stmts_in, gene_list, policy, allow_families=False,
                     **kwargs):
    """Return statements that contain genes given in a list.

    Agents are matched by their name attribute; None agents are ignored.
    If policy is neither "one" nor "all", an error is logged and the input
    list itself is returned unfiltered.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    gene_list : list[str]
        A list of gene symbols to filter for. It is not modified.
    policy : str
        The policy to apply when filtering for the list of genes.
        "one": keep statements that contain at least one of the listed
        genes and possibly others not in the list.
        "all": keep statements that only contain genes given in the list.
    allow_families : Optional[bool]
        If True, the IDs of the parents of each listed gene in the entity
        hierarchy are also accepted as agent names, so that statements
        involving families containing one of the genes are included.
        Default: False
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    if policy not in ('one', 'all'):
        logger.error('Policy %s is invalid, not applying filter.' % policy)
    else:
        genes_str = ', '.join(gene_list)
        logger.info('Filtering %d statements for ones containing "%s" of: '
                    '%s...' % (len(stmts_in), policy, genes_str))

    # A set gives constant-time name lookups in the per-agent checks below
    accepted_names = set(gene_list)
    # If we're allowing families, collect the IDs of all hierarchy parents
    # of the listed genes and accept those names as well
    if allow_families:
        entity_hierarchy = hierarchies['entity']
        for hgnc_name in gene_list:
            gene_uri = entity_hierarchy.get_uri('HGNC', hgnc_name)
            for parent_uri in entity_hierarchy.get_parents(gene_uri):
                _, parent_id = entity_hierarchy.ns_id_from_uri(parent_uri)
                accepted_names.add(parent_id)

    if policy == 'one':
        stmts_out = [
            st for st in stmts_in
            if any(agent is not None and agent.name in accepted_names
                   for agent in st.agent_list())
        ]
    elif policy == 'all':
        stmts_out = [
            st for st in stmts_in
            if all(agent is None or agent.name in accepted_names
                   for agent in st.agent_list())
        ]
    else:
        # Invalid policy: hand back the input unfiltered
        stmts_out = stmts_in
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def filter_human_only(stmts_in, **kwargs):
    """Drop statements that involve a non-human UniProt-grounded agent.

    A statement is removed if any of its non-None agents has a 'UP' entry
    in db_refs for which uniprot_client.is_human returns a false value.
    Agents without a 'UP' entry never cause removal.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    def is_nonhuman(agent):
        up_id = agent.db_refs.get('UP')
        return bool(up_id) and not uniprot_client.is_human(up_id)

    save_path = kwargs.get('save')
    logger.info('Filtering %d statements for human genes only...' %
                len(stmts_in))
    stmts_out = [
        st for st in stmts_in
        if not any(agent is not None and is_nonhuman(agent)
                   for agent in st.agent_list())
    ]
    logger.info('%d statements after filter...' % len(stmts_out))
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_direct(stmts_in, **kwargs):
    """Keep only statements that are considered direct interactions.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    def get_is_direct(stmt):
        """Decide from the evidence whether a statement is direct.

        A single evidence whose 'direct' epistemic is True makes the
        statement direct. Otherwise the statement is indirect only if some
        evidence has 'direct' explicitly set to False; with no information
        either way it defaults to direct.
        """
        seen_indirect = False
        for ev in stmt.evidence:
            direct_flag = ev.epistemics.get('direct')
            if direct_flag is True:
                return True
            if direct_flag is False:
                seen_indirect = True
        return not seen_indirect

    logger.info('Filtering %d statements to direct ones...' % len(stmts_in))
    stmts_out = [st for st in stmts_in if get_is_direct(st)]
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_no_hypothesis(stmts_in, **kwargs):
    """Drop statements for which every evidence is marked as hypothesis.

    A statement is kept if at least one of its evidences has no truthy
    'hypothesis' entry in its epistemics, or if it has no evidence at all.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to no hypothesis...' % len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        n_evidence = 0
        for ev in st.evidence:
            n_evidence += 1
            if not ev.epistemics.get('hypothesis', False):
                # One non-hypothesis evidence is enough to keep it
                stmts_out.append(st)
                break
        else:
            # Only hypotheses were seen; keep the statement solely when
            # there was no evidence to look at in the first place
            if n_evidence == 0:
                stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_evidence_source(stmts_in, source_apis, policy='one', **kwargs):
    """Filter statements by the sources their evidence comes from.

    The source_api attributes of a statement's evidences are compared with
    source_apis according to the policy. With any other policy value no
    statement is kept.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    source_apis : list[str]
        A list of sources to filter for. Examples: biopax, bel, reach
    policy : Optional[str]
        If 'one', a statement that has evidence from any of the sources is
        kept. If 'all', only those statements are kept which have evidence
        from all the input sources specified in source_apis.
        If 'none', only those statements are kept that don't have evidence
        from any of the sources specified in source_apis.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to evidence source "%s" of: %s...' %
                (len(stmts_in), policy, ', '.join(source_apis)))
    requested = set(source_apis)
    stmts_out = []
    for st in stmts_in:
        # Requested sources that actually occur among this statement's
        # evidences
        matched = set(ev.source_api for ev in st.evidence) & requested
        if policy == 'one':
            keep = bool(matched)
        elif policy == 'all':
            keep = matched == requested
        elif policy == 'none':
            keep = not matched
        else:
            keep = False
        if keep:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_top_level(stmts_in, **kwargs):
    """Keep only the statements that are at the top level of the hierarchy.

    A statement is treated as top-level when its supports attribute is
    empty. Here top-level statements correspond to most specific ones.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements for top-level...' % len(stmts_in))
    stmts_out = []
    for st in stmts_in:
        if st.supports:
            continue
        stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_inconsequential_mods(stmts_in, whitelist=None, **kwargs):
    """Filter out Modifications that modify inconsequential sites

    Inconsequential here means that the site is not mentioned / tested
    in any other statement, i.e. it does not appear as a modification
    condition on any agent of the input statements. In some cases specific
    sites should be preserved, for instance, to be used as readouts in a
    model. In this case, the given sites can be passed in a whitelist.
    Statements that are not Modifications are always kept.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    whitelist : Optional[dict]
        A whitelist containing agent modification sites whose
        modifications should be preserved even if no other statement
        refers to them. The whitelist parameter is a dictionary in which
        the key is a gene name and the value is a list of tuples of
        (modification_type, residue, position). Example:
        whitelist = {'MAP2K1': [('phosphorylation', 'S', '222')]}
        The whitelist is only read, it is never modified.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove' % len(stmts_in) +
                ' inconsequential modifications...')
    # Start from a private copy of the whitelist (as sets) so that the sites
    # collected below never leak into the dict and lists owned by the caller
    states_used = {}
    if whitelist:
        for gene_name, mods in whitelist.items():
            states_used[gene_name] = set(mods)
    # Collect every modification state used as a condition on any agent
    for stmt in stmts_in:
        for agent in stmt.agent_list():
            if agent is None or not agent.mods:
                continue
            agent_states = states_used.setdefault(agent.name, set())
            for mc in agent.mods:
                agent_states.add((mc.mod_type, mc.residue, mc.position))
    stmts_out = []
    for stmt in stmts_in:
        if isinstance(stmt, Modification):
            mod_type = modclass_to_modtype[stmt.__class__]
            # Removing a modification is consequential if the modified
            # (inverse) state is the one that is referred to
            if isinstance(stmt, RemoveModification):
                mod_type = modtype_to_inverse[mod_type]
            mod = (mod_type, stmt.residue, stmt.position)
            if mod not in states_used.get(stmt.sub.name, ()):
                continue
        stmts_out.append(stmt)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def filter_inconsequential_acts(stmts_in, whitelist=None, **kwargs):
    """Filter out Activations that modify inconsequential activities

    Inconsequential here means that the activity type is not mentioned /
    tested in any other statement, i.e. it does not appear as an activity
    condition on any agent of the input statements. In some cases specific
    activity types should be preserved, for instance, to be used as readouts
    in a model. In this case, the given activities can be passed in a
    whitelist. Statements that are not RegulateActivity are always kept.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    whitelist : Optional[dict]
        A whitelist containing agent activity types which should be preserved
        even if no other statement refers to them.
        The whitelist parameter is a dictionary in which
        the key is a gene name and the value is a list of activity types.
        Example: whitelist = {'MAP2K1': ['kinase']}
        The whitelist is only read, it is never modified.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove' % len(stmts_in) +
                ' inconsequential activations...')
    # Start from a private copy of the whitelist (as sets) so that the
    # activities collected below never leak into the caller's dict and lists
    states_used = {}
    if whitelist:
        for gene_name, act_types in whitelist.items():
            states_used[gene_name] = set(act_types)
    # Collect every activity type used as a condition on any agent
    for stmt in stmts_in:
        for agent in stmt.agent_list():
            if agent is None or not agent.activity:
                continue
            act = agent.activity.activity_type
            states_used.setdefault(agent.name, set()).add(act)
    stmts_out = []
    for stmt in stmts_in:
        if isinstance(stmt, RegulateActivity):
            used = states_used.get(stmt.obj.name, ())
            if stmt.obj_activity not in used:
                continue
        stmts_out.append(stmt)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def filter_mutation_status(stmts_in, mutations, deletions, **kwargs):
    """Filter statements based on existing mutations/deletions

    This filter helps to contextualize a set of statements to a given
    cell type. A statement is removed if any of its agents is named in
    deletions, or if any of its agents carries a mutation whose
    (residue_from, position, residue_to) tuple is not listed for that
    agent's name in mutations.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    mutations : dict
        A dictionary whose keys are gene names, and the values are lists of
        tuples of the form (residue_from, position, residue_to).
        Example: mutations = {'BRAF': [('V', '600', 'E')]}
    deletions : list
        A list of gene names that are deleted.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    def is_excluded(agent):
        # Deleted genes rule the statement out right away
        if agent.name in deletions:
            return True
        if not agent.mutations:
            return False
        known_muts = mutations.get(agent.name, [])
        return any(
            (mut.residue_from, mut.position, mut.residue_to) not in known_muts
            for mut in agent.mutations)

    logger.info('Filtering %d statements for mutation status...' %
                len(stmts_in))
    stmts_out = [
        stmt for stmt in stmts_in
        if not any(agent is not None and is_excluded(agent)
                   for agent in stmt.agent_list())
    ]
    logger.info('%d statements after filter...' % len(stmts_out))
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def filter_enzyme_kinase(stmts_in, **kwargs):
    """Filter Phosphorylations to ones where the enzyme is a known kinase.

    Kinase names are read from the second column of resources/kinases.tsv
    (relative to the parent of this file's directory), skipping the header
    row. A Phosphorylation is kept only if it has an enzyme whose name is
    in that table; Phosphorylations without an enzyme are removed. All
    other statement types are kept.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'phosphorylation by non-kinases...')
    here = os.path.dirname(os.path.abspath(__file__))
    table_path = os.path.join(here, os.pardir, 'resources', 'kinases.tsv')
    rows = iter(read_unicode_csv(table_path, delimiter='\t'))
    next(rows, None)  # skip the header row
    # A set keeps the per-statement membership test constant-time
    kinase_names = set(row[1] for row in rows)
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, Phosphorylation):
            if st.enz is not None and st.enz.name in kinase_names:
                stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def filter_mod_nokinase(stmts_in, **kwargs):
    """Remove non-phospho Modifications whose enzyme is a kinase.

    Kinase names are read from the second column of resources/kinases.tsv
    (relative to the parent of this file's directory), skipping the header
    row. A Modification that is not a Phosphorylation is kept only if it
    has an enzyme whose name is not in that table; such Modifications
    without an enzyme are removed. Phosphorylations and all
    non-Modification statements are kept.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'non-phospho modifications by kinases...')
    here = os.path.dirname(os.path.abspath(__file__))
    table_path = os.path.join(here, os.pardir, 'resources', 'kinases.tsv')
    rows = iter(read_unicode_csv(table_path, delimiter='\t'))
    next(rows, None)  # skip the header row
    # A set keeps the per-statement membership test constant-time
    kinase_names = set(row[1] for row in rows)
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, Modification) and \
                not isinstance(st, Phosphorylation):
            if st.enz is not None and st.enz.name not in kinase_names:
                stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def filter_transcription_factor(stmts_in, **kwargs):
    """Filter out RegulateAmounts where subject is not a transcription factor.

    Transcription factor names are read from the second column of
    resources/transcription_factors.csv (relative to the parent of this
    file's directory), skipping the header row. A RegulateAmount is kept
    only if it has a subject whose name is in that table; RegulateAmounts
    without a subject are removed. All other statement types are kept.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'amount regulations by non-transcription-factors...')
    here = os.path.dirname(os.path.abspath(__file__))
    table_path = os.path.join(here, os.pardir, 'resources',
                              'transcription_factors.csv')
    rows = iter(read_unicode_csv(table_path))
    next(rows, None)  # skip the header row
    # A set keeps the per-statement membership test constant-time
    tf_names = set(row[1] for row in rows)
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, RegulateAmount):
            if st.subj is not None and st.subj.name in tf_names:
                stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def filter_uuid_list(stmts_in, uuids, **kwargs):
    """Filter to Statements corresponding to given UUIDs

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    uuids : list[str]
        A list of UUIDs to filter for.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The statements whose uuid attribute is one of the given UUIDs, in
        their input order.
    """
    # Pluralize for every count other than exactly one (including zero)
    logger.info('Filtering %d statements for %d UUID%s...' %
                (len(stmts_in), len(uuids), '' if len(uuids) == 1 else 's'))
    # A set keeps the per-statement membership test constant-time
    wanted = set(uuids)
    stmts_out = [st for st in stmts_in if st.uuid in wanted]
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
def expand_families(stmts_in, **kwargs):
    """Expand family-level Agents to individual genes using the Expander.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to expand.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The statements returned by Expander.expand_families.
    """
    logger.info('Expanding families on %d statements...' % len(stmts_in))
    stmts_out = Expander(hierarchies).expand_families(stmts_in)
    n_expanded = len(stmts_out)
    logger.info('%d statements after expanding families...' % n_expanded)
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def reduce_activities(stmts_in, **kwargs):
    """Reduce the activity types in a list of statements.

    The input statements are deep-copied first, so the MechLinker operates
    on copies and the statements passed in are left untouched.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to reduce activity types in.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The statements held by the MechLinker after reduction.
    """
    logger.info('Reducing activities on %d statements...' % len(stmts_in))
    copies = []
    for st in stmts_in:
        copies.append(deepcopy(st))
    linker = MechLinker(copies)
    linker.gather_explicit_activities()
    linker.reduce_activities()
    stmts_out = linker.statements
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def strip_agent_context(stmts_in, **kwargs):
    """Return copies of the statements with all agent context removed.

    Each statement is deep-copied, and on every non-None agent of the copy
    the mods, mutations and bound_conditions are emptied and the activity
    and location are set to None. The input statements are not modified.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements whose agent context should be stripped.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of stripped statements.
    """
    def strip(agent):
        agent.mods = []
        agent.mutations = []
        agent.activity = None
        agent.location = None
        agent.bound_conditions = []

    logger.info('Stripping agent context on %d statements...' % len(stmts_in))
    stmts_out = []
    for original in stmts_in:
        stripped = deepcopy(original)
        for agent in stripped.agent_list():
            if agent is not None:
                strip(agent)
        stmts_out.append(stripped)
    save_path = kwargs.get('save')
    if save_path:
        dump_statements(stmts_out, save_path)
    return stmts_out
def dump_stmt_strings(stmts, fname):
    """Write the string form of each statement to a text file.

    Every statement is formatted with '%s', followed by a newline, and
    written UTF-8 encoded.

    Parameters
    ----------
    stmts : list[indra.statements.Statement]
        A list of statements to save in a text file.
    fname : str
        The name of a text file to save the printed statements into.
    """
    with open(fname, 'wb') as out_file:
        for stmt in stmts:
            line = '%s\n' % stmt
            out_file.write(line.encode('utf-8'))
if __name__ == '__main__':
    # Command-line entry point: load pickled statements, map grounding and
    # sites, then run both preassembly stages, caching each intermediate
    # result as a pickle file in the output folder.
    if len(sys.argv) < 3:
        logger.error('Usage: assemble_corpus.py <pickle_file> <output_folder>')
        # Exit with a non-zero status so that callers can detect the error
        sys.exit(1)
    stmts_fname = sys.argv[1]
    out_folder = sys.argv[2]

    stmts = load_statements(stmts_fname)
    logger.info('All statements: %d' % len(stmts))

    # Grounding mapping
    cache_pkl = os.path.join(out_folder, 'mapped_stmts.pkl')
    options = {'save': cache_pkl, 'do_rename': True}
    stmts = map_grounding(stmts, **options)

    # Site mapping
    cache_pkl = os.path.join(out_folder, 'sequence_valid_stmts.pkl')
    options = {'save': cache_pkl}
    mapped_stmts = map_sequence(stmts, **options)

    # The same Preassembler and BeliefEngine are shared by both stages
    be = BeliefEngine()
    pa = Preassembler(hierarchies, mapped_stmts)

    # Preassembly stage 1: combine duplicates
    cache_pkl = os.path.join(out_folder, 'unique_stmts.pkl')
    options = {'save': cache_pkl}
    unique_stmts = run_preassembly_duplicate(pa, be, **options)

    # Preassembly stage 2: relate statements
    cache_pkl = os.path.join(out_folder, 'top_stmts.pkl')
    options = {'save': cache_pkl}
    stmts = run_preassembly_related(pa, be, **options)