import pickle
import logging
from indra.statements import Agent
import indra.tools.assemble_corpus as ac
from indra.databases import hgnc_client
from indra.ontology.bio import bio_ontology
logger = logging.getLogger(__name__)
class IncrementalModel(object):
"""Assemble a model incrementally by iteratively adding new Statements.
Parameters
----------
model_fname : Optional[str]
The name of a pickle file storing a dict of INDRA Statements keyed
by PubMed ID. This saved state of an IncrementalModel is loaded
upon instantiation.
Attributes
----------
stmts : dict[str, list[indra.statements.Statement]]
A dictionary of INDRA Statements keyed by PMIDs that stores the current
state of the IncrementalModel.
assembled_stmts : list[indra.statements.Statement]
A list of INDRA Statements after assembly.
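Examples
--------
A minimal workflow sketch; the PMID used below is made up for
illustration:

>>> from indra.statements import Agent, Phosphorylation
>>> stmt = Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))
>>> model = IncrementalModel()
>>> model.add_statements('12345678', [stmt])
>>> len(model.get_statements())
1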
"""
def __init__(self, model_fname=None):
if model_fname is None:
self.stmts = {}
else:
try:
with open(model_fname, 'rb') as f:
self.stmts = pickle.load(f)
except Exception:
logger.warning('Could not load %s, starting new model.' %
model_fname)
self.stmts = {}
self.prior_genes = []
self.assembled_stmts = []
def save(self, model_fname='model.pkl'):
"""Save the state of the IncrementalModel in a pickle file.
Parameters
----------
model_fname : Optional[str]
The name of the pickle file to save the state of the
IncrementalModel in. Default: model.pkl
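Examples
--------
A sketch of saving and then restoring a model; the file name is
arbitrary:

>>> model = IncrementalModel()
>>> model.save('my_model.pkl')
>>> restored = IncrementalModel('my_model.pkl')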
"""
with open(model_fname, 'wb') as fh:
pickle.dump(self.stmts, fh, protocol=4)
def add_statements(self, pmid, stmts):
"""Add INDRA Statements to the incremental model indexed by PMID.
Parameters
----------
pmid : str
The PMID of the paper from which statements were extracted.
stmts : list[indra.statements.Statement]
A list of INDRA Statements to be added to the model.
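Examples
--------
A sketch using a made-up PMID:

>>> from indra.statements import Agent, Phosphorylation
>>> stmt = Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))
>>> model = IncrementalModel()
>>> model.add_statements('12345678', [stmt])
>>> list(model.stmts)
['12345678']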
"""
if pmid not in self.stmts:
self.stmts[pmid] = stmts
else:
self.stmts[pmid] += stmts
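# Apply the 'prior_one'/'prior_all' relevance filters using Agents
# built from self.prior_genes; if no filters are given, the input
# statements are returned unchanged.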
def _relevance_filter(self, stmts, filters=None):
if filters is None:
return stmts
logger.info('Running relevance filter on %d statements' % len(stmts))
prior_agents = get_gene_agents(self.prior_genes)
if 'prior_all' in filters:
stmts = _ref_agents_all_filter(stmts, prior_agents)
elif 'prior_one' in filters:
stmts = _ref_agents_one_filter(stmts, prior_agents)
logger.info('%d statements after relevance filter' % len(stmts))
return stmts
def preassemble(self, filters=None, grounding_map=None):
"""Preassemble the Statements collected in the model.
Use INDRA's GroundingMapper, Preassembler and BeliefEngine
on the IncrementalModel and store the resulting unique statements
in the assembled_stmts attribute.
Currently the following filter options are implemented:
- grounding: require that all Agents in statements are grounded
- human_only: require that all proteins are human proteins
- prior_one: require that at least one Agent is in the prior model
- prior_all: require that all Agents are in the prior model
Parameters
----------
filters : Optional[list[str]]
A list of filter options to apply when choosing the statements.
See description above for more details. Default: None
grounding_map : Optional[dict]
A user supplied grounding map which maps a string to a
dictionary of database IDs (in the format used by Agents'
db_refs).
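Examples
--------
A sketch of running preassembly with grounding and species filters;
the PMID and evidence text are made up:

>>> from indra.statements import Agent, Phosphorylation, Evidence
>>> map2k1 = Agent('MAP2K1', db_refs={'HGNC': '6840'})
>>> mapk1 = Agent('MAPK1', db_refs={'HGNC': '6871'})
>>> ev = Evidence(source_api='reach', pmid='12345678',
...               text='MAP2K1 phosphorylates MAPK1.')
>>> model = IncrementalModel()
>>> model.add_statements('12345678',
...                      [Phosphorylation(map2k1, mapk1, evidence=[ev])])
>>> model.preassemble(filters=['grounding', 'human_only'])
>>> assembled = model.assembled_stmts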
"""
stmts = self.get_statements()
# Filter out hypotheses
stmts = ac.filter_no_hypothesis(stmts)
# Fix grounding
if grounding_map is not None:
stmts = ac.map_grounding(stmts, grounding_map=grounding_map)
else:
stmts = ac.map_grounding(stmts)
if filters and ('grounding' in filters):
stmts = ac.filter_grounded_only(stmts)
# Fix sites
stmts = ac.map_sequence(stmts)
if filters and 'human_only' in filters:
stmts = ac.filter_human_only(stmts)
# Run preassembly
stmts = ac.run_preassembly(stmts, return_toplevel=False)
# Run relevance filter
stmts = self._relevance_filter(stmts, filters)
# Save Statements
self.assembled_stmts = stmts
def load_prior(self, prior_fname):
"""Load a set of prior statements from a pickle file.
The prior statements are stored in the stmts dictionary under the
special key "prior".
Parameters
----------
prior_fname : str
The name of the pickle file containing the prior Statements.
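Examples
--------
A sketch assuming prior_stmts.pkl was previously written with
indra.tools.assemble_corpus.dump_statements:

>>> model = IncrementalModel()
>>> model.load_prior('prior_stmts.pkl')
>>> prior_stmts = model.get_statements_prior()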
"""
self.stmts['prior'] = ac.load_statements(prior_fname)
def get_model_agents(self):
"""Return a list of all Agents from all Statements.
Returns
-------
agents : list[indra.statements.Agent]
A list of Agents that are in the model.
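Examples
--------
A sketch collecting Agent names from the model; the PMID is made up:

>>> from indra.statements import Agent, Phosphorylation
>>> model = IncrementalModel()
>>> model.add_statements('12345678',
...                      [Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))])
>>> sorted(a.name for a in model.get_model_agents())
['MAP2K1', 'MAPK1']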
"""
model_stmts = self.get_statements()
agents = []
for stmt in model_stmts:
for a in stmt.agent_list():
if a is not None:
agents.append(a)
return agents
def get_statements(self):
"""Return a list of all Statements in a single list.
Returns
-------
stmts : list[indra.statements.Statement]
A list of all the INDRA Statements in the model.
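Examples
--------
A sketch combining statements across two made-up PMIDs:

>>> from indra.statements import Agent, Phosphorylation, Dephosphorylation
>>> model = IncrementalModel()
>>> model.add_statements('11111111',
...                      [Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))])
>>> model.add_statements('22222222',
...                      [Dephosphorylation(Agent('DUSP6'), Agent('MAPK1'))])
>>> len(model.get_statements())
2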
"""
stmt_lists = list(self.stmts.values())
stmts = []
for s in stmt_lists:
stmts += s
return stmts
def get_statements_noprior(self):
"""Return a list of all non-prior Statements in a single list.
Returns
-------
stmts : list[indra.statements.Statement]
A list of all the INDRA Statements in the model (excluding
the prior).
"""
stmt_lists = [v for k, v in self.stmts.items() if k != 'prior']
stmts = []
for s in stmt_lists:
stmts += s
return stmts
def get_statements_prior(self):
"""Return a list of all prior Statements in a single list.
Returns
-------
stmts : list[indra.statements.Statement]
A list of all the INDRA Statements in the prior.
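Examples
--------
With no prior loaded, an empty list is returned:

>>> model = IncrementalModel()
>>> model.get_statements_prior()
[]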
"""
if self.stmts.get('prior') is not None:
return self.stmts['prior']
return []
def _get_agent_comp(agent):
# FIXME: temporarily returning dummy component
return agent.name
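# Build grounded Agents from a list of HGNC gene symbols, skipping any
# symbol that cannot be resolved to an HGNC ID.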
def get_gene_agents(gene_names):
agents = []
for gn in gene_names:
hgnc_id = hgnc_client.get_hgnc_id(gn)
if not hgnc_id:
logger.warning('Invalid HGNC gene symbol: %s' % gn)
continue
db_refs = {'HGNC': hgnc_id}
up_id = hgnc_client.get_uniprot_id(hgnc_id)
if up_id:
db_refs['UP'] = up_id
agent = Agent(gn, db_refs=db_refs)
agents.append(agent)
return agents
def _ref_agents_all_filter(stmts_in, ref_agents):
# If there is no reference, keep everything by default
if not ref_agents:
return stmts_in
stmts_out = []
# Preprocess reference Agents: collect the set of entity hierarchy
# components that appear in the reference and the set of reference
# Agent names
ref_agent_names = set()
ref_components = set()
for a in ref_agents:
comp_id = _get_agent_comp(a)
if comp_id is not None:
ref_components.add(comp_id)
ref_agent_names.add(a.name)
# Iterate over every Statement and check that each of its Agents is
# either in a component appearing in the reference, or, if it has no
# component, matches one of the reference Agents by name.
for st in stmts_in:
agents = [a for a in st.agent_list() if a is not None]
found_all = True
for st_agent in agents:
found = False
comp_id = _get_agent_comp(st_agent)
if comp_id is None:
if st_agent.name in ref_agent_names:
found = True
elif comp_id in ref_components:
found = True
if not found:
found_all = False
break
if found_all:
stmts_out.append(st)
return stmts_out
def _ref_agents_one_filter(stmts_in, ref_agents):
# If there is no reference, keep everything by default
if not ref_agents:
return stmts_in
stmts_out = []
# Preprocess reference Agents: collect the set of entity hierarchy
# components that appear in the reference and the set of reference
# Agent names
ref_agent_names = set()
ref_components = set()
for a in ref_agents:
comp_id = _get_agent_comp(a)
if comp_id is not None:
ref_components.add(comp_id)
ref_agent_names.add(a.name)
# Iterate over every Statement and check if any of its Agents is
# either in a component appearing in the reference, or, if it has no
# component, matches one of the reference Agents by name.
for st in stmts_in:
agents = [a for a in st.agent_list() if a is not None]
found = False
for st_agent in agents:
comp_id = _get_agent_comp(st_agent)
if comp_id is None:
if st_agent.name in ref_agent_names:
found = True
break
elif comp_id in ref_components:
found = True
break
if found:
stmts_out.append(st)
return stmts_out
def _agent_related(a1, a2):
return a1.matches(a2) or a1.isa(a2, bio_ontology) or \
a2.isa(a1, bio_ontology)