Source code for indra.tools.incremental_model

import pickle
import logging
from indra.statements import Agent
import indra.tools.assemble_corpus as ac
from indra.databases import hgnc_client
from indra.ontology.bio import bio_ontology

logger = logging.getLogger(__name__)


[docs]class IncrementalModel(object): """Assemble a model incrementally by iteratively adding new Statements. Parameters ---------- model_fname : Optional[str] The name of the pickle file in which a set of INDRA Statements are stored in a dict keyed by PubMed IDs. This is the state of an IncrementalModel that is loaded upon instantiation. Attributes ---------- stmts : dict[str, list[indra.statements.Statement]] A dictionary of INDRA Statements keyed by PMIDs that stores the current state of the IncrementalModel. assembled_stmts : list[indra.statements.Statement] A list of INDRA Statements after assembly. """ def __init__(self, model_fname=None): if model_fname is None: self.stmts = {} else: try: with open(model_fname, 'rb') as f: self.stmts = pickle.load(f) except: logger.warning('Could not load %s, starting new model.' % model_fname) self.stmts = {} self.prior_genes = [] self.assembled_stmts = []
[docs] def save(self, model_fname='model.pkl'): """Save the state of the IncrementalModel in a pickle file. Parameters ---------- model_fname : Optional[str] The name of the pickle file to save the state of the IncrementalModel in. Default: model.pkl """ with open(model_fname, 'wb') as fh: pickle.dump(self.stmts, fh, protocol=4)
[docs] def add_statements(self, pmid, stmts): """Add INDRA Statements to the incremental model indexed by PMID. Parameters ---------- pmid : str The PMID of the paper from which statements were extracted. stmts : list[indra.statements.Statement] A list of INDRA Statements to be added to the model. """ if pmid not in self.stmts: self.stmts[pmid] = stmts else: self.stmts[pmid] += stmts
def _relevance_filter(self, stmts, filters=None): if filters is None: return stmts logger.info('Running relevance filter on %d statements' % len(stmts)) prior_agents = get_gene_agents(self.prior_genes) if 'prior_all' in filters: stmts = _ref_agents_all_filter(stmts, prior_agents) elif 'prior_one' in filters: stmts = _ref_agents_one_filter(stmts, prior_agents) logger.info('%d statements after relevance filter' % len(stmts)) return stmts
[docs] def preassemble(self, filters=None, grounding_map=None): """Preassemble the Statements collected in the model. Use INDRA's GroundingMapper, Preassembler and BeliefEngine on the IncrementalModel and save the unique statements and the top level statements in class attributes. Currently the following filter options are implemented: - grounding: require that all Agents in statements are grounded - human_only: require that all proteins are human proteins - prior_one: require that at least one Agent is in the prior model - prior_all: require that all Agents are in the prior model Parameters ---------- filters : Optional[list[str]] A list of filter options to apply when choosing the statements. See description above for more details. Default: None grounding_map : Optional[dict] A user supplied grounding map which maps a string to a dictionary of database IDs (in the format used by Agents' db_refs). """ stmts = self.get_statements() # Filter out hypotheses stmts = ac.filter_no_hypothesis(stmts) # Fix grounding if grounding_map is not None: stmts = ac.map_grounding(stmts, grounding_map=grounding_map) else: stmts = ac.map_grounding(stmts) if filters and ('grounding' in filters): stmts = ac.filter_grounded_only(stmts) # Fix sites stmts = ac.map_sequence(stmts) if filters and 'human_only' in filters: stmts = ac.filter_human_only(stmts) # Run preassembly stmts = ac.run_preassembly(stmts, return_toplevel=False) # Run relevance filter stmts = self._relevance_filter(stmts, filters) # Save Statements self.assembled_stmts = stmts
[docs] def load_prior(self, prior_fname): """Load a set of prior statements from a pickle file. The prior statements have a special key in the stmts dictionary called "prior". Parameters ---------- prior_fname : str The name of the pickle file containing the prior Statements. """ self.stmts['prior'] = ac.load_statements(prior_fname)
[docs] def get_model_agents(self): """Return a list of all Agents from all Statements. Returns ------- agents : list[indra.statements.Agent] A list of Agents that are in the model. """ model_stmts = self.get_statements() agents = [] for stmt in model_stmts: for a in stmt.agent_list(): if a is not None: agents.append(a) return agents
[docs] def get_statements(self): """Return a list of all Statements in a single list. Returns ------- stmts : list[indra.statements.Statement] A list of all the INDRA Statements in the model. """ stmt_lists = [v for k, v in self.stmts.items()] stmts = [] for s in stmt_lists: stmts += s return stmts
[docs] def get_statements_noprior(self): """Return a list of all non-prior Statements in a single list. Returns ------- stmts : list[indra.statements.Statement] A list of all the INDRA Statements in the model (excluding the prior). """ stmt_lists = [v for k, v in self.stmts.items() if k != 'prior'] stmts = [] for s in stmt_lists: stmts += s return stmts
[docs] def get_statements_prior(self): """Return a list of all prior Statements in a single list. Returns ------- stmts : list[indra.statements.Statement] A list of all the INDRA Statements in the prior. """ if self.stmts.get('prior') is not None: return self.stmts['prior'] return []
def _get_agent_comp(agent): # FIXME: temporarily returning dummy component return agent.name def get_gene_agents(gene_names): agents = [] for gn in gene_names: hgnc_id = hgnc_client.get_hgnc_id(gn) if not hgnc_id: logger.warning('Invalid HGNC gene symbol: %s' % gn) continue db_refs = {'HGNC': hgnc_id} up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: db_refs['UP'] = up_id agent = Agent(gn, db_refs=db_refs) agents.append(agent) return agents def _ref_agents_all_filter(stmts_in, ref_agents): # If there is no reference, keep everything by default if not ref_agents: return stmts_in stmts_out = [] # Preprocess reference Agents: make a list of entity hierarchy components # that appear in the reference and also a list of reference Agent names ref_agent_names = set() ref_components = set() for a in ref_agents: comp_id = _get_agent_comp(a) if comp_id is not None: ref_components.add(comp_id) ref_agent_names.add(a.name) # Iterate over every Statement and check if any of its Agents are either # in a component appearing in the reference, or match one of the # reference Agents that isn't in any of the components. for st in stmts_in: agents = [a for a in st.agent_list() if a is not None] found_all = True for st_agent in agents: found = False comp_id = _get_agent_comp(st_agent) if comp_id is None: for ref_agent_name in ref_agent_names: if st_agent.name == ref_agent_name: found = True elif comp_id in ref_components: found = True if not found: found_all = False break if found_all: stmts_out.append(st) return stmts_out def _ref_agents_one_filter(stmts_in, ref_agents): # If there is no reference, keep everything by default if not ref_agents: return stmts_in stmts_out = [] # Preprocess reference Agents: make a list of entity hierarchy components # that appear in the reference and also a list of reference Agent names ref_agent_names = set() ref_components = set() for a in ref_agents: comp_id = _get_agent_comp(a) if comp_id is not None: ref_components.add(comp_id) ref_agent_names.add(a.name) # Iterate over every Statement and check if any of its Agents are either # in a component appearing in the reference, or match one of the # reference Agents that isn't in any of the components. for st in stmts_in: agents = [a for a in st.agent_list() if a is not None] found = False for st_agent in agents: comp_id = _get_agent_comp(st_agent) if comp_id is None: for ref_agent_name in ref_agent_names: if st_agent.name == ref_agent_name: found = True break elif comp_id in ref_components: found = True break if found: stmts_out.append(st) return stmts_out def _agent_related(a1, a2): if a1.matches(a2) or a1.isa(a2, bio_ontology) or \ a2.isa(a1, bio_ontology): return True return False