# Source code for indra.preassembler.refinement

"""This module implements classes and functions that are used for
finding refinements between INDRA Statements as part of the
knowledge-assembly process. These are imported by the preassembler
module."""
# Public names exported by a star-import of this module.
__all__ = ['get_agent_key', 'get_relevant_keys', 'RefinementFilter',
           'RefinementConfirmationFilter', 'OntologyRefinementFilter',
           'SplitGroupFilter', 'default_refinement_fun']

import time
import logging
import collections
from indra.statements import Event
from indra.statements import stmt_type as indra_stmt_type

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# TODO: we could make the agent key function parameterizable with the
# preassembler to allow custom agent mappings to the ontology.
def get_agent_key(agent):
    """Return a key for an Agent for use in refinement finding.

    Parameters
    ----------
    agent : indra.statements.Agent or None
        An INDRA Agent whose key should be returned. If an Event is
        given, its ``concept`` attribute is used in its place.

    Returns
    -------
    tuple or None
        The key that maps the given agent to the ontology, with special
        handling for ungrounded and None Agents: None is returned for a
        None agent, and a ``('NAME', <agent name>)`` tuple is returned if
        the first element of the agent's grounding is empty.
    """
    # Events are keyed through the concept they wrap
    if isinstance(agent, Event):
        agent = agent.concept
    if agent is None:
        return None
    agent_key = agent.get_grounding()
    # Without a grounding namespace, fall back to a name-based key so the
    # agent can still be matched against agents with the same name
    if not agent_key[0]:
        agent_key = ('NAME', agent.name)
    return agent_key
def get_relevant_keys(agent_key, all_keys_for_role, ontology, direction):
    """Return relevant agent keys for an agent key for refinement finding.

    Parameters
    ----------
    agent_key : tuple or None
        An agent key of interest.
    all_keys_for_role : set
        The set of all agent keys in a given statement corpus with a role
        matching that of the given agent_key.
    ontology : indra.ontology.IndraOntology
        An IndraOntology instance with respect to which relevant other
        agent keys are found for the purposes of refinement.
    direction : str
        The direction in which to find relevant agents. With
        'less_specific' the ontology's parents of the key are looked up,
        with any other value (intended: 'more_specific') its children.

    Returns
    -------
    set
        The subset of all_keys_for_role consisting of None, the agent key
        itself and the keys related to it in the given direction.
    """
    # Pick the ontology lookup matching the requested direction
    if direction == 'less_specific':
        find_related = ontology.get_parents
    else:
        find_related = ontology.get_children

    # None and the key itself are always candidates
    candidates = {None, agent_key}
    if agent_key is not None:
        candidates |= set(find_related(*agent_key))

    # Only keys that actually occur for this role are of interest
    return candidates & all_keys_for_role
class RefinementFilter:
    """A filter which is applied to one or more statements to eliminate
    candidate refinements that are not possible according to some
    criteria. By applying a series of such filters, the preassembler can
    avoid doing n-by-n comparisons to determine refinements among n
    statements.

    The filter class can take any number of constructor arguments that it
    needs to perform its task. The base class' constructor initializes
    a shared_data attribute as an empty dict.

    It also needs to implement an initialize function which is called
    with a stmts_by_hash argument, containing a dict of statements keyed by
    hash. This function can build any data structures that may be needed
    to efficiently apply the filter later. It can store any such
    data structures in the shared_data dict to be accessed by other
    functions later.

    Finally, the class needs to implement a get_related function, which
    takes a single INDRA Statement as input to return the hashes of
    potentially related other statements that the filter was initialized
    with. The function also needs to take a possibly_related argument
    which is either None (no other filter was run before) or a set,
    which is the superset of possible relations as determined by some
    other previously applied filter.
    """
    def __init__(self):
        self.shared_data = {}

    def initialize(self, stmts_by_hash):
        """Initialize the filter class with a set of statements.

        The filter can build up some useful data structures in this
        function before being applied to any specific statements.

        Parameters
        ----------
        stmts_by_hash : dict[int, indra.statements.Statement]
            A dict of statements keyed by their hashes.
        """
        # The dict is stored by reference (not copied), so a later call
        # to extend() also updates the dict object passed in here.
        self.shared_data['stmts_by_hash'] = stmts_by_hash

    def get_related(self, stmt, possibly_related=None,
                    direction='less_specific'):
        """Return a set of hashes of statements that are potentially
        related to the given statement.

        This base implementation only defines the interface used by
        get_more_specifics and get_less_specifics; subclasses have to
        override it.

        Parameters
        ----------
        stmt : indra.statements.Statement
            The INDRA statement whose potential relations are sought.
        possibly_related : set or None
            Either None (no other filter was run before) or the superset
            of possible relations determined by a previous filter.
        direction : str
            Either 'more_specific' or 'less_specific'.

        Raises
        ------
        NotImplementedError
            Always, unless overridden by a subclass.
        """
        raise NotImplementedError('The filter class has to implement a '
                                  'get_related method.')

    def get_more_specifics(self, stmt, possibly_related=None):
        """Return a set of hashes of statements that are potentially
        related and more specific than the given statement."""
        return self.get_related(stmt, possibly_related=possibly_related,
                                direction='more_specific')

    def get_less_specifics(self, stmt, possibly_related=None):
        """Return a set of hashes of statements that are potentially
        related and less specific than the given statement."""
        return self.get_related(stmt, possibly_related=possibly_related,
                                direction='less_specific')

    def extend(self, stmts_by_hash):
        """Extend the initial data structures with a set of new statements.

        Parameters
        ----------
        stmts_by_hash : dict[int, indra.statements.Statement]
            A dict of statements keyed by their hashes.
        """
        # We can assume that these stmts_by_hash are unique
        self.shared_data['stmts_by_hash'].update(stmts_by_hash)
class OntologyRefinementFilter(RefinementFilter):
    """This filter uses an ontology to position statements and their
    agents to filter down significantly on the set of possible relations
    for a given statement.

    Parameters
    ----------
    ontology : indra.ontology.OntologyGraph
        An INDRA ontology graph.
    """
    def __init__(self, ontology):
        super().__init__()
        self.ontology = ontology

    def initialize(self, stmts_by_hash):
        """Reset the stored statements and index the given ones.

        Parameters
        ----------
        stmts_by_hash : dict[int, indra.statements.Statement]
            A dict of statements keyed by their hashes.
        """
        # Start from a fresh dict and let extend() do all the indexing
        self.shared_data['stmts_by_hash'] = {}
        self.extend(stmts_by_hash)

    def extend(self, stmts_by_hash):
        """Add new statements to the per-statement-type lookup tables.

        For each statement type, shared_data holds three entries:
        'agent_key_to_hash' and 'hash_to_agent_key' (both keyed by agent
        role first) and 'all_keys_by_role'.

        Parameters
        ----------
        stmts_by_hash : dict[int, indra.statements.Statement]
            A dict of statements keyed by their hashes.
        """
        self.shared_data['stmts_by_hash'].update(stmts_by_hash)

        # Group the hashes of the new statements by statement type
        hashes_by_type = {}
        for stmt_hash, stmt in stmts_by_hash.items():
            hashes_by_type.setdefault(indra_stmt_type(stmt),
                                      set()).add(stmt_hash)

        for stmt_type, type_hashes in hashes_by_type.items():
            # The agent roles are read off an arbitrary statement of
            # this type
            # noinspection PyProtectedMember
            roles = stmts_by_hash[next(iter(type_hashes))]._agent_order

            # Step 1. Create the lookup tables the first time a statement
            # type is encountered
            if stmt_type not in self.shared_data:
                self.shared_data[stmt_type] = {
                    # Mapping agent keys to statement hashes
                    'agent_key_to_hash':
                        {role: collections.defaultdict(set)
                         for role in roles},
                    # Mapping statement hashes to agent keys
                    'hash_to_agent_key':
                        {role: collections.defaultdict(set)
                         for role in roles},
                    # All agent keys for a given agent role
                    'all_keys_by_role': {},
                }
            type_data = self.shared_data[stmt_type]
            key_to_hash = type_data['agent_key_to_hash']
            hash_to_key = type_data['hash_to_agent_key']

            # Step 2. Register each statement under all of its agent keys,
            # separately for each role
            for stmt_hash in type_hashes:
                stmt = stmts_by_hash[stmt_hash]
                for role in roles:
                    for agent_key in \
                            self._agent_keys_for_stmt_role(stmt, role):
                        key_to_hash[role][agent_key].add(stmt_hash)
                        hash_to_key[role][stmt_hash].add(agent_key)

            # Step 3. Refresh the set of all known agent keys per role
            for role in roles:
                type_data['all_keys_by_role'][role] = \
                    set(key_to_hash[role])

    @staticmethod
    def _agent_keys_for_stmt_role(stmt, role):
        """Return a set of agent keys for a statement's agent in a role.

        The agent key is an "anchor" to the ontology being used and
        positions a statement, via its agent in this role, against other
        statements it may be related to.
        """
        agents = getattr(stmt, role)
        # A single (non-list) agent role yields exactly one key
        if not isinstance(agents, list):
            return {get_agent_key(agents)}
        # Special case: an empty list-like agent role. Anything else is
        # considered a refinement of it, hence the None key
        if not agents:
            return {None}
        # Otherwise take the keys of all the agents in the list
        return {get_agent_key(agent) for agent in agents}
class RefinementConfirmationFilter(RefinementFilter):
    """This class runs the refinement function between potentially
    related statements to confirm whether they are indeed, conclusively
    in a refinement relationship with each other.

    In this sense, this isn't a real filter, though implementing it
    as one is convenient. This filter is meant to be used as the final
    component in a series of pre-filters.

    Parameters
    ----------
    ontology : object
        The ontology stored on the filter as ``ontology``.
        NOTE(review): presumably an INDRA ontology that is passed on to
        the refinement function - confirm against the callers.
    refinement_fun : function or None
        The function used to decide refinement. If None (or otherwise
        falsy), default_refinement_fun is used.
    """
    def __init__(self, ontology, refinement_fun=None):
        # Let the base class set up shared_data, consistently with the
        # other filter classes, instead of re-creating it by hand
        super().__init__()
        self.ontology = ontology
        self.refinement_fun = refinement_fun if refinement_fun else \
            default_refinement_fun
        # Counter starting at zero; presumably tracks how many pairwise
        # comparisons have been run - TODO confirm
        self.comparison_counter = 0
class SplitGroupFilter(RefinementFilter):
    """Filter that partitions statements into two groups.

    Refinement relationships are only considered between the two groups
    but not within either of them.

    Parameters
    ----------
    split_groups : object
        The grouping information, stored unchanged on the filter as
        ``split_groups``.
        NOTE(review): the expected structure is not visible here -
        confirm against the code that consumes it.
    """

    def __init__(self, split_groups):
        super().__init__()
        self.split_groups = split_groups
def default_refinement_fun(st1, st2, ontology, entities_refined):
    """Return the result of asking st1 whether it refines st2.

    This simply delegates to ``st1.refinement_of`` with the remaining
    arguments passed through positionally and returns its result.
    """
    is_refinement = st1.refinement_of(st2, ontology, entities_refined)
    return is_refinement