Source code for indra.ontology.standardize

# Names exported as the public API of this module (e.g., via a star import).
# Helpers such as prioritize and the underscore-prefixed functions are
# intentionally not listed here.
__all__ = ['standardize_agent_name', 'standardize_db_refs', 'get_standard_name',
           'standardize_name_db_refs', 'get_standard_agent']

import logging
from copy import deepcopy
from collections import defaultdict
from indra.statements.agent import default_ns_order, get_grounding, Agent
from indra.statements.validate import assert_valid_db_refs

# Module-level logger named after this module
logger = logging.getLogger(__name__)


# Maps each namespace in default_ns_order to its position in that list.
# prioritize() falls back on this dict when no custom ns_order is given and
# treats a smaller index as a higher priority.
default_ns_priorities = {ns: idx for idx, ns in enumerate(default_ns_order)}


def prioritize(ns1, ns2, ns_order=None):
    """Return True if ns2 takes priority over ns1.

    Parameters
    ----------
    ns1 : str
        The name space that is the candidate for being overridden.
    ns2 : str
        The name space that is checked for having higher priority.
    ns_order : Optional[list]
        A list of namespaces which are in order of priority with higher
        priority namespaces appearing earlier in the list. If not given,
        the module-level default priorities are used.

    Returns
    -------
    bool
        True if ns2 is in the priority order and ns1 is either not in it
        or appears later in it than ns2, False otherwise.
    """
    if ns_order is None:
        ranks = default_ns_priorities
    else:
        ranks = {ns: idx for idx, ns in enumerate(ns_order)}
    rank1 = ranks.get(ns1)
    rank2 = ranks.get(ns2)
    # A name space that isn't ranked can never take priority
    if rank2 is None:
        return False
    # A ranked name space beats an unranked one, otherwise the one
    # with the smaller index wins
    return rank1 is None or rank2 < rank1


def _get_mappings_dict(mappings):
    md = defaultdict(list)
    for db_ns, db_id in mappings:
        md[db_ns].append(db_id)
    return md


def get_standard_agent(name, db_refs, ontology=None, ns_order=None, **kwargs):
    """Get a standard agent based on the name, db_refs, and any other kwargs.

    Note that the db_refs dict passed in is standardized in place, i.e.,
    the caller's dict is modified and becomes the new Agent's db_refs.

    Parameters
    ----------
    name : str
        The name of the agent that may not be standardized.
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.
    ontology : Optional[indra.ontology.IndraOntology]
        An IndraOntology object, if not provided, the default BioOntology
        is used.
    ns_order : Optional[list]
        A list of namespaces which are in order of priority with higher
        priority namespaces appearing earlier in the list.
    kwargs :
        Keyword arguments to pass to :func:`Agent.__init__`.

    Returns
    -------
    Agent
        A standard agent
    """
    standard_name, db_refs = standardize_name_db_refs(db_refs,
                                                      ontology=ontology,
                                                      ns_order=ns_order)
    # We only override the given name if a standard name could be found
    if standard_name:
        name = standard_name
    # Validate the standardized db_refs before constructing the Agent
    assert_valid_db_refs(db_refs)
    return Agent(name, db_refs=db_refs, **kwargs)


def standardize_db_refs(db_refs, ontology=None, ns_order=None):
    """Return a standardized db refs dict for a given db refs dict.

    Note that the db_refs dict passed in is modified in place and the
    same dict object is returned.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.
    ontology : Optional[indra.ontology.IndraOntology]
        An IndraOntology object, if not provided, the default BioOntology
        is used.
    ns_order : Optional[list]
        A list of namespaces which are in order of priority with higher
        priority namespaces appearing earlier in the list.

    Returns
    -------
    dict
        The db_refs dict with standardized entries.
    """
    if ontology is None:
        from indra.ontology.bio import bio_ontology
        ontology = bio_ontology

    # We iterate over all the db_refs entries that currently exist. Since
    # db_refs is modified inside the loop, we iterate over a snapshot of
    # its items; a shallow snapshot suffices because values are only
    # re-assigned and never mutated.
    for source_db_ns, source_db_id in list(db_refs.items()):
        source_db_id = _preprocess_for_mapping(source_db_ns, source_db_id)
        # If there is a replacement for this entry, we apply it
        replacement = ontology.get_replacement(source_db_ns, source_db_id)
        if replacement:
            source_db_ns, source_db_id = replacement
            db_refs[source_db_ns] = source_db_id
        # For the entry we get all its xref mappings as a list
        # of tuples and turn it into a dict keyed by namespace
        mappings = _get_mappings_dict(
            ontology.get_mappings(source_db_ns, source_db_id))
        # We iterate over these mappings and check if they should
        # be applied
        for mapped_db_ns, mapped_db_ids in mappings.items():
            # If the db_refs doesn't yet contain a mapping for this
            # name space then we always add this mapping. If there
            # is already an entry for this name space then
            # we overwrite it if the source name space is higher
            # priority than the name space being mapped to.
            if mapped_db_ns not in db_refs or \
                    prioritize(mapped_db_ns, source_db_ns,
                               ns_order=ns_order):
                # If there are multiple mapped IDs, we deterministically
                # choose the one that sorts first
                db_refs[mapped_db_ns] = min(mapped_db_ids)
    return db_refs


def _preprocess_for_mapping(db_ns, db_id):
    if db_ns == 'UP' and db_id is not None and '-' in db_id:
        return db_id.split('-')[0]
    return db_id


def standardize_name_db_refs(db_refs, ontology=None, ns_order=None):
    """Return a standardized name and db refs dict for a given db refs dict.

    Note that the db_refs dict passed in is standardized in place and the
    same dict object is returned as the second element of the result.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.
    ontology : Optional[indra.ontology.IndraOntology]
        An IndraOntology object, if not provided, the default BioOntology
        is used.
    ns_order : Optional[list]
        A list of namespaces which are in order of priority with higher
        priority namespaces appearing earlier in the list.

    Returns
    -------
    str or None
        The standard name based on the db refs, None if not available.
    dict
        The db_refs dict with standardized entries.
    """
    # The db_refs are standardized first so that the name is chosen based
    # on the complete set of standardized groundings
    db_refs = standardize_db_refs(db_refs, ontology=ontology,
                                  ns_order=ns_order)
    name = get_standard_name(db_refs, ontology=ontology, ns_order=ns_order)
    return name, db_refs


def get_standard_name(db_refs, ontology=None, ns_order=None):
    """Return a standardized name for a given db refs dict.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.
    ontology : Optional[indra.ontology.IndraOntology]
        An IndraOntology object, if not provided, the default BioOntology
        is used.
    ns_order : Optional[list]
        A list of namespaces which are in order of priority with higher
        priority namespaces appearing earlier in the list.

    Returns
    -------
    str or None
        The standard name based on the db refs, None if not available.
    """
    if ontology is None:
        from indra.ontology.bio import bio_ontology
        ontology = bio_ontology

    # We first look for the prioritized grounding
    db_ns, db_id = get_grounding(db_refs, ns_order=ns_order)

    # If there's no grounding then we can't do more to standardize the
    # name and return
    if not db_ns or not db_id:
        return None

    # If there is grounding available, we can try to get the standardized name
    standard_name = ontology.get_name(db_ns, db_id)
    # Handle special case with UPPRO, if we can't get a feature name
    # we fall back on regular gene/protein naming
    if not standard_name and db_ns == 'UPPRO':
        db_ns, db_id = get_grounding(db_refs, ns_order=['HGNC', 'UP'])
        if not db_ns or not db_id:
            return None
        standard_name = ontology.get_name(db_ns, db_id)

    # In the rare case that the ontology has no (non-empty) name for the
    # grounding, we consistently return None
    return standard_name or None


def standardize_agent_name(agent, standardize_refs=True, ontology=None,
                           ns_order=None):
    """Standardize the name of an Agent based on grounding information.

    The priority of which namespace is used as the basis for the
    standard name depends on the ns_order argument if it is given;
    otherwise the default order applied by the grounding lookup is used.

    The Agent is modified in place: its name is set if a standard name is
    found, and its db_refs are standardized if standardize_refs is True.

    Parameters
    ----------
    agent : indra.statements.Agent
        An INDRA Agent whose name attribute should be standardized based
        on grounding information.
    standardize_refs : Optional[bool]
        If True, this function assumes that the Agent's db_refs need to
        be standardized, e.g., HGNC mapped to UP.
        Default: True
    ontology : Optional[indra.ontology.IndraOntology]
        An IndraOntology object, if not provided, the default BioOntology
        is used.
    ns_order : Optional[list]
        A list of namespaces which are in order of priority with higher
        priority namespaces appearing earlier in the list.

    Returns
    -------
    bool
        True if a new name was set, False otherwise.
    """
    # If the Agent is None, we return immediately
    if agent is None:
        return False
    # If we want to standardize the Agent's db_refs, we call this now.
    # The ns_order is passed along so that db_refs standardization and
    # name selection use the same namespace priorities.
    if standardize_refs:
        agent.db_refs = standardize_db_refs(agent.db_refs, ontology=ontology,
                                            ns_order=ns_order)
    # We next try to get a standard name based on the Agent's grounding
    standard_name = get_standard_name(agent.db_refs, ontology=ontology,
                                      ns_order=ns_order)
    # If we got a proper standard name, we apply it
    if standard_name:
        agent.name = standard_name
        return True
    return False