Source code for indra.sources.minerva.processor

import logging
from indra.ontology.standardize import get_standard_name
from indra.ontology.bio import bio_ontology
from indra.statements import *
from .minerva_client import get_ids_to_refs, default_map_name
from .id_mapping import indra_db_refs_from_minerva_refs


logger = logging.getLogger(__name__)


class SifProcessor:
    """Processor that extracts INDRA Statements from SIF strings.

    Parameters
    ----------
    model_id_to_sif_strs : dict
        A dictionary mapping a model ID (int) to a list of strings in SIF
        format. Example: {799: ['csa2 POSITIVE sa9', 'csa11 NEGATIVE sa30']}
    map_name : str
        A name of a disease map to process.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements extracted from the SIF strings.
    """
    def __init__(self, model_id_to_sif_strs, map_name=default_map_name):
        self.model_id_to_sif_strs = model_id_to_sif_strs
        self.map_name = map_name
        self.statements = []

    def extract_statements(self):
        """Process every model and append the results to ``statements``."""
        for model_id, sif_strs in self.model_id_to_sif_strs.items():
            self.statements += self.process_model(model_id, sif_strs)
        logger.info('Got %d total statements from %d models',
                    len(self.statements), len(self.model_id_to_sif_strs))

    def process_model(self, model_id, sif_strs):
        """Return the list of Statements extracted from one model.

        Parameters
        ----------
        model_id : int
            ID of the model the SIF strings belong to.
        sif_strs : list[str]
            SIF formatted strings of the model.

        Returns
        -------
        list[indra.statements.Statement]
            One Statement per SIF string that is not blank or a comment.
        """
        logger.info('Processing model %d', model_id)
        # Element references are looked up once per model since the same
        # element ID can refer to different entities in different models.
        ids_to_refs, complex_members = get_ids_to_refs(model_id, self.map_name)
        stmts = []
        for sif_str in sif_strs:
            stmt = self.get_stmt(sif_str, ids_to_refs, complex_members,
                                 model_id)
            if stmt:
                stmts.append(stmt)
        logger.info('Got %d statements from model %d', len(stmts), model_id)
        return stmts

    def get_stmt(self, sif_str, ids_to_refs, complex_members, model_id):
        """Return a Statement for a single SIF string.

        Parameters
        ----------
        sif_str : str
            A string of the form '<subject ID> <relation> <object ID>' where
            the relation is POSITIVE or NEGATIVE.
        ids_to_refs : dict
            A dictionary mapping element IDs to MINERVA provided references.
        complex_members : dict
            A dictionary mapping element ID of a complex element to element
            IDs of its members.
        model_id : int
            ID of the model, recorded in the evidence annotations.

        Returns
        -------
        indra.statements.Statement or None
            An Activation or Inhibition, or None if the string is blank or
            a comment (starts with '#').

        Raises
        ------
        ValueError
            If the string does not have exactly three fields or the relation
            type is not recognized.
        """
        # Strip before testing so that whitespace-only lines (e.g. a bare
        # newline read from a file) are skipped instead of failing below.
        clean_str = sif_str.strip()
        if not clean_str or clean_str.startswith('#'):
            return None
        # Split on any run of whitespace to tolerate tabs / repeated spaces
        parts = clean_str.split()
        if len(parts) != 3:
            raise ValueError('Malformed SIF string: %r' % sif_str)
        subj_id, rel_type, obj_id = parts
        subj = get_agent(subj_id, ids_to_refs, complex_members)
        obj = get_agent(obj_id, ids_to_refs, complex_members)
        if rel_type == 'POSITIVE':
            stmt = Activation(subj, obj)
        elif rel_type == 'NEGATIVE':
            stmt = Inhibition(subj, obj)
        else:
            raise ValueError('Unknown relation type: %s' % rel_type)
        # The raw (unstripped) string is kept in the annotations
        evid = Evidence(source_api='minerva',
                        annotations={'sif_str': sif_str,
                                     'minerva_model_id': model_id})
        stmt.evidence = [evid]
        return stmt


def get_agent(element_id, ids_to_refs, complex_members):
    """Get an agent for a MINERVA element.

    Parameters
    ----------
    element_id : str
        ID of an element used in MINERVA API and raw SIF files.
    ids_to_refs : dict
        A dictionary mapping element IDs to MINERVA provided references. Note
        that this mapping is unique per model (same IDs can be mapped to
        different refs in different models).
    complex_members : dict
        A dictionary mapping element ID of a complex element to element IDs of
        its members.

    Returns
    -------
    agent : indra.statements.agent.Agent or None
        INDRA agent created from given refs, or None if no agent could be
        constructed from them.
    """
    # Get references from MINERVA and filter to accepted namespaces
    exclude_ns = {'WIKIPATHWAYS', 'PUBMED', 'HGNC_SYMBOL', 'INTACT', 'PDB',
                  'DOI'}
    # NOTE(review): refs is None for an unknown element ID; this assumes
    # indra_db_refs_from_minerva_refs tolerates that - confirm.
    refs = ids_to_refs.get(element_id)
    db_refs = indra_db_refs_from_minerva_refs(refs)
    filtered_refs = {db_ns: db_id for (db_ns, db_id) in db_refs.items()
                     if db_ns not in exclude_ns}

    # Individual agents, or complexes with complex level grounding
    # (e.g. from GO, MESH, UNIPROT), are built directly from their refs.
    # A complex with a single remaining ref is treated as lacking complex
    # level grounding.
    if element_id not in complex_members or len(filtered_refs) != 1:
        return get_agent_from_refs(filtered_refs)

    member_ids = complex_members[element_id]
    members = [get_agent(member_id, ids_to_refs, complex_members)
               for member_id in member_ids]
    # Members for which no agent could be built come back as None; drop them
    # so that sorting by name below cannot fail.
    # Sort to always have the same main agent
    agents = sorted((ag for ag in members if ag is not None),
                    key=lambda ag: ag.name)
    # Without any usable member fall back to the complex's own refs
    if not agents:
        return get_agent_from_refs(filtered_refs)

    # Try to get a FamPlex family, but only if every member was resolved:
    # with a member missing, "all family members present" could match by
    # accident.
    fam = get_family(agents) if len(agents) == len(members) else None
    if fam:
        # Combine TEXT from MINERVA and found FPLX ID
        filtered_refs['FPLX'] = fam
        return get_agent_from_refs(filtered_refs)

    # Otherwise treat a list of agents as an agent with bound conditions
    main_agent = agents[0]
    for ag in agents[1:]:
        main_agent.bound_conditions.append(BoundCondition(ag))
    return main_agent


def get_family(agents):
    """Get a FamPlex family if all of its members are given.

    Parameters
    ----------
    agents : list
        Agents whose groundings are compared against family members. Each
        must provide a ``get_grounding()`` method returning a
        (namespace, ID) pair.

    Returns
    -------
    str or None
        The FPLX ID of a family that is a parent of every agent and whose
        children are exactly the given agents' groundings, or None if there
        is no such family (including when ``agents`` is empty).
    """
    # Nothing to intersect for an empty member list
    if not agents:
        return None
    ag_groundings = [ag.get_grounding() for ag in agents]
    # For each agent, the set of its parents that are FamPlex entries
    family_sets = [
        {p for p in bio_ontology.get_parents(*gr) if p[0] == 'FPLX'}
        for gr in ag_groundings
    ]
    common_families = set.intersection(*family_sets)
    member_groundings = set(ag_groundings)
    for fam in common_families:
        children = bio_ontology.get_children(*fam)
        # Check if all family members are present
        if set(children) == member_groundings:
            return fam[1]
    return None


def get_agent_from_refs(db_refs):
    """Get an agent given its db_refs.

    Parameters
    ----------
    db_refs : dict
        A dictionary of namespace to ID references for the agent.

    Returns
    -------
    indra.statements.agent.Agent or None
        An Agent named by the standard name of the refs, falling back to
        their TEXT entry; None if no name is available or db_refs is empty.
    """
    # Prefer the standardized name, use the raw TEXT entry otherwise
    agent_name = get_standard_name(db_refs) or db_refs.get('TEXT')
    if not agent_name or not db_refs:
        return None
    return Agent(agent_name, db_refs=db_refs)