# Source code for indra.sources.phosphoelm.processor

import logging
from tqdm import tqdm

import gilda
from indra.databases import uniprot_client, hgnc_client
from indra.statements.validate import validate_text_refs
from indra.statements import Phosphorylation, Evidence, Agent

from .phosphoelm_mapping import phosphoelm_mapping

logger = logging.getLogger(__name__)


class PhosphoElmProcessor(object):
    """Processes data dumps from the phospho.ELM database.

    See http://phospho.elm.eu.org/dataset.html

    Parameters
    ----------
    phosphoelm_data : list[dict]
        JSON compatible list of entries from a phospho.ELM data dump

    Attributes
    ----------
    statements : list[indra.statements.Phosphorylation]
        A list of the phosphorylation statements produced by the entries
        in phosphoelm_data
    """
    def __init__(self, phosphoelm_data):
        self.statements = []
        self._phosphoelm_data = phosphoelm_data

    def process_phosphorylations(self, skip_empty=True):
        """Create Phosphorylation statements from phosphoelm_data

        Each human entry is turned into one Phosphorylation statement that
        is appended to ``self.statements``. Entries whose substrate cannot
        be turned into an Agent are skipped, since a statement without a
        substrate carries no usable site information.

        Parameters
        ----------
        skip_empty : bool
            Default: True. If False, also create statements when upstream
            kinases in entry['kinases'] are not known. Such statements
            have ``enz=None``.
        """
        logger.info("Processing Phospho.ELM phosphorylations")
        for entry in tqdm(self._phosphoelm_data):
            # Entries:
            # 'acc': '<UP ID>', <-- substrate
            # 'sequence': '<protein sequence>',
            # 'position': '<sequence position>',
            # 'code': '<phosphorylated residue>',
            # 'pmids': '<pmid>',
            # 'kinases': '<responsible kinase>', <-- enzyme
            # 'source': 'HTP|LTP',
            # 'species': '<species name in latin>',
            # 'entry_date': 'yyyy-mm-dd HH:MM:SS.mmmmmm'

            # Only human entries are processed
            if entry['species'].lower() != 'homo sapiens':
                continue

            # A missing 'kinases' key is treated the same as an empty value
            kinase_txt = entry.get('kinases')
            if skip_empty and not kinase_txt:
                continue

            substrate = _agent_from_id(entry['acc'])
            if substrate is None:
                # The substrate ID could not be resolved to an Agent
                logger.debug("Skipping entry with unresolvable substrate "
                             "ID %s", entry['acc'])
                continue

            # Only attempt grounding when there is a kinase name to ground;
            # otherwise the enzyme is simply unknown.
            enzyme = _agent_from_str(kinase_txt) if kinase_txt else None

            # Skip if enz is None instead of an Agent (only when we skip
            # empty kinase entries)
            if skip_empty and enzyme is None:
                continue

            # Drop PMIDs that do not pass text refs validation
            pmid = entry['pmids']
            if not validate_text_refs({'PMID': pmid}):
                pmid = None

            # Build evidence, add statement
            evidence = Evidence(
                source_api='phosphoelm',
                pmid=pmid,
                annotations={
                    'data_source': entry.get('source'),
                    'phosphoelm_substrate_id': entry['acc'],
                    'phosphoelm_kinase_name': kinase_txt,
                    'entry_date': entry['entry_date'],
                    'sequence': entry['sequence']
                }
            )
            self.statements.append(Phosphorylation(
                enz=enzyme,
                sub=substrate,
                residue=entry['code'],
                position=entry['position'],
                evidence=evidence)
            )


def _agent_from_id(db_id):
    """Return an Agent for a substrate identifier, or None.

    Parameters
    ----------
    db_id : str
        An identifier starting with 'ENSP' is kept as an ENSEMBL
        reference; any other identifier is treated as a UniProt ID.

    Returns
    -------
    indra.statements.Agent or None
        The Agent built from the identifier, or None if no gene name
        could be found for a UniProt ID.
    """
    # There are some Ensembl protein IDs which we currently can't normalize
    # to anything else (unlike ENSG), so the ID doubles as the name.
    if db_id.startswith('ENSP'):
        return Agent(db_id, db_refs={'ENSEMBL': db_id})

    # All other entries are UniProt IDs
    gene_name = uniprot_client.get_gene_name(db_id)
    if not gene_name:
        return None

    # An ID containing a dash is stored both as its base ID and as an isoform
    base_id, dash, _ = db_id.partition('-')
    if dash:
        refs = {'UP': base_id, 'UPISO': db_id}
    else:
        refs = {'UP': db_id}

    hgnc_id = uniprot_client.get_hgnc_id(db_id)
    if hgnc_id:
        refs['HGNC'] = hgnc_id
    return Agent(gene_name, db_refs=refs)


def _agent_from_str(txt):
    """Return a grounded Agent from the name of an entity.

    Parameters
    ----------
    txt : str
        A string representing an entity

    Returns
    -------
    ag : indra.statements.Agent
        A grounded INDRA Agent corresponding to the provided entity text.
    """
    # Check if kinase is in the hard coded mapping
    if txt in phosphoelm_mapping:
        name = txt
        ns, _id = phosphoelm_mapping[txt]
        # If None is hard coded in the map it means we should skip this
        if ns is None:
            return None
    else:
        term = _gilda_grounder(txt)

        if term is None:
            name = txt
            ns, _id = None, None
        else:
            name = term['entry_name']
            ns, _id = term['db'], term['id']

    db_refs = {'TEXT': txt}
    if ns is not None and _id is not None:
        db_refs[ns] = _id
    if ns == 'HGNC':
        up_id = hgnc_client.get_uniprot_id(_id)
        if up_id:
            db_refs['UP'] = up_id
        name = hgnc_client.get_hgnc_name(_id)
    elif ns == 'FPLX':
        name = _id
    return Agent(name, db_refs=db_refs)


def _gilda_grounder(txt):
    """Ground an entity name with Gilda and return the top term as JSON.

    Parameters
    ----------
    txt : str
        The raw entity name to ground.

    Returns
    -------
    dict or None
        The result of ``to_json()`` on the term of the first Gilda match,
        or None if Gilda returned no matches.
    """
    # Pre-process text for grounding: drop the "_group" marker, turn
    # underscores into dashes and keep only the part before any slash
    query = txt.replace("_group", "").replace("_", "-")
    query = query.partition("/")[0]

    matches = gilda.ground(query)
    if matches:
        return matches[0].term.to_json()

    logger.warning(f"Gilda grounder returned no results for {query}")
    return None