Source code for indra.sources.hprd.processor

import logging
from numpy import nan
from collections import Counter
from protmapper import uniprot_client, ProtMapper
from indra.statements import *
from indra.databases import hgnc_client
from indra.statements.validate import validate_id


logger = logging.getLogger(__name__)


# Map of HPRD PTM types to INDRA Statement types
_ptm_map = {
    'ADP-Ribosylation': None,
    'Acetylation': Acetylation,
    'Alkylation': None,
    'Amidation': None,
    'Carboxylation': None,
    'Deacetylation': Deacetylation,
    'Deacylation': None,
    'Deglycosylation': Deglycosylation,
    'Demethylation': Demethylation,
    'Dephosphorylation': Dephosphorylation,
    'Disulfide Bridge': None,
    'Farnesylation': Farnesylation,
    'Glucuronosyl transfer': None,
    'Glutathionylation': None,
    'Glycation': None,
    'Glycosyl phosphatidyl inositol GPI': None,
    'Glycosylation': Glycosylation,
    'Hydroxylation': Hydroxylation,
    'Methylation': Methylation,
    'Myristoylation': Myristoylation,
    'Neddylation': None,
    'Nitration': None,
    'Palmitoylation': Palmitoylation,
    'Phosphorylation': Phosphorylation,
    'Prenylation': None,
    'Proteolytic Cleavage': None,
    'S-Nitrosylation': None,
    'Sulfation': None,
    'Sumoylation': Sumoylation,
    'Transglutamination': None,
    'Ubiquitination': Ubiquitination,
}


[docs]class HprdProcessor(object):
    """Get INDRA Statements from HPRD data.

    See documentation for `indra.sources.hprd.api.process_flat_files.`

    Parameters
    ----------
    id_df : pandas.DataFrame
        DataFrame loaded from the HPRD_ID_MAPPINGS.txt file.
    cplx_df :  pandas.DataFrame
        DataFrame  loaded from the PROTEIN_COMPLEXES.txt file.
    ptm_df : pandas.DataFrame
        DataFrame loaded from the POST_TRANSLATIONAL_MODIFICATIONS.txt file.
    ppi_df : pandas.DataFrame
        DataFrame loaded from the BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt file.
    seq_dict : dict
        Dictionary mapping RefSeq IDs to protein sequences, loaded from the
        PROTEIN_SEQUENCES.txt file.
    motif_window : int
        Number of flanking amino acids to include on each side of the
        PTM target residue in the 'site_motif' annotations field of the
        Evidence for Modification Statements. Default is 7.

    Attributes
    ----------
    statements : list of INDRA Statements
        INDRA Statements (Modifications and Complexes) produced from the
        HPRD content.
    id_df : pandas.DataFrame
        DataFrame loaded from HPRD_ID_MAPPINGS.txt file.
    seq_dict : 
        Dictionary mapping RefSeq IDs to protein sequences, loaded from the
        PROTEIN_SEQUENCES.txt file.
    no_hgnc_for_egid: collections.Counter
        Counter listing Entrez gene IDs reference in the HPRD content that
        could not be mapped to a current HGNC ID, along with their frequency.
    no_up_for_hgnc : collections.Counter
        Counter with tuples of form (entrez_id, hgnc_symbol, hgnc_id) where
        the HGNC ID could not be mapped to a Uniprot ID, along with their
        frequency.
    no_up_for_refseq : collections.Counter
        Counter of RefSeq protein IDs that could not be mapped to any Uniprot
        ID, along with frequency.
    many_ups_for_refseq : collections.Counter
        Counter of RefSeq protein IDs that yielded more than one matching
        Uniprot ID. Note that in these cases, the Uniprot ID obtained from
        HGNC is used.
    invalid_site_pos : list of tuples
        List of tuples of form (refseq_id, residue, position) indicating sites
        of post translational modifications where the protein sequences
        provided by HPRD did not contain the given residue at the given
        position.
    off_by_one : list of tuples
        The subset of sites contained in `invalid_site_pos` where the given
        residue can be found at position+1 in the HPRD protein sequence,
        suggesting an off-by-one error due to numbering based on the
        protein with initial methionine cleaved. Note that no mapping is
        performed by the processor.
    motif_window : int
        Number of flanking amino acids to include on each side of the
        PTM target residue in the 'site_motif' annotations field of the
        Evidence for Modification Statements. Default is 7.
    """
    def __init__(self, id_df, cplx_df=None, ptm_df=None, ppi_df=None,
                 seq_dict=None, motif_window=7):
        if cplx_df is None and ptm_df is None and ppi_df is None:
            raise ValueError('At least one of cplx_df, ptm_df, or ppi_df must '
                             'be specified.')
        if ptm_df is not None and not seq_dict:
            raise ValueError('If ptm_df is given, seq_dict must also be given.')

        self.statements = []
        self.id_df = id_df
        self.seq_dict = seq_dict
        self.motif_window = motif_window

        # Keep track of the ID mapping issues encountered
        self.no_hgnc_for_egid = Counter()
        self.no_up_for_hgnc = Counter()
        self.no_up_for_refseq = Counter()
        self.many_ups_for_refseq = Counter()
        self.invalid_site_pos = []
        self.off_by_one = []

        # Do the actual processing
        if cplx_df is not None:
            self.get_complexes(cplx_df)
        if ptm_df is not None:
            self.get_ptms(ptm_df)
        if ppi_df is not None:
            self.get_ppis(ppi_df)

        # Tabulate IDs causing issues
        self.no_hgnc_for_egid = Counter(self.no_hgnc_for_egid)
        self.no_up_for_hgnc = Counter(self.no_up_for_hgnc)
        self.no_up_for_refseq = Counter(self.no_up_for_refseq)
        self.many_ups_for_refseq = Counter(self.many_ups_for_refseq)

        logger.info('For information on problematic entries encountered while '
                    'processing, see HprdProcessor attributes: '
                    'no_hgnc_for_egid, no_up_for_hgnc, no_up_for_refseq, '
                    'many_ups_for_refseq, invalid_site_pos, and off_by_one.')

[docs]    def get_complexes(self, cplx_df):
        """Generate Complex Statements from the HPRD protein complexes data.

        Parameters
        ----------
        cplx_df :  pandas.DataFrame
            DataFrame  loaded from the PROTEIN_COMPLEXES.txt file.
        """
        # Group the agents for the complex
        logger.info('Processing complexes...')
        for cplx_id, this_cplx in cplx_df.groupby('CPLX_ID'):
            agents = []
            for hprd_id in this_cplx.HPRD_ID:
                ag = self._make_agent(hprd_id)
                if ag is not None:
                    agents.append(ag)
            # Make sure we got some agents!
            if not agents or len(agents) < 2:
                continue
            # Get evidence info from first member of complex
            row0 = this_cplx.iloc[0]
            isoform_id = '%s_1' % row0.HPRD_ID
            ev_list = self._get_evidence(row0.HPRD_ID, isoform_id, row0.PMIDS,
                                         row0.EVIDENCE, 'interactions')
            stmt = Complex(agents, evidence=ev_list)
            self.statements.append(stmt)

[docs]    def get_ptms(self, ptm_df):
        """Generate Modification statements from the HPRD PTM data.

        Parameters
        ----------
        ptm_df : pandas.DataFrame
            DataFrame loaded from the POST_TRANSLATIONAL_MODIFICATIONS.txt file.
        """
        logger.info('Processing PTMs...')
        # Iterate over the rows of the dataframe
        for ix, row in ptm_df.iterrows():
            # Check the modification type; if we can't make an INDRA statement
            # for it, then skip it
            ptm_class = _ptm_map[row['MOD_TYPE']]
            if ptm_class is None:
                continue
            # Use the Refseq protein ID for the substrate to make sure that
            # we get the right Uniprot ID for the isoform
            sub_ag = self._make_agent(row['HPRD_ID'],
                                      refseq_id=row['REFSEQ_PROTEIN'])

            # If we couldn't get the substrate, skip the statement
            if sub_ag is None:
                continue
            enz_id = _nan_to_none(row['ENZ_HPRD_ID'])
            enz_ag = self._make_agent(enz_id)
            res = _nan_to_none(row['RESIDUE'])
            pos = _nan_to_none(row['POSITION'])
            if pos is not None and ';' in pos:
                pos, dash = pos.split(';')
                assert dash == '-'
            # As a fallback for later site mapping, we also get the protein
            # sequence information in case there was a problem with the
            # RefSeq->Uniprot mapping
            assert res
            assert pos
            motif_dict = self._get_seq_motif(row['REFSEQ_PROTEIN'], res, pos)
            # Get evidence
            ev_list = self._get_evidence(
                    row['HPRD_ID'], row['HPRD_ISOFORM'], row['PMIDS'],
                    row['EVIDENCE'], 'ptms', motif_dict)
            stmt = ptm_class(enz_ag, sub_ag, res, pos, evidence=ev_list)
            self.statements.append(stmt)

[docs]    def get_ppis(self, ppi_df):
        """Generate Complex Statements from the HPRD PPI data.

        Parameters
        ----------
        ppi_df : pandas.DataFrame
            DataFrame loaded from the BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt
            file.
        """
        logger.info('Processing PPIs...')
        for ix, row in ppi_df.iterrows():
            hprd_id_a = row['HPRD_ID_A'].strip()
            hprd_id_b = row['HPRD_ID_B'].strip()
            agA = self._make_agent(hprd_id_a)
            agB = self._make_agent(hprd_id_b)
            # If don't get valid agents for both, skip this PPI
            if agA is None or agB is None:
                continue
            isoform_id = '%s_1' % hprd_id_a
            ev_list = self._get_evidence(
                    hprd_id_a, isoform_id, row['PMIDS'],
                    row['EVIDENCE'], 'interactions')
            stmt = Complex([agA, agB], evidence=ev_list)
            self.statements.append(stmt)

    def _make_agent(self, hprd_id, refseq_id=None):
        if hprd_id is None or hprd_id is nan:
            return None
        # Get the basic info (HGNC name/symbol, Entrez ID) from the
        # ID mappings dataframe
        try:
            egid = self.id_df.loc[hprd_id].EGID
        except KeyError:
            logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
            return None
        if not egid:
            logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
            return None
        # Get the HGNC ID
        hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
        # If we couldn't get an HGNC ID for the Entrez ID, this means that
        # the Entrez ID has been discontinued or replaced.
        if not hgnc_id:
            self.no_hgnc_for_egid.update(egid)
            return None
        # Get the (possibly updated) HGNC Symbol
        hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
        assert hgnc_name is not None
        # See if we can get a Uniprot ID from the HGNC symbol--if there is
        # a RefSeq ID we wil also try to use it to get an isoform specific
        # UP ID, but we will have this one to fall back on. But if we can't
        # get one here, then we skip the Statement
        up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
        if not up_id_from_hgnc:
            self.no_up_for_hgnc.update((egid, hgnc_name, hgnc_id))
            return None
        # If we have provided the RefSeq ID, it's because we need to make
        # sure that we are getting the right isoform-specific ID (for sequence
        # positions of PTMs). Here we try to get the Uniprot ID from the
        # Refseq->UP mappings in the protmapper.uniprot_client.
        if refseq_id is not None:
            if not validate_id('REFSEQ_PROT', refseq_id):
                if validate_id('NCBIPROTEIN', refseq_id):
                    refseq_ns = 'NCBIPROTEIN'
                else:
                    refseq_ns = None
            else:
                refseq_ns = 'REFSEQ_PROT'
            if refseq_ns == 'REFSEQ_PROT':
                # Get the Uniprot IDs from the uniprot client
                up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
                                                            reviewed_only=True)
            else:
                up_ids = []
            # Nothing for this RefSeq ID (quite likely because the RefSeq
            # ID is obsolete; take the UP ID from HGNC
            if len(up_ids) == 0:
                self.no_up_for_refseq.update(refseq_id)
                up_id = up_id_from_hgnc
            # More than one reviewed entry--no thanks, we'll take the one
            # from HGNC instead
            elif len(up_ids) > 1:
                self.many_ups_for_refseq.update(refseq_id)
                up_id = up_id_from_hgnc
            # We got a unique, reviewed UP entry for the RefSeq ID
            else:
                up_id = up_ids[0]
                # If it's the canonical isoform, strip off the '-1'
                if up_id.endswith('-1'):
                    up_id = up_id.split('-')[0]
        # For completeness, get the Refseq ID from the HPRD ID table
        else:
            refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
            if not validate_id('REFSEQ_PROT', refseq_id):
                if validate_id('NCBIPROTEIN', refseq_id):
                    refseq_ns = 'NCBIPROTEIN'
                else:
                    refseq_ns = None
            else:
                refseq_ns = 'REFSEQ_PROT'
            up_id = up_id_from_hgnc
        # Make db_refs, return Agent
        db_refs = {}
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
        if up_id:
            if ',' in up_id:
                pass
            elif '-' in up_id:
                up_base = up_id.split('-')[0]
                db_refs['UP'] = up_base
                db_refs['UPISO'] = up_id
            else:
                db_refs['UP'] = up_id
        if egid:
            db_refs['EGID'] = egid
        if refseq_ns and refseq_id:
            db_refs[refseq_ns] = refseq_id
        return Agent(hgnc_name, db_refs=db_refs)

    def _get_evidence(self, hprd_id, isoform_id, pmid_str, evidence_type,
                      info_type, motif_dict=None):
        pmids = pmid_str.split(',')
        ev_list = []
        if motif_dict is None:
            motif_dict = {}
        for pmid in pmids:
            ev_type_list = [] if evidence_type is nan \
                               else evidence_type.split(';')
            # Add the annotations with the site motif info if relevant
            annotations = {'evidence': ev_type_list}
            annotations.update(motif_dict)
            ev = Evidence(source_api='hprd',
                          source_id=_hprd_url(hprd_id, isoform_id, info_type),
                          pmid=pmid, annotations=annotations)
            ev_list.append(ev)
        return ev_list

    def _get_seq_motif(self, refseq_id, residue, pos_str):
        seq = self.seq_dict[refseq_id]
        pos_1ix = int(pos_str)
        pos_0ix = pos_1ix - 1
        if seq[pos_0ix] != residue:
            self.invalid_site_pos.append((refseq_id, residue, pos_str))
            if seq[pos_0ix + 1] == residue:
                self.off_by_one.append((refseq_id, residue, pos_str))
                motif, respos = \
                   ProtMapper.motif_from_position_seq(seq, pos_1ix + 1,
                                                      self.motif_window)
                return {'site_motif': {'motif': motif, 'respos': respos,
                                       'off_by_one': True}}
            else:
                return {}
        else:
            # The index of the residue at the start of the window
            motif, respos = ProtMapper.motif_from_position_seq(seq, pos_1ix,
                                                             self.motif_window)
            return {'site_motif': {'motif': motif, 'respos': respos,
                                   'off_by_one': False}}


def _hprd_url(hprd_id, isoform_id, info_type):
    if info_type == 'interactions':
        return ('http://hprd.org/interactions?hprd_id=%s&isoform_id=%s'
                '&isoform_name=Isoform_1' % (hprd_id, isoform_id))
    elif info_type == 'ptms':
        isoform_num = isoform_id.split('_')[1]
        return ('http://hprd.org/ptms?hprd_id=%s&isoform_id=%s'
                '&isoform_name=Isoform_%s' % (hprd_id, isoform_id, isoform_num))
    else:
        raise ValueError('info_type must be either interactions or ptms.')


def _nan_to_none(val):
    return None if val is nan else val