import logging
from numpy import nan
from collections import Counter
from protmapper import uniprot_client, ProtMapper
from indra.statements import *
from indra.databases import hgnc_client
from indra.statements.validate import validate_id
logger = logging.getLogger(__name__)
# Map of HPRD PTM types to INDRA Statement types
_ptm_map = {
'ADP-Ribosylation': None,
'Acetylation': Acetylation,
'Alkylation': None,
'Amidation': None,
'Carboxylation': None,
'Deacetylation': Deacetylation,
'Deacylation': None,
'Deglycosylation': Deglycosylation,
'Demethylation': Demethylation,
'Dephosphorylation': Dephosphorylation,
'Disulfide Bridge': None,
'Farnesylation': Farnesylation,
'Glucuronosyl transfer': None,
'Glutathionylation': None,
'Glycation': None,
'Glycosyl phosphatidyl inositol GPI': None,
'Glycosylation': Glycosylation,
'Hydroxylation': Hydroxylation,
'Methylation': Methylation,
'Myristoylation': Myristoylation,
'Neddylation': None,
'Nitration': None,
'Palmitoylation': Palmitoylation,
'Phosphorylation': Phosphorylation,
'Prenylation': None,
'Proteolytic Cleavage': None,
'S-Nitrosylation': None,
'Sulfation': None,
'Sumoylation': Sumoylation,
'Transglutamination': None,
'Ubiquitination': Ubiquitination,
}
[docs]class HprdProcessor(object):
"""Get INDRA Statements from HPRD data.
See documentation for `indra.sources.hprd.api.process_flat_files.`
Parameters
----------
id_df : pandas.DataFrame
DataFrame loaded from the HPRD_ID_MAPPINGS.txt file.
cplx_df : pandas.DataFrame
DataFrame loaded from the PROTEIN_COMPLEXES.txt file.
ptm_df : pandas.DataFrame
DataFrame loaded from the POST_TRANSLATIONAL_MODIFICATIONS.txt file.
ppi_df : pandas.DataFrame
DataFrame loaded from the BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt file.
seq_dict : dict
Dictionary mapping RefSeq IDs to protein sequences, loaded from the
PROTEIN_SEQUENCES.txt file.
motif_window : int
Number of flanking amino acids to include on each side of the
PTM target residue in the 'site_motif' annotations field of the
Evidence for Modification Statements. Default is 7.
Attributes
----------
statements : list of INDRA Statements
INDRA Statements (Modifications and Complexes) produced from the
HPRD content.
id_df : pandas.DataFrame
DataFrame loaded from HPRD_ID_MAPPINGS.txt file.
seq_dict :
Dictionary mapping RefSeq IDs to protein sequences, loaded from the
PROTEIN_SEQUENCES.txt file.
no_hgnc_for_egid: collections.Counter
Counter listing Entrez gene IDs reference in the HPRD content that
could not be mapped to a current HGNC ID, along with their frequency.
no_up_for_hgnc : collections.Counter
Counter with tuples of form (entrez_id, hgnc_symbol, hgnc_id) where
the HGNC ID could not be mapped to a Uniprot ID, along with their
frequency.
no_up_for_refseq : collections.Counter
Counter of RefSeq protein IDs that could not be mapped to any Uniprot
ID, along with frequency.
many_ups_for_refseq : collections.Counter
Counter of RefSeq protein IDs that yielded more than one matching
Uniprot ID. Note that in these cases, the Uniprot ID obtained from
HGNC is used.
invalid_site_pos : list of tuples
List of tuples of form (refseq_id, residue, position) indicating sites
of post translational modifications where the protein sequences
provided by HPRD did not contain the given residue at the given
position.
off_by_one : list of tuples
The subset of sites contained in `invalid_site_pos` where the given
residue can be found at position+1 in the HPRD protein sequence,
suggesting an off-by-one error due to numbering based on the
protein with initial methionine cleaved. Note that no mapping is
performed by the processor.
motif_window : int
Number of flanking amino acids to include on each side of the
PTM target residue in the 'site_motif' annotations field of the
Evidence for Modification Statements. Default is 7.
"""
def __init__(self, id_df, cplx_df=None, ptm_df=None, ppi_df=None,
seq_dict=None, motif_window=7):
if cplx_df is None and ptm_df is None and ppi_df is None:
raise ValueError('At least one of cplx_df, ptm_df, or ppi_df must '
'be specified.')
if ptm_df is not None and not seq_dict:
raise ValueError('If ptm_df is given, seq_dict must also be given.')
self.statements = []
self.id_df = id_df
self.seq_dict = seq_dict
self.motif_window = motif_window
# Keep track of the ID mapping issues encountered
self.no_hgnc_for_egid = Counter()
self.no_up_for_hgnc = Counter()
self.no_up_for_refseq = Counter()
self.many_ups_for_refseq = Counter()
self.invalid_site_pos = []
self.off_by_one = []
# Do the actual processing
if cplx_df is not None:
self.get_complexes(cplx_df)
if ptm_df is not None:
self.get_ptms(ptm_df)
if ppi_df is not None:
self.get_ppis(ppi_df)
# Tabulate IDs causing issues
self.no_hgnc_for_egid = Counter(self.no_hgnc_for_egid)
self.no_up_for_hgnc = Counter(self.no_up_for_hgnc)
self.no_up_for_refseq = Counter(self.no_up_for_refseq)
self.many_ups_for_refseq = Counter(self.many_ups_for_refseq)
logger.info('For information on problematic entries encountered while '
'processing, see HprdProcessor attributes: '
'no_hgnc_for_egid, no_up_for_hgnc, no_up_for_refseq, '
'many_ups_for_refseq, invalid_site_pos, and off_by_one.')
[docs] def get_complexes(self, cplx_df):
"""Generate Complex Statements from the HPRD protein complexes data.
Parameters
----------
cplx_df : pandas.DataFrame
DataFrame loaded from the PROTEIN_COMPLEXES.txt file.
"""
# Group the agents for the complex
logger.info('Processing complexes...')
for cplx_id, this_cplx in cplx_df.groupby('CPLX_ID'):
agents = []
for hprd_id in this_cplx.HPRD_ID:
ag = self._make_agent(hprd_id)
if ag is not None:
agents.append(ag)
# Make sure we got some agents!
if not agents or len(agents) < 2:
continue
# Get evidence info from first member of complex
row0 = this_cplx.iloc[0]
isoform_id = '%s_1' % row0.HPRD_ID
ev_list = self._get_evidence(row0.HPRD_ID, isoform_id, row0.PMIDS,
row0.EVIDENCE, 'interactions')
stmt = Complex(agents, evidence=ev_list)
self.statements.append(stmt)
[docs] def get_ptms(self, ptm_df):
"""Generate Modification statements from the HPRD PTM data.
Parameters
----------
ptm_df : pandas.DataFrame
DataFrame loaded from the POST_TRANSLATIONAL_MODIFICATIONS.txt file.
"""
logger.info('Processing PTMs...')
# Iterate over the rows of the dataframe
for ix, row in ptm_df.iterrows():
# Check the modification type; if we can't make an INDRA statement
# for it, then skip it
ptm_class = _ptm_map[row['MOD_TYPE']]
if ptm_class is None:
continue
# Use the Refseq protein ID for the substrate to make sure that
# we get the right Uniprot ID for the isoform
sub_ag = self._make_agent(row['HPRD_ID'],
refseq_id=row['REFSEQ_PROTEIN'])
# If we couldn't get the substrate, skip the statement
if sub_ag is None:
continue
enz_id = _nan_to_none(row['ENZ_HPRD_ID'])
enz_ag = self._make_agent(enz_id)
res = _nan_to_none(row['RESIDUE'])
pos = _nan_to_none(row['POSITION'])
if pos is not None and ';' in pos:
pos, dash = pos.split(';')
assert dash == '-'
# As a fallback for later site mapping, we also get the protein
# sequence information in case there was a problem with the
# RefSeq->Uniprot mapping
assert res
assert pos
motif_dict = self._get_seq_motif(row['REFSEQ_PROTEIN'], res, pos)
# Get evidence
ev_list = self._get_evidence(
row['HPRD_ID'], row['HPRD_ISOFORM'], row['PMIDS'],
row['EVIDENCE'], 'ptms', motif_dict)
stmt = ptm_class(enz_ag, sub_ag, res, pos, evidence=ev_list)
self.statements.append(stmt)
[docs] def get_ppis(self, ppi_df):
"""Generate Complex Statements from the HPRD PPI data.
Parameters
----------
ppi_df : pandas.DataFrame
DataFrame loaded from the BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt
file.
"""
logger.info('Processing PPIs...')
for ix, row in ppi_df.iterrows():
hprd_id_a = row['HPRD_ID_A'].strip()
hprd_id_b = row['HPRD_ID_B'].strip()
agA = self._make_agent(hprd_id_a)
agB = self._make_agent(hprd_id_b)
# If don't get valid agents for both, skip this PPI
if agA is None or agB is None:
continue
isoform_id = '%s_1' % hprd_id_a
ev_list = self._get_evidence(
hprd_id_a, isoform_id, row['PMIDS'],
row['EVIDENCE'], 'interactions')
stmt = Complex([agA, agB], evidence=ev_list)
self.statements.append(stmt)
def _make_agent(self, hprd_id, refseq_id=None):
if hprd_id is None or hprd_id is nan:
return None
# Get the basic info (HGNC name/symbol, Entrez ID) from the
# ID mappings dataframe
try:
egid = self.id_df.loc[hprd_id].EGID
except KeyError:
logger.info('HPRD ID %s not found in mappings table.' % hprd_id)
return None
if not egid:
logger.info('No Entrez ID for HPRD ID %s' % hprd_id)
return None
# Get the HGNC ID
hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
# If we couldn't get an HGNC ID for the Entrez ID, this means that
# the Entrez ID has been discontinued or replaced.
if not hgnc_id:
self.no_hgnc_for_egid.update(egid)
return None
# Get the (possibly updated) HGNC Symbol
hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
assert hgnc_name is not None
# See if we can get a Uniprot ID from the HGNC symbol--if there is
# a RefSeq ID we wil also try to use it to get an isoform specific
# UP ID, but we will have this one to fall back on. But if we can't
# get one here, then we skip the Statement
up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
if not up_id_from_hgnc:
self.no_up_for_hgnc.update((egid, hgnc_name, hgnc_id))
return None
# If we have provided the RefSeq ID, it's because we need to make
# sure that we are getting the right isoform-specific ID (for sequence
# positions of PTMs). Here we try to get the Uniprot ID from the
# Refseq->UP mappings in the protmapper.uniprot_client.
if refseq_id is not None:
if not validate_id('REFSEQ_PROT', refseq_id):
if validate_id('NCBIPROTEIN', refseq_id):
refseq_ns = 'NCBIPROTEIN'
else:
refseq_ns = None
else:
refseq_ns = 'REFSEQ_PROT'
if refseq_ns == 'REFSEQ_PROT':
# Get the Uniprot IDs from the uniprot client
up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
reviewed_only=True)
else:
up_ids = []
# Nothing for this RefSeq ID (quite likely because the RefSeq
# ID is obsolete; take the UP ID from HGNC
if len(up_ids) == 0:
self.no_up_for_refseq.update(refseq_id)
up_id = up_id_from_hgnc
# More than one reviewed entry--no thanks, we'll take the one
# from HGNC instead
elif len(up_ids) > 1:
self.many_ups_for_refseq.update(refseq_id)
up_id = up_id_from_hgnc
# We got a unique, reviewed UP entry for the RefSeq ID
else:
up_id = up_ids[0]
# If it's the canonical isoform, strip off the '-1'
if up_id.endswith('-1'):
up_id = up_id.split('-')[0]
# For completeness, get the Refseq ID from the HPRD ID table
else:
refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN
if not validate_id('REFSEQ_PROT', refseq_id):
if validate_id('NCBIPROTEIN', refseq_id):
refseq_ns = 'NCBIPROTEIN'
else:
refseq_ns = None
else:
refseq_ns = 'REFSEQ_PROT'
up_id = up_id_from_hgnc
# Make db_refs, return Agent
db_refs = {}
if hgnc_id:
db_refs['HGNC'] = hgnc_id
if up_id:
if ',' in up_id:
pass
elif '-' in up_id:
up_base = up_id.split('-')[0]
db_refs['UP'] = up_base
db_refs['UPISO'] = up_id
else:
db_refs['UP'] = up_id
if egid:
db_refs['EGID'] = egid
if refseq_ns and refseq_id:
db_refs[refseq_ns] = refseq_id
return Agent(hgnc_name, db_refs=db_refs)
def _get_evidence(self, hprd_id, isoform_id, pmid_str, evidence_type,
info_type, motif_dict=None):
pmids = pmid_str.split(',')
ev_list = []
if motif_dict is None:
motif_dict = {}
for pmid in pmids:
ev_type_list = [] if evidence_type is nan \
else evidence_type.split(';')
# Add the annotations with the site motif info if relevant
annotations = {'evidence': ev_type_list}
annotations.update(motif_dict)
ev = Evidence(source_api='hprd',
source_id=_hprd_url(hprd_id, isoform_id, info_type),
pmid=pmid, annotations=annotations)
ev_list.append(ev)
return ev_list
def _get_seq_motif(self, refseq_id, residue, pos_str):
seq = self.seq_dict[refseq_id]
pos_1ix = int(pos_str)
pos_0ix = pos_1ix - 1
if seq[pos_0ix] != residue:
self.invalid_site_pos.append((refseq_id, residue, pos_str))
if seq[pos_0ix + 1] == residue:
self.off_by_one.append((refseq_id, residue, pos_str))
motif, respos = \
ProtMapper.motif_from_position_seq(seq, pos_1ix + 1,
self.motif_window)
return {'site_motif': {'motif': motif, 'respos': respos,
'off_by_one': True}}
else:
return {}
else:
# The index of the residue at the start of the window
motif, respos = ProtMapper.motif_from_position_seq(seq, pos_1ix,
self.motif_window)
return {'site_motif': {'motif': motif, 'respos': respos,
'off_by_one': False}}
def _hprd_url(hprd_id, isoform_id, info_type):
if info_type == 'interactions':
return ('http://hprd.org/interactions?hprd_id=%s&isoform_id=%s'
'&isoform_name=Isoform_1' % (hprd_id, isoform_id))
elif info_type == 'ptms':
isoform_num = isoform_id.split('_')[1]
return ('http://hprd.org/ptms?hprd_id=%s&isoform_id=%s'
'&isoform_name=Isoform_%s' % (hprd_id, isoform_id, isoform_num))
else:
raise ValueError('info_type must be either interactions or ptms.')
def _nan_to_none(val):
return None if val is nan else val