import logging
import textwrap
from copy import deepcopy
from functools import lru_cache
from protmapper.api import ProtMapper, default_site_map
from indra.statements import *
from indra.databases import hgnc_client
logger = logging.getLogger(__name__)
[docs]class MappedStatement(object):
"""Information about a Statement found to have invalid sites.
Parameters
----------
original_stmt : :py:class:`indra.statements.Statement`
The statement prior to mapping.
mapped_mods : list of MappedSite
A list of MappedSite objects.
mapped_stmt : :py:class:`indra.statements.Statement`
The statement after mapping. Note that if no information was found
in the site map, it will be identical to the original statement.
"""
def __init__(self, original_stmt, mapped_mods, mapped_stmt):
self.original_stmt = original_stmt
self.mapped_mods = mapped_mods
self.mapped_stmt = mapped_stmt
def __str__(self):
if not self.mapped_mods:
mm_str = str(self.mapped_mods)
else:
mm_ws = '\n' + (' ' * 17)
mm_str = mm_ws.join([str(mm) for mm in self.mapped_mods])
summary = textwrap.dedent("""
MappedStatement:
original_stmt: {0}
mapped_mods: {1}
mapped_stmt: {2}
""")
return summary.format(self.original_stmt, mm_str, self.mapped_stmt)
def __repr__(self):
return str(self)
[docs]class SiteMapper(ProtMapper):
"""
Use site information to fix modification sites in Statements.
This is a wrapper around the protmapper package's ProtMapper class and adds
all the additional functionality to handle INDRA Statements and Agents.
Parameters
----------
site_map : dict (as returned by :py:func:`load_site_map`)
A dict mapping tuples of the form `(gene, orig_res, orig_pos)` to a
tuple of the form `(correct_res, correct_pos, comment)`, where `gene`
is the string name of the gene (canonicalized to HGNC); `orig_res` and
`orig_pos` are the residue and position to be mapped; `correct_res` and
`correct_pos` are the corrected residue and position, and `comment` is
a string describing the reason for the mapping (species error, isoform
error, wrong residue name, etc.).
use_cache : Optional[bool]
If True, the SITEMAPPER_CACHE_PATH from the config (or environment)
is loaded and cached mappings are read and written to the given path.
Otherwise, no cache is used. Default: False
do_methionine_offset : boolean
Whether to check for off-by-one errors in site position (possibly)
attributable to site numbering from mature proteins after
cleavage of the initial methionine. If True, checks the reference
sequence for a known modification at 1 site position greater
than the given one; if there exists such a site, creates the
mapping. Default is True.
do_orthology_mapping : boolean
Whether to check sequence positions for known modification sites
in mouse or rat sequences (based on PhosphoSitePlus data). If a
mouse/rat site is found that is linked to a site in the human
reference sequence, a mapping is created. Default is True.
do_isoform_mapping : boolean
Whether to check sequence positions for known modifications
in other human isoforms of the protein (based on PhosphoSitePlus
data). If a site is found that is linked to a site in the human
reference sequence, a mapping is created. Default is True.
Examples
--------
Fixing site errors on both the modification state of an agent (MAP2K1) and
the target of a Phosphorylation statement (MAPK1):
>>> map2k1_phos = Agent('MAP2K1', db_refs={'UP':'Q02750'}, mods=[
... ModCondition('phosphorylation', 'S', '217'),
... ModCondition('phosphorylation', 'S', '221')])
>>> mapk1 = Agent('MAPK1', db_refs={'UP':'P28482'})
>>> stmt = Phosphorylation(map2k1_phos, mapk1, 'T','183')
>>> (valid, mapped) = default_mapper.map_sites([stmt])
>>> valid
[]
>>> mapped
[
MappedStatement:
original_stmt: Phosphorylation(MAP2K1(mods: (phosphorylation, S, 217), (phosphorylation, S, 221)), MAPK1(), T, 183)
mapped_mods: MappedSite(up_id='Q02750', error_code=None, valid=False, orig_res='S', orig_pos='217', mapped_id='Q02750', mapped_res='S', mapped_pos='218', description='off by one', gene_name='MAP2K1')
MappedSite(up_id='Q02750', error_code=None, valid=False, orig_res='S', orig_pos='221', mapped_id='Q02750', mapped_res='S', mapped_pos='222', description='off by one', gene_name='MAP2K1')
MappedSite(up_id='P28482', error_code=None, valid=False, orig_res='T', orig_pos='183', mapped_id='P28482', mapped_res='T', mapped_pos='185', description='INFERRED_MOUSE_SITE', gene_name='MAPK1')
mapped_stmt: Phosphorylation(MAP2K1(mods: (phosphorylation, S, 218), (phosphorylation, S, 222)), MAPK1(), T, 185)
]
>>> ms = mapped[0]
>>> ms.original_stmt
Phosphorylation(MAP2K1(mods: (phosphorylation, S, 217), (phosphorylation, S, 221)), MAPK1(), T, 183)
>>> ms.mapped_mods
[MappedSite(up_id='Q02750', error_code=None, valid=False, orig_res='S', orig_pos='217', mapped_id='Q02750', mapped_res='S', mapped_pos='218', description='off by one', gene_name='MAP2K1'), MappedSite(up_id='Q02750', error_code=None, valid=False, orig_res='S', orig_pos='221', mapped_id='Q02750', mapped_res='S', mapped_pos='222', description='off by one', gene_name='MAP2K1'), MappedSite(up_id='P28482', error_code=None, valid=False, orig_res='T', orig_pos='183', mapped_id='P28482', mapped_res='T', mapped_pos='185', description='INFERRED_MOUSE_SITE', gene_name='MAPK1')]
>>> ms.mapped_stmt
Phosphorylation(MAP2K1(mods: (phosphorylation, S, 218), (phosphorylation, S, 222)), MAPK1(), T, 185)
"""
def __init__(self, site_map=None, use_cache=False, cache_path=None,
do_methionine_offset=True, do_orthology_mapping=True,
do_isoform_mapping=True):
super(SiteMapper, self).__init__(site_map, use_cache, cache_path)
self.do_methionine_offset = do_methionine_offset
self.do_orthology_mapping = do_orthology_mapping
self.do_isoform_mapping = do_isoform_mapping
def map_stmt_sites(self, stmt):
stmt_copy = deepcopy(stmt)
# For all statements, replace agents with invalid modifications
mapped_sites = []
new_agent_list = []
for agent in stmt.agent_list():
# If None agent, add to list and continue
if agent is None:
new_agent_list.append(agent)
continue
# Otherwise do mapping
agent_mapped_sites, new_agent = self._map_agent_sites(agent)
mapped_sites += agent_mapped_sites
# Site map agents in the bound conditions
for ind in range(len(new_agent.bound_conditions)):
b = new_agent.bound_conditions[ind].agent
agent_mapped_sites, new_b = self._map_agent_sites(b)
mapped_sites += agent_mapped_sites
new_agent.bound_conditions[ind].agent = new_b
new_agent_list.append(new_agent)
# If we made any actual mappings, we set the new agent list which
# includes any mapped agents
if any([ms.has_mapping() for ms in mapped_sites]):
stmt_copy.set_agent_list(new_agent_list)
# --- Special handling for these statements ---
# For modifications, fix residue and position
if (isinstance(stmt, Modification) or
isinstance(stmt, SelfModification)) and \
stmt.residue is not None and stmt.position is not None:
# Make sure we didn't end up with lists by accident
assert isinstance(stmt.residue, str) and \
isinstance(stmt.position, str)
# Get the right agent depending on whether this is a
# Modification or SelfModification statement
agent_to_check = (stmt_copy.sub
if isinstance(stmt, Modification)
else stmt_copy.enz)
# Check the modification on the appropriate agent
old_mod = stmt._get_mod_condition()
# Figure out if this site is invalid
stmt_mapped_site = self._map_agent_mod(agent_to_check, old_mod)
if stmt_mapped_site is not None:
# If we got a mapping for the site, we apply that mapping to the
# copy of the statement
if stmt_mapped_site.has_mapping():
stmt_copy.residue = stmt_mapped_site.mapped_res
stmt_copy.position = stmt_mapped_site.mapped_pos
# Additionally, if there is a mapped site object at all
# (whether or not there is an actual successful mapping), we
# add that to the list of mapped sites
mapped_sites.append(stmt_mapped_site)
# We only return a MappedStatement if it has at least one MappedSite
# that is known to be invalid (whether of not it as successfully
# mapped). Otherwise we return None.
if any([(ms is not None and not ms.not_invalid())
for ms in mapped_sites]):
mapped_stmt = MappedStatement(stmt, mapped_sites, stmt_copy)
else:
mapped_stmt = None
return mapped_stmt
[docs] def map_sites(self, stmts):
"""Check a set of statements for invalid modification sites.
Statements are checked against Uniprot reference sequences to determine
if residues referred to by post-translational modifications exist at
the given positions.
If there is nothing amiss with a statement (modifications on any of the
agents, modifications made in the statement, etc.), then the statement
goes into the list of valid statements. If there is a problem with the
statement, the offending modifications are looked up in the site map
(:py:attr:`site_map`), and an instance of :py:class:`MappedStatement`
is added to the list of mapped statements.
Parameters
----------
stmts : list of :py:class:`indra.statement.Statement`
The statements to check for site errors.
Returns
-------
tuple
2-tuple containing (valid_statements, mapped_statements). The first
element of the tuple is a list of valid statements
(:py:class:`indra.statement.Statement`) that were not found to
contain any site errors. The second element of the tuple is a list
of mapped statements (:py:class:`MappedStatement`) with information
on the incorrect sites and corresponding statements with correctly
mapped sites.
"""
valid_statements = []
mapped_statements = []
for stmt in stmts:
# Check for errors in the position str
# TODO: this could also be used on agent conditions, here
# it's only applied to statement position arguments
if isinstance(stmt, (Modification, SelfModification)) and \
not _valid_position_str(stmt.position):
continue
mapped_stmt = self.map_stmt_sites(stmt)
# If we got a MappedStatement as a return value, we add that to the
# list of mapped statements, otherwise, the original Statement is
# not invalid so we add it to the other list directly.
if mapped_stmt is not None:
mapped_statements.append(mapped_stmt)
else:
valid_statements.append(stmt)
return valid_statements, mapped_statements
def _map_agent_sites(self, agent):
"""Check an agent for invalid sites and update if necessary.
Parameters
----------
agent : :py:class:`indra.statements.Agent`
Agent to check for invalid modification sites.
Returns
-------
tuple
The first element is a list of MappedSite objects, the second
element is either the original Agent, if unchanged, or a copy
of it.
"""
# If there are no modifications on this agent, then we can return the
# copy of the agent
if agent is None or not agent.mods:
return [], agent
new_agent = deepcopy(agent)
mapped_sites = []
# Now iterate over all the modifications and map each one
for idx, mod_condition in enumerate(agent.mods):
mapped_site = \
self._map_agent_mod(agent, mod_condition)
# If we couldn't do the mapping or the mapped site isn't invalid
# then we don't need to change the existing ModCondition
if not mapped_site or mapped_site.not_invalid():
continue
# Otherwise, if there is a mapping, we replace the old ModCondition
# with the new one where only the residue and position are updated,
# the mod type and the is modified flag are kept.
if mapped_site.has_mapping():
mc = ModCondition(mod_condition.mod_type,
mapped_site.mapped_res,
mapped_site.mapped_pos,
mod_condition.is_modified)
new_agent.mods[idx] = mc
# Finally, whether or not we have a mapping, we keep track of mapped
# sites and make them available to the caller
mapped_sites.append(mapped_site)
return mapped_sites, new_agent
def _map_agent_mod(self, agent, mod_condition):
"""Map a single modification condition on an agent.
Parameters
----------
agent : :py:class:`indra.statements.Agent`
Agent to check for invalid modification sites.
mod_condition : :py:class:`indra.statements.ModCondition`
Modification to check for validity and map.
Returns
-------
protmapper.MappedSite or None
A MappedSite object is returned if a UniProt ID was found for the
agent, and if both the position and residue for the modification
condition were available. Otherwise None is returned.
"""
# Get the UniProt ID of the agent, if not found, return
up_id = _get_uniprot_id(agent)
if not up_id:
logger.debug("No uniprot ID for %s" % agent.name)
return None
# If no site information for this residue, skip
if mod_condition.position is None or mod_condition.residue is None:
return None
# Otherwise, try to map it and return the mapped site
mapped_site = \
self.map_to_human_ref(up_id, 'uniprot',
mod_condition.residue,
mod_condition.position,
do_methionine_offset=self.do_methionine_offset,
do_orthology_mapping=self.do_orthology_mapping,
do_isoform_mapping=self.do_isoform_mapping)
return mapped_site
default_mapper = SiteMapper(default_site_map)
# TODO: determine if this should be done in the protmapper or if this is the
# preferred place
@lru_cache(maxsize=10000)
def _get_uniprot_id(agent):
"""Return the UniProt ID for an agent, looking up in HGNC if necessary.
If the UniProt ID is a list then return the first ID by default.
"""
up_id = agent.db_refs.get('UP')
hgnc_id = agent.db_refs.get('HGNC')
if up_id is None:
if hgnc_id is None:
# If both UniProt and HGNC refs are missing we can't
# sequence check and so don't report a failure.
return None
# Try to get UniProt ID from HGNC
up_id = hgnc_client.get_uniprot_id(hgnc_id)
# If this fails, again, we can't sequence check
if up_id is None:
return None
# If the UniProt ID is a list then choose the first one.
if not isinstance(up_id, str) and \
isinstance(up_id[0], str):
up_id = up_id[0]
return up_id
def _valid_position_str(pos):
# None positions are valid
if pos is None:
return True
# Check if the string is a simple int
try:
return pos == str(int(float(pos)))
# If the float conversion of the string fails, (e.g., if it
# contains a non-numeric character), it is invalid
except ValueError:
return False