import logging
import requests

from indra.databases import uniprot_client, hgnc_client
from indra.statements.validate import validate_text_refs
from indra.statements import Phosphorylation, Evidence, Agent

from .phosphoelm_mapping import phosphoelm_mapping

# URL of the Gilda grounding web service (presumably the endpoint that
# _gilda_grounder sends its requests to).
# NOTE(review): the value is an empty string here; an HTTP request against
# it cannot succeed, so the real endpoint appears to be missing -- confirm
# and restore it before relying on Gilda-based grounding.
gilda_url = ''
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class PhosphoElmProcessor(object):
    """Processes data dumps from the phospho.ELM database.

    Parameters
    ----------
    phosphoelm_data : list[dict]
        JSON compatible list of entries from a phospho.ELM data dump

    Attributes
    ----------
    statements : list[indra.statements.Phosphorylation]
        A list of the phosphorylation statements produced by the entries
        in phosphoelm_data
    """
    def __init__(self, phosphoelm_data):
        self.statements = []
        self._phosphoelm_data = phosphoelm_data

    def process_phosphorylations(self, skip_empty=True):
        """Create Phosphorylation statements from phosphoelm_data

        The produced statements are appended to ``self.statements``;
        nothing is returned. Entries for species other than human are
        always skipped, as are entries whose substrate ID cannot be turned
        into an Agent.

        Parameters
        ----------
        skip_empty : bool
            Default: True. If False, also create statements when upstream
            kinases in entry['kinases'] are not known.
        """
        for entry in self._phosphoelm_data:
            # Skip entries without any kinases (only when skip_empty is
            # set) or if species is other than human.
            if entry['species'].lower() != 'homo sapiens' or \
                    (skip_empty and not entry['kinases']):
                continue

            # Entries:
            # 'acc': '<UP ID>', <-- substrate
            # 'sequence': '<protein sequence>',
            # 'position': '<sequence position>',
            # 'code': '<phosphorylated residue>',
            # 'pmids': '<pmid>',
            # 'kinases': '<responsible kinase>', <-- enzyme
            # 'source': 'HTP|LTP',
            # 'species': '<species name in latin>',
            # 'entry_date': 'yyyy-mm-dd HH:MM:SS.mmmmmm'
            substrate = _agent_from_id(entry['acc'])
            # _agent_from_id returns None when no gene name is found for
            # the ID; a phosphorylation without a substrate carries no
            # information, so such entries are dropped.
            if substrate is None:
                continue

            # Only ground the kinase when one is actually given, so that
            # an unknown kinase yields enz=None rather than an attempt to
            # ground empty text.
            kinase_text = entry['kinases']
            enzyme = _agent_from_str(kinase_text) if kinase_text else None

            # Skip if enz is None instead of an Agent (only when we skip
            # empty kinase entries)
            if skip_empty and enzyme is None:
                continue

            # Drop the PMID if it does not pass text refs validation
            pmid = entry['pmids']
            if not validate_text_refs({'PMID': pmid}):
                pmid = None

            # Build evidence, add statement
            evidence = Evidence(
                source_api='phosphoelm',
                pmid=pmid,
                annotations={
                    'data_source': entry.get('source'),
                    'phosphoelm_substrate_id': entry['acc'],
                    'phosphoelm_kinase_name': entry.get('kinases'),
                    'entry_date': entry['entry_date'],
                    'sequence': entry['sequence']
                }
            )
            self.statements.append(Phosphorylation(
                enz=enzyme,
                sub=substrate,
                residue=entry['code'],
                position=entry['position'],
                evidence=evidence)
            )
def _agent_from_id(db_id): # There are some Ensembl protein IDs which we currently can't normalize # to anything else (unlike ENSG). if db_id.startswith('ENSP'): db_refs = {'ENSEMBL': db_id} name = db_id # All other entries are UniProt IDs else: name = uniprot_client.get_gene_name(db_id) if not name: return None if '-' in db_id: up_base = db_id.split('-')[0] db_refs = {'UP': up_base, 'UPISO': db_id} else: db_refs = {'UP': db_id} hgnc_id = uniprot_client.get_hgnc_id(db_id) if hgnc_id: db_refs['HGNC'] = hgnc_id return Agent(name, db_refs=db_refs) def _agent_from_str(txt): """Return a grounded Agent from the name of an entity. Parameters ---------- txt : str A string representing an entity Returns ------- ag : indra.statements.Agent A grounded INDRA Agent corresponding to the provided entity text. """ # Check if kinase is in the hard coded mapping if txt in phosphoelm_mapping: name = txt ns, _id = phosphoelm_mapping[txt] # If None is hard coded in the map it means we should skip this if ns is None: return None else: term = _gilda_grounder(txt) if term is None: name = txt ns, _id = None, None else: name = term['entry_name'] ns, _id = term['db'], term['id'] db_refs = {'TEXT': txt} if ns is not None and _id is not None: db_refs[ns] = _id if ns == 'HGNC': up_id = hgnc_client.get_uniprot_id(_id) if up_id: db_refs['UP'] = up_id name = hgnc_client.get_hgnc_name(_id) elif ns == 'FPLX': name = _id return Agent(name, db_refs=db_refs) def _gilda_grounder(txt): # Pre-process text for grounding txt = txt.replace('_group', '') txt = txt.replace('_', '-') txt = txt.split('/')[0] res =, json={'text': txt}) if res.status_code != 200: logger.warning('Gilda service responded with status code %d' % res.status_code) return None rj = res.json() if not rj: return None top_term = rj[0]['term'] return top_term