__all__ = ['TasProcessor']
import logging
from indra.statements import Inhibition, Agent, Evidence
from indra.statements.validate import assert_valid_db_refs
from indra.ontology.standardize import standardize_name_db_refs, \
get_standard_agent
from indra.databases import hgnc_client, chembl_client, lincs_client
# Module-level logger; used below to warn about compound IDs of an
# unhandled type.
logger = logging.getLogger(__name__)
# Maps a TAS class value (as a string key) to the human-readable affinity
# range that is stored under the 'class_min' annotation of each Evidence.
CLASS_MAP = {'1': 'Kd < 100nM', '2': '100nM < Kd < 1uM',
'3': '1uM < Kd < 10uM', '10': 'Kd > 10uM'}
# Single LINCS client instance shared by all processors; used to look up
# small molecule names from HMS-LINCS IDs.
lincs_client_obj = lincs_client.LincsClient()
class TasProcessor(object):
    """A processor for the Target Affinity Spectrum data table.

    Every row that passes the affinity class filter and whose target can be
    mapped to an HGNC ID yields one Inhibition statement per distinct drug
    listed in the row.

    Parameters
    ----------
    data : iterable
        Rows of the TAS table. Each row is indexed by the keys 'tas',
        'compound_ids', 'lspci_id', 'entrez_gene_symbol', 'entrez_gene_id'
        and 'references'.
    affinity_class_limit : Optional[int]
        Rows whose 'tas' value, converted to int, is greater than this
        limit are skipped. Default: 2
    named_only : Optional[bool]
        If True, drugs for which no proper name could be found are skipped
        instead of being named after their raw ID. Default: False
    standardized_only : Optional[bool]
        If True, drugs for which name standardization returned no name are
        skipped. Default: False

    Attributes
    ----------
    statements : list
        The Inhibition statements extracted from the data.
    """

    def __init__(self, data, affinity_class_limit=2, named_only=False,
                 standardized_only=False):
        self._data = data
        self.affinity_class_limit = affinity_class_limit
        self.named_only = named_only
        self.standardized_only = standardized_only
        self.statements = []

        for row in data:
            # Skip rows that are above the affinity class limit
            if int(row['tas']) > affinity_class_limit:
                continue
            self._process_row(row)

    def _process_row(self, row):
        """Extract statements from one row and add them to self.statements.

        Parameters
        ----------
        row : dict
            A single row of the TAS table.
        """
        # The target is resolved first so that rows which end up being
        # skipped do not pay for drug name lookups and evidence construction.
        prot = self._extract_protein(row['entrez_gene_symbol'],
                                     row['entrez_gene_id'])
        # NOTE: there are several entries in this data set that refer to
        # non-human Entrez genes, e.g.
        # https://www.ncbi.nlm.nih.gov/gene/3283880
        # We skip these for now because resources for Entrez-based
        # mappings for non-human genes are not integrated, and would cause
        # pre-assembly issues.
        if 'HGNC' not in prot.db_refs:
            return

        drugs = self._extract_drugs(row['compound_ids'], row['lspci_id'])
        evidences = self._make_evidences(row['tas'], row['references'])
        for drug in drugs:
            self.statements.append(Inhibition(drug, prot, evidence=evidences))

    def _extract_drugs(self, compound_ids, lspci_id):
        """Return the distinct drug Agents described by a compound ID string.

        Parameters
        ----------
        compound_ids : str
            One or more compound IDs separated by '|'. IDs starting with
            'CHEMBL' or 'HMSL' are recognized; any other ID is logged as
            unhandled and the drug is grounded by its LSPCI ID only.
        lspci_id : str
            The LSPCI ID shared by all compounds of the row.

        Returns
        -------
        list
            Agents with duplicates (same matches key) removed.
        """
        drugs = []
        for id_ in compound_ids.split('|'):
            db_refs = {'LSPCI': lspci_id}
            if id_.startswith('CHEMBL'):
                db_refs['CHEMBL'] = id_
            elif id_.startswith('HMSL'):
                # HMS-LINCS IDs are stored without the 'HMSL' prefix
                db_refs['HMS-LINCS'] = id_.split('HMSL')[1]
            else:
                logger.warning('Unhandled ID type: %s', id_)
            # Name standardization finds correct names but because
            # ChEMBL is incomplete as a local resource, we don't
            # universally standardize its names, instead, we look
            # it up explicitly when necessary.
            name, db_refs = standardize_name_db_refs(db_refs)
            if name is None:
                # This is one way to detect that the drug could not be
                # standardized beyond just its name so in the
                # standardized_only condition, we skip this drug
                if self.standardized_only:
                    continue
                elif 'HMS-LINCS' in db_refs:
                    name = lincs_client_obj.get_small_molecule_name(
                        db_refs['HMS-LINCS'])
                elif 'CHEMBL' in db_refs:
                    name = chembl_client.get_chembl_name(db_refs['CHEMBL'])
            # If name is still None, we just use the ID as the name
            if name is None:
                # With the named_only restriction, we skip drugs without
                # a proper name.
                if self.named_only:
                    continue
                name = id_
            assert_valid_db_refs(db_refs)
            drugs.append(Agent(name, db_refs=db_refs))
        # Deduplicate by matches key; for a repeated key the last Agent wins
        # while the position of its first occurrence is kept.
        drugs = list({agent.matches_key(): agent
                      for agent in drugs}.values())
        return drugs

    def _extract_protein(self, name, gene_id):
        """Return the target Agent for an Entrez gene.

        Parameters
        ----------
        name : str
            The Entrez gene symbol of the target.
        gene_id : str
            The Entrez gene ID of the target.

        Returns
        -------
        Agent
            The result of get_standard_agent for the given name, grounded
            to the Entrez ID and, when a mapping exists, to the HGNC ID.
        """
        db_refs = {'EGID': gene_id}
        hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
        return get_standard_agent(name, db_refs=db_refs)

    @staticmethod
    def _parse_reference(reference):
        """Split a single TAS reference into its evidence identifiers.

        Parameters
        ----------
        reference : str
            A reference of the form '<type>:<id>'. Only the first colon is
            treated as the separator, so IDs that contain colons themselves
            (as DOIs may) are preserved intact.

        Returns
        -------
        tuple
            A (pmid, source_id, text_refs) triple. 'pubmed' references set
            pmid and text_refs, 'doi' references set text_refs only, and
            anything else (including a reference without a colon) is
            returned verbatim as source_id.
        """
        ref_type, sep, ref_id = reference.partition(':')
        if sep and ref_type == 'pubmed':
            return ref_id, None, {'PMID': ref_id}
        if sep and ref_type == 'doi':
            return None, None, {'DOI': ref_id}
        return None, reference, None

    def _make_evidences(self, class_min, references):
        """Return one Evidence per reference of a row.

        Parameters
        ----------
        class_min : str or int
            The TAS class of the row; it is converted to str to look up the
            affinity range in CLASS_MAP.
        references : str
            One or more references separated by '|'.

        Returns
        -------
        list
            Evidence objects with source_api 'tas', each annotated with the
            affinity range under the 'class_min' key.

        Raises
        ------
        KeyError
            If the TAS class is not a key of CLASS_MAP.
        """
        # The label is the same for every reference of the row, so it is
        # looked up once; str() makes integer class values work too.
        class_label = CLASS_MAP[str(class_min)]
        evidences = []
        for reference in references.split('|'):
            pmid, source_id, text_refs = self._parse_reference(reference)
            ev = Evidence(source_api='tas', source_id=source_id, pmid=pmid,
                          # Fresh dict per Evidence so they never share state
                          annotations={'class_min': class_label},
                          epistemics={'direct': True},
                          text_refs=text_refs)
            evidences.append(ev)
        return evidences