# Source code for indra.sources.tas.processor

# Public API of this module: only the processor class is exported
__all__ = ['TasProcessor']

import logging
from indra.statements import Inhibition, Agent, Evidence
from indra.statements.validate import assert_valid_db_refs
from indra.ontology.standardize import standardize_name_db_refs, \
    get_standard_agent
from indra.databases import hgnc_client, chembl_client, lincs_client


# Module-level logger, named after this module
logger = logging.getLogger(__name__)


# Human-readable Kd range for each TAS class value (keyed by the class as a
# string); the range is attached to each Evidence as the 'class_min'
# annotation.
CLASS_MAP = {
    '1': 'Kd < 100nM',
    '2': '100nM < Kd < 1uM',
    '3': '1uM < Kd < 10uM',
    '10': 'Kd > 10uM',
}


# A single LINCS client instance, created at import time and shared by all
# TasProcessor instances; used to look up names of HMS-LINCS small molecules
lincs_client_obj = lincs_client.LincsClient()


class TasProcessor(object):
    """A processor for the Target Affinity Spectrum data table.

    All rows are processed at construction time and the resulting
    Statements are collected in the `statements` attribute.

    Parameters
    ----------
    data : iterable of dict
        Rows of the TAS table. Each row is accessed by the keys ``tas``,
        ``compound_ids``, ``lspci_id``, ``entrez_gene_symbol``,
        ``entrez_gene_id`` and ``references``.
    affinity_class_limit : Optional[int]
        Rows whose ``tas`` value (interpreted as an integer) is greater
        than this limit are skipped. Default: 2
    named_only : Optional[bool]
        If True, drugs for which no name could be found (i.e., whose name
        would just be their ID) are skipped. Default: False
    standardized_only : Optional[bool]
        If True, drugs for which name standardization yields no name are
        skipped. Default: False

    Attributes
    ----------
    statements : list of indra.statements.Inhibition
        The Statements extracted from the rows of the table.
    """
    def __init__(self, data, affinity_class_limit=2, named_only=False,
                 standardized_only=False):
        self._data = data
        self.affinity_class_limit = affinity_class_limit
        self.named_only = named_only
        self.standardized_only = standardized_only

        self.statements = []
        for row in data:
            # Skip rows that are above the affinity class limit
            if int(row['tas']) > affinity_class_limit:
                continue
            self._process_row(row)

    def _process_row(self, row):
        """Extract Inhibition Statements from a single row of the table.

        One Statement is added to `statements` for each distinct drug
        extracted from the row; rows whose target has no HGNC mapping
        are skipped.
        """
        prot = self._extract_protein(row['entrez_gene_symbol'],
                                     row['entrez_gene_id'])
        # NOTE: there are several entries in this data set that refer to
        # non-human Entrez genes, e.g.
        # https://www.ncbi.nlm.nih.gov/gene/3283880
        # We skip these for now because resources for Entrez-based
        # mappings for non-human genes are not integrated, and would cause
        # pre-assembly issues.
        # This check is done first so that drug standardization and name
        # lookups are not run for rows that are discarded anyway.
        if 'HGNC' not in prot.db_refs:
            return
        drugs = self._extract_drugs(row['compound_ids'], row['lspci_id'])
        evidences = self._make_evidences(row['tas'], row['references'])
        for drug in drugs:
            self.statements.append(Inhibition(drug, prot, evidence=evidences))

    def _extract_drugs(self, compound_ids, lspci_id):
        """Return a list of distinct drug Agents for a row's compound IDs.

        Parameters
        ----------
        compound_ids : str
            One or more compound IDs separated by ``|``. IDs starting with
            ``CHEMBL`` and ``HMSL`` are recognized; other IDs only trigger
            a warning and the Agent is grounded by its LSPCI ID alone.
        lspci_id : str
            The LSPCI ID added to the groundings of every drug of the row.

        Returns
        -------
        list of indra.statements.Agent
            The drug Agents, de-duplicated by their matches key.
        """
        hmsl_prefix = 'HMSL'
        drugs = []
        for id_ in compound_ids.split('|'):
            db_refs = {'LSPCI': lspci_id}
            if id_.startswith('CHEMBL'):
                db_refs['CHEMBL'] = id_
            elif id_.startswith(hmsl_prefix):
                # HMS-LINCS IDs are stored without the HMSL prefix
                db_refs['HMS-LINCS'] = id_[len(hmsl_prefix):]
            else:
                logger.warning('Unhandled ID type: %s', id_)
            # Name standardization finds correct names but because
            # ChEMBL is incomplete as a local resource, we don't
            # universally standardize its names, instead, we look
            # it up explicitly when necessary.
            name, db_refs = standardize_name_db_refs(db_refs)
            if name is None:
                # This is one way to detect that the drug could not be
                # standardized beyond just its name so in the
                # standardized_only condition, we skip this drug
                if self.standardized_only:
                    continue
                elif 'HMS-LINCS' in db_refs:
                    name = lincs_client_obj.get_small_molecule_name(
                        db_refs['HMS-LINCS'])
                elif 'CHEMBL' in db_refs:
                    name = chembl_client.get_chembl_name(db_refs['CHEMBL'])
            # If name is still None, we just use the ID as the name
            if name is None:
                # With the named_only restriction, we skip drugs without
                # a proper name.
                if self.named_only:
                    continue
                name = id_
            assert_valid_db_refs(db_refs)
            drugs.append(Agent(name, db_refs=db_refs))
        # Keep a single Agent per matches key
        drugs = list({agent.matches_key():
                      agent for agent in drugs}.values())
        return drugs

    def _extract_protein(self, name, gene_id):
        """Return a standardized Agent for the target gene of a row.

        The Agent is grounded to the given Entrez gene ID and, if a
        mapping is available, to the corresponding HGNC ID.
        """
        db_refs = {'EGID': gene_id}
        hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
        return get_standard_agent(name, db_refs=db_refs)

    @staticmethod
    def _parse_reference(reference):
        """Split one ``namespace:id`` reference into Evidence fields.

        Parameters
        ----------
        reference : str
            A single reference entry, e.g. ``pubmed:12345``.

        Returns
        -------
        tuple
            A ``(pmid, source_id, text_refs)`` tuple. ``pubmed`` references
            set the PMID and a PMID text ref, ``doi`` references set a DOI
            text ref, and anything else (including a reference without a
            ``:`` separator) is kept whole as the source ID.
        """
        # Split on the first colon only: the ID part may itself contain
        # colons (DOIs can), which must not break the parsing.
        namespace, sep, id_ = reference.partition(':')
        if not sep:
            return None, reference, None
        if namespace == 'pubmed':
            return id_, None, {'PMID': id_}
        if namespace == 'doi':
            return None, None, {'DOI': id_}
        return None, reference, None

    def _make_evidences(self, class_min, references):
        """Return one Evidence per reference of a row.

        Parameters
        ----------
        class_min : str or int
            The TAS class of the row; it is looked up in CLASS_MAP and the
            resulting range is stored as the ``class_min`` annotation.
        references : str
            One or more ``namespace:id`` references separated by ``|``.

        Returns
        -------
        list of indra.statements.Evidence
            The Evidences, each with source API ``tas``.

        Raises
        ------
        KeyError
            If the TAS class has no entry in CLASS_MAP.
        """
        # CLASS_MAP is keyed by strings; accept integer classes as well
        class_label = CLASS_MAP[str(class_min)]
        evidences = []
        for reference in references.split('|'):
            pmid, source_id, text_refs = self._parse_reference(reference)
            # Each Evidence gets its own annotations dict so that they can
            # be modified independently later on
            ev = Evidence(source_api='tas', source_id=source_id, pmid=pmid,
                          annotations={'class_min': class_label},
                          epistemics={'direct': True},
                          text_refs=text_refs)
            evidences.append(ev)
        return evidences