Source code for indra.databases.lincs_client

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str

__all__ = ['get_drug_target_data', 'LincsClient', 'load_lincs_csv']

import os
import sys
import json
import logging
import requests
from io import StringIO, BytesIO
from indra.util import read_unicode_csv_fileobj
from indra.databases.identifiers import ensure_chembl_prefix


# Module-level logger named after this module
logger = logging.getLogger(__name__)


# Base URL of the HMS LINCS database web service
LINCS_URL = 'http://lincs.hms.harvard.edu/db'


# The resources folder sits one level above this module's directory
resources = os.path.join(
    os.path.abspath(os.path.dirname(__file__)), os.pardir, 'resources')
# JSON tables with the small molecule and protein metadata
lincs_sm = os.path.join(resources, 'lincs_small_molecules.json')
lincs_prot = os.path.join(resources, 'lincs_proteins.json')


class LincsClient(object):
    """Client for querying LINCS small molecules and proteins.

    On construction the small molecule and protein metadata tables are read
    from the JSON resource files, and the small molecule table is extended
    with the entries returned by load_lincs_extras().
    """
    def __init__(self):
        with open(lincs_sm, 'r') as fh:
            self._sm_data = json.load(fh)
        # Entries from the extras file override JSON entries with the same key
        self._sm_data.update(load_lincs_extras())

        with open(lincs_prot, 'r') as fh:
            self._prot_data = json.load(fh)

    def get_small_molecule_name(self, hms_lincs_id):
        """Get the name of a small molecule from the LINCS sm metadata.

        Parameters
        ----------
        hms_lincs_id : str
            The HMS LINCS ID of the small molecule.

        Returns
        -------
        str or None
            The name of the small molecule, or None if there is no entry
            for the given ID.
        """
        entry = self._get_entry_by_id(self._sm_data, hms_lincs_id)
        if not entry:
            return None
        return entry['Name']

    def get_small_molecule_refs(self, hms_lincs_id):
        """Get the id refs of a small molecule from the LINCS sm metadata.

        Parameters
        ----------
        hms_lincs_id : str
            The HMS LINCS ID of the small molecule.

        Returns
        -------
        dict
            A dictionary of references. It always contains the HMS-LINCS
            key with the ID that was passed in; CHEMBL, CHEBI, PUBCHEM and
            LINCS are added when the entry has a non-empty value for them.
        """
        refs = {'HMS-LINCS': hms_lincs_id}

        entry = self._get_entry_by_id(self._sm_data, hms_lincs_id)
        # If there is no entry for this ID
        if not entry:
            return refs

        # If there is an entry then fill up the refs with existing values
        mappings = dict(chembl='ChEMBL ID', chebi='ChEBI ID',
                        pubchem='PubChem CID', lincs='LINCS ID')
        for k, v in mappings.items():
            value = entry.get(v)
            if not value:
                continue
            key = k.upper()
            # Swap in primary PubChem IDs where there is an outdated one
            if key == 'PUBCHEM':
                value = pc_to_primary_mappings.get(value, value)
            # Fix CHEMBL IDs
            elif key == 'CHEMBL':
                value = ensure_chembl_prefix(value)
            refs[key] = value
        return refs

    def get_protein_refs(self, hms_lincs_id):
        """Get the refs for a protein from the LINCS protein metadata.

        Parameters
        ----------
        hms_lincs_id : str
            The HMS LINCS ID for the protein

        Returns
        -------
        dict
            A dictionary of protein references. It always contains the
            HMS-LINCS key; EGID and UP are added when the entry has a
            non-empty 'Gene ID' / 'UniProt ID' value.
        """
        # TODO: We could get phosphorylation states from the protein data.
        refs = {'HMS-LINCS': hms_lincs_id}

        entry = self._get_entry_by_id(self._prot_data, hms_lincs_id)
        # If there is no entry for this ID
        if not entry:
            return refs
        mappings = dict(egid='Gene ID', up='UniProt ID')
        for k, v in mappings.items():
            value = entry.get(v)
            if value:
                refs[k.upper()] = value
        return refs

    def _get_entry_by_id(self, resource, hms_lincs_id):
        """Return the entry for a full or short HMS LINCS ID, or None.

        An ID containing a dash is treated as a full ID and looked up
        directly. An ID without a dash is treated as a short ID: the first
        key (in the resource's iteration order) that is either equal to it
        or starts with the short ID followed by a dash is used.

        Parameters
        ----------
        resource : dict
            One of the metadata tables, keyed by HMS LINCS ID.
        hms_lincs_id : str
            The full or short HMS LINCS ID to look up.

        Returns
        -------
        dict or None
            The matching entry, or None if nothing (or only an empty entry)
            was found.
        """
        if '-' in hms_lincs_id:
            # This means it's a full ID
            entry = resource.get(hms_lincs_id)
        else:
            # This means it's a short ID. Require the dash right after the
            # short ID so that e.g. '1000' cannot match '10001-101-1'.
            prefix = hms_lincs_id + '-'
            entry = next((val for key, val in resource.items()
                          if key == hms_lincs_id or key.startswith(prefix)),
                         None)
        if not entry:
            logger.debug('Couldn\'t find entry for %s', hms_lincs_id)
            return None
        return entry


def get_drug_target_data():
    """Load the csv into a list of dicts containing the LINCS drug target data.

    The data is fetched from the results page of dataset 20000 under
    LINCS_URL via load_lincs_csv.

    Returns
    -------
    data : list[dict]
        A list of dicts, each keyed based on the header of the csv, with values
        as the corresponding column values.
    """
    url = LINCS_URL + '/datasets/20000/results'
    return load_lincs_csv(url)


def _build_db_refs(lincs_id, data, **mappings):
    db_refs = {'HMS-LINCS': lincs_id}
    for db_ref, key in mappings.items():
        if data[key]:
            db_refs[db_ref.upper()] = data[key]
    return db_refs


def load_lincs_csv(url):
    """Helper function to turn csv rows into dicts.

    Parameters
    ----------
    url : str
        The URL to request; the csv output format is selected via the
        output_type query parameter.

    Returns
    -------
    list[dict]
        One dict per data row, keyed by the values of the first (header)
        row. An empty list is returned if the response contains no rows.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    """
    resp = requests.get(url, params={'output_type': '.csv'}, timeout=120)
    resp.raise_for_status()
    # Python 2 is given the raw bytes, Python 3 the decoded text
    if sys.version_info[0] < 3:
        csv_io = BytesIO(resp.content)
    else:
        csv_io = StringIO(resp.text)
    data_rows = list(read_unicode_csv_fileobj(csv_io, delimiter=','))
    # An empty body has no header row to index into
    if not data_rows:
        return []
    headers = data_rows[0]
    return [dict(zip(headers, line_elements))
            for line_elements in data_rows[1:]]


def load_lincs_extras(fname=None):
    """Load additional HMS LINCS small molecule entries from a TSV file.

    The first line of the file is treated as a header and skipped. Each
    remaining non-blank line is split on tabs into an HMS LINCS ID, a name
    and a ChEMBL ID. Rows with fewer than three columns (e.g. when a
    trailing empty ChEMBL column was stripped) are padded with empty
    strings instead of raising an IndexError.

    Parameters
    ----------
    fname : Optional[str]
        Path to the TSV file. If not given, hms_lincs_extra.tsv in the
        resources folder is used.

    Returns
    -------
    dict
        A dict keyed by HMS LINCS ID whose values are dicts with the keys
        'HMS LINCS ID', 'Name' and 'ChEMBL ID'.
    """
    if fname is None:
        fname = os.path.join(resources, 'hms_lincs_extra.tsv')
    extras = {}
    with open(fname, 'r') as fh:
        for idx, line in enumerate(fh):
            # The first line is the header
            if idx == 0:
                continue
            line = line.rstrip('\r\n')
            # Skip blank lines, such as a trailing empty line
            if not line.strip():
                continue
            cols = line.split('\t')
            # Pad short rows so that missing columns read as empty strings
            cols += [''] * (3 - len(cols))
            extras[cols[0]] = {'HMS LINCS ID': cols[0],
                               'Name': cols[1],
                               'ChEMBL ID': cols[2]}
    return extras


# HMS-LINCS-specific lookup table from outdated PubChem compound IDs that
# appear in HMS-LINCS to the preferred compound IDs. The same information can
# be obtained more generally via indra.databases.pubchem_client; it is
# pre-compiled here so that this client can do fast lookups.
pc_to_primary_mappings = {
    '23624255': '135564985',
    '10451420': '135465539',
    '10196499': '135398501',
    '57899889': '135564632',
    '53239990': '135564599',
    '71433937': '136240579',
    '53401173': '135539077',
    '71543332': '135398499',
    '5353940': '5169',
    '49830557': '135398510',
    '11258443': '135451019',
    '68925359': '135440466',
    '16750408': '135565545',
    '57347681': '135565635',
    '5357795': '92577',
    '56965966': '135398516',
    '24906282': '448949',
    '66524294': '135398492',
    '11696609': '135398495',
    '9549301': '135473382',
    '56965894': '135423438',
}