Source code for indra.databases.lincs_client

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str

__all__ = ['get_drug_target_data', 'LincsClient', 'load_lincs_csv']

import os
import sys
import json
import logging
import requests
from io import StringIO, BytesIO
from indra.util import read_unicode_csv_fileobj
from indra.databases.identifiers import ensure_chembl_prefix


logger = logging.getLogger(__name__)


LINCS_URL = 'http://lincs.hms.harvard.edu/db'


resources = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                         os.path.pardir, 'resources')
lincs_sm = os.path.join(resources, 'lincs_small_molecules.json')
lincs_prot = os.path.join(resources, 'lincs_proteins.json')


[docs]class LincsClient(object): """Client for querying LINCS small molecules and proteins.""" def __init__(self): with open(lincs_sm, 'r') as fh: self._sm_data = json.load(fh) extra_sm_data = load_lincs_extras() self._sm_data.update(extra_sm_data) with open(lincs_prot, 'r') as fh: self._prot_data = json.load(fh)
[docs] def get_small_molecule_name(self, hms_lincs_id): """Get the name of a small molecule from the LINCS sm metadata. Parameters ---------- hms_lincs_id : str The HMS LINCS ID of the small molecule. Returns ------- str The name of the small molecule. """ entry = self._get_entry_by_id(self._sm_data, hms_lincs_id) if not entry: return None name = entry['Name'] return name
[docs] def get_small_molecule_refs(self, hms_lincs_id): """Get the id refs of a small molecule from the LINCS sm metadata. Parameters ---------- hms_lincs_id : str The HMS LINCS ID of the small molecule. Returns ------- dict A dictionary of references. """ refs = {'HMS-LINCS': hms_lincs_id} entry = self._get_entry_by_id(self._sm_data, hms_lincs_id) # If there is no entry for this ID if not entry: return refs # If there is an entry then fill up the refs with existing values mappings = dict(chembl='ChEMBL ID', chebi='ChEBI ID', pubchem='PubChem CID', lincs='LINCS ID') for k, v in mappings.items(): if entry.get(v): key = k.upper() value = entry[v] # Swap in primary PubChem IDs where there is an outdated one if key == 'PUBCHEM' and value in pc_to_primary_mappings: value = pc_to_primary_mappings[value] # Fix CHEMBL IDs if key == 'CHEMBL': value = ensure_chembl_prefix(value) refs[key] = value return refs
[docs] def get_protein_refs(self, hms_lincs_id): """Get the refs for a protein from the LINCs protein metadata. Parameters ---------- hms_lincs_id : str The HMS LINCS ID for the protein Returns ------- dict A dictionary of protein references. """ # TODO: We could get phosphorylation states from the protein data. refs = {'HMS-LINCS': hms_lincs_id} entry = self._get_entry_by_id(self._prot_data, hms_lincs_id) # If there is no entry for this ID if not entry: return refs mappings = dict(egid='Gene ID', up='UniProt ID') for k, v in mappings.items(): if entry.get(v): refs[k.upper()] = entry.get(v) return refs
def _get_entry_by_id(self, resource, hms_lincs_id): # This means it's a short ID if '-' not in hms_lincs_id: keys = [k for k in resource.keys() if k.startswith(hms_lincs_id)] if not keys: logger.debug('Couldn\'t find entry for %s' % hms_lincs_id) return None entry = resource[keys[0]] # This means it's a full ID else: entry = resource.get(hms_lincs_id) if not entry: logger.debug('Couldn\'t find entry for %s' % hms_lincs_id) return None return entry
[docs]def get_drug_target_data(): """Load the csv into a list of dicts containing the LINCS drug target data. Returns ------- data : list[dict] A list of dicts, each keyed based on the header of the csv, with values as the corresponding column values. """ url = LINCS_URL + '/datasets/20000/results' return load_lincs_csv(url)
def _build_db_refs(lincs_id, data, **mappings): db_refs = {'HMS-LINCS': lincs_id} for db_ref, key in mappings.items(): if data[key]: db_refs[db_ref.upper()] = data[key] return db_refs
[docs]def load_lincs_csv(url): """Helper function to turn csv rows into dicts.""" resp = requests.get(url, params={'output_type': '.csv'}, timeout=120) resp.raise_for_status() if sys.version_info[0] < 3: csv_io = BytesIO(resp.content) else: csv_io = StringIO(resp.text) data_rows = list(read_unicode_csv_fileobj(csv_io, delimiter=',')) headers = data_rows[0] return [{header: val for header, val in zip(headers, line_elements)} for line_elements in data_rows[1:]]
def load_lincs_extras(): fname = os.path.join(resources, 'hms_lincs_extra.tsv') with open(fname, 'r') as fh: rows = [line.strip('\n').split('\t') for line in fh.readlines()] return {r[0]: {'HMS LINCS ID': r[0], 'Name': r[1], 'ChEMBL ID': r[2] if r[2] else ''} for r in rows[1:]} # This is a set of mappings specific to HMS-LINCS that map outdated compound # IDs appearing in HMS-LINCS to preferred compound IDs. This can be obtained # more generally via indra.databases.pubchem_client, but this is a pre-compiled # version here for fast lookups in this client. pc_to_primary_mappings = \ {'23624255': '135564985', '10451420': '135465539', '10196499': '135398501', '57899889': '135564632', '53239990': '135564599', '71433937': '136240579', '53401173': '135539077', '71543332': '135398499', '5353940': '5169', '49830557': '135398510', '11258443': '135451019', '68925359': '135440466', '16750408': '135565545', '57347681': '135565635', '5357795': '92577', '56965966': '135398516', '24906282': '448949', '66524294': '135398492', '11696609': '135398495', '9549301': '135473382', '56965894': '135423438', }