Source code for indra.sources.tas.api

__all__ = ['process_csv', 'process_from_web']

import csv
import logging
import requests
from hashlib import md5

from .processor import TasProcessor
from indra.util import read_unicode_csv

# URL of the TAS CSV dump that process_from_web downloads.
tas_data_url = 'https://bigmech.s3.amazonaws.com/indra-db/tas.csv'
# Expected MD5 hex digest of the downloaded data; process_from_web compares it
# against the digest of the response text re-encoded as UTF-8.
tas_resource_md5 = '554ccba4617aae7b3b06a62893424c7f'


# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def _load_data(data_iter):
    # Get the headers.
    headers = data_iter[0]

    # For some reason this heading is oddly formatted and inconsistent with the
    # rest, or with the usual key-style for dicts.
    data = [{header: val for header, val in zip(headers, line)}
            for line in data_iter[1:]]
    return data



[docs]
def process_from_web(affinity_class_limit=2, named_only=False,
                     standardized_only=False):
    """Return a TasProcessor for the contents of the TAS dump online.

    Interactions are classified into the following classes based on affinity:
      | 1  -- Kd < 100nM
      | 2  -- 100nM < Kd < 1uM
      | 3  -- 1uM < Kd < 10uM
      | 10 -- Kd > 10uM
    By default, only classes 1 and 2 are extracted but the affinity_class_limit
    parameter can be used to change the upper limit of extracted classes.

    Parameters
    ----------
    affinity_class_limit : Optional[int]
        Defines the highest class of binding affinity that is included in the
        extractions. Default: 2
    named_only : Optional[bool]
        If True, only chemicals that have a name assigned in some name space
        (including ones that aren't fully standardized per INDRA's ontology,
        e.g., CHEMBL1234) are included. If False, chemicals whose name is
        assigned based on an ID (e.g., CHEMBL) rather than an actual name are
        also included. Default: False
    standardized_only : Optional[bool]
        If True, only chemicals that are fully standardized per INDRA's
        ontology (i.e., they have grounding appearing in one of the
        default_ns_order name spaces, and consequently have any
        groundings and their name standardized) are extracted.
        Default: False

    Returns
    -------
    TasProcessor
        A TasProcessor object which has a list of INDRA Statements extracted
        from the CSV file representing drug-target inhibitions in its
        statements attribute.

    Raises
    ------
    requests.HTTPError
        If the download request returns an HTTP error status.
    RuntimeError
        If the md5 checksum of the downloaded data does not match the
        expected value.
    """
    logger.info('Downloading TAS data from %s', tas_data_url)
    res = requests.get(tas_data_url)
    # Check the HTTP status before anything else: hashing the body of an
    # error response would otherwise report a misleading checksum mismatch
    # instead of the actual HTTP failure.
    res.raise_for_status()
    logger.info('Finished downloading TAS data from %s', tas_data_url)

    logger.info('Verifying md5 checksum of data')
    # The digest is taken over the decoded text re-encoded as UTF-8, which is
    # what the expected checksum is compared against.
    observed_checksum = md5(res.text.encode('utf-8')).hexdigest()
    if tas_resource_md5 != observed_checksum:
        raise RuntimeError('Checksum for downloaded TAS data does not'
                           ' match expected value')

    data_iter = list(csv.reader(res.text.splitlines(), delimiter=','))
    return TasProcessor(_load_data(data_iter),
                        affinity_class_limit=affinity_class_limit,
                        named_only=named_only,
                        standardized_only=standardized_only)




[docs]
def process_csv(fname, affinity_class_limit=2, named_only=False,
                standardized_only=False):
    """Build a TasProcessor from a local CSV file of TAS data.

    Each interaction falls into one of these affinity classes:
      | 1  -- Kd < 100nM
      | 2  -- 100nM < Kd < 1uM
      | 3  -- 1uM < Kd < 10uM
      | 10 -- Kd > 10uM
    Only classes 1 and 2 are extracted by default; raise or lower the upper
    bound with the affinity_class_limit parameter.

    Parameters
    ----------
    fname : str
        Path of the local CSV file holding the TAS data.
    affinity_class_limit : Optional[int]
        The highest binding affinity class to include in the extractions.
        Default: 2
    named_only : Optional[bool]
        If True, restrict to chemicals that have a name assigned in some
        name space (this includes ones that are not fully standardized per
        INDRA's ontology, e.g., CHEMBL1234). If False, chemicals named after
        an ID (e.g., CHEMBL) instead of an actual name are included as well.
        Default: False
    standardized_only : Optional[bool]
        If True, restrict to chemicals that are fully standardized per
        INDRA's ontology (i.e., they have a grounding in one of the
        default_ns_order name spaces, and consequently have their groundings
        and name standardized). Default: False

    Returns
    -------
    TasProcessor
        A TasProcessor object whose statements attribute holds the list of
        INDRA Statements extracted from the CSV file, representing
        drug-target inhibitions.
    """
    # Read the whole file up front so the header row can be split off.
    rows = list(read_unicode_csv(fname))
    records = _load_data(rows)
    processor = TasProcessor(
        records,
        affinity_class_limit=affinity_class_limit,
        named_only=named_only,
        standardized_only=standardized_only,
    )
    return processor