# Source code for indra.databases.cbio_client

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import pandas
import logging
import requests
from collections import defaultdict
# Python3
try:
    from functools import lru_cache
    from io import StringIO
# Python2
except ImportError:
    from functools32 import lru_cache
    from StringIO import StringIO


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Endpoint of the cBioPortal web service that send_request queries.
cbio_url = 'http://www.cbioportal.org/webservice.do'
# Study ID of the CCLE cell line study, used by the get_ccle_* helpers.
ccle_study = 'cellline_ccle_broad'


@lru_cache(maxsize=10000)
def send_request(**kwargs):
    """Return a data frame from a web service request to cBio portal.

    Sends a web service request to the cBio portal with the given keyword
    arguments as query parameters and returns a Pandas data frame on success.

    Results are memoized with an LRU cache keyed on the keyword arguments,
    so all values must be hashable. Repeated calls with the same arguments
    return the *same* DataFrame object; callers must therefore treat the
    result as read-only and never modify it in place.

    More information about the service here:
    http://www.cbioportal.org/web_api.jsp

    Parameters
    ----------
    kwargs : dict
        A dict of parameters for the query. Entries map directly to web service
        calls with the exception of the optional 'skiprows' entry, whose value
        is used as the number of rows to skip when reading the result data
        frame. A value of -1 means that the leading run of lines starting
        with '#' is counted and skipped.

    Returns
    -------
    df : pandas.DataFrame or None
        Response from cBioPortal as a Pandas DataFrame, or None if the
        request did not return with status code 200 (an error is logged).
    """
    skiprows = kwargs.pop('skiprows', None)
    res = requests.get(cbio_url, params=kwargs)
    if res.status_code != 200:
        logger.error('Request returned with code %d', res.status_code)
        return None
    # Adaptively skip rows based on number of leading comment lines
    if skiprows == -1:
        skiprows = 0
        for line in res.text.split('\n'):
            if not line.startswith('#'):
                break
            skiprows += 1
    return pandas.read_csv(StringIO(res.text), sep='\t', skiprows=skiprows)


def get_mutations(study_id, gene_list, mutation_type=None,
                  case_id=None):
    """Return mutations as a list of genes and list of amino acid changes.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'cellline_ccle_broad' or 'paad_icgc'
    gene_list : list[str]
        A list of genes with their HGNC symbols.
        Example: ['BRAF', 'KRAS']
    mutation_type : Optional[str]
        The type of mutation to filter to.
        mutation_type can be one of: missense, nonsense, frame_shift_ins,
        frame_shift_del, splice_site
    case_id : Optional[str]
        The case ID within the study to filter to.

    Returns
    -------
    mutations : dict[str, list]
        A dict with two keys: 'gene_symbol' maps to a list of genes and
        'amino_acid_change' maps to the list of amino acid changes in those
        genes. The two lists are aligned element by element. Both lists are
        empty if the study has no mutation profile or the request failed.
    """
    genetic_profiles = get_genetic_profiles(study_id, 'mutation')
    if not genetic_profiles:
        logger.error('No mutation profile found for study %s', study_id)
        return {'gene_symbol': [], 'amino_acid_change': []}

    data = {'cmd': 'getMutationData',
            'case_set_id': study_id,
            'genetic_profile_id': genetic_profiles[0],
            'gene_list': ','.join(gene_list),
            'skiprows': -1}
    df = send_request(**data)
    if df is None:
        # send_request has already logged the failed request
        return {'gene_symbol': [], 'amino_acid_change': []}
    if case_id:
        df = df[df['case_id'] == case_id]
    res = _filter_data_frame(df, ['gene_symbol', 'amino_acid_change'],
                             'mutation_type', mutation_type)
    mutations = {'gene_symbol': list(res['gene_symbol'].values()),
                 'amino_acid_change': list(res['amino_acid_change'].values())}
    return mutations


def get_case_lists(study_id):
    """Return a list of the case set ids for a particular study.

    TAKE NOTE the "case_list_id" are the same thing as "case_set_id"
    Within the data, this string is referred to as a "case_list_id".
    Within API calls it is referred to as a 'case_set_id'.
    The documentation does not make this explicitly clear.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'cellline_ccle_broad' or 'paad_icgc'

    Returns
    -------
    case_set_ids : list
        The values of the 'case_list_id' column of the response, one per
        case list of the study. Empty if the request failed.
    """
    data = {'cmd': 'getCaseLists',
            'cancer_study_id': study_id}
    df = send_request(**data)
    if df is None:
        # send_request has already logged the failed request
        return []
    return df['case_list_id'].tolist()


def get_profile_data(study_id, gene_list,
                     profile_filter, case_set_filter=None):
    """Return dict of cases and genes and their respective values.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'cellline_ccle_broad' or 'paad_icgc'
    gene_list : list[str]
        A list of genes with their HGNC symbols.
        Example: ['BRAF', 'KRAS']
    profile_filter : str
        A string used to filter the profiles to return. Will be one of:
        - MUTATION
        - MUTATION_EXTENDED
        - COPY_NUMBER_ALTERATION
        - MRNA_EXPRESSION
        - METHYLATION
    case_set_filter : Optional[str]
        A string that specifies which case_set_id to use, based on a complete
        or partial match (the first matching case set is used). If not
        provided, will look for study_id + '_all'

    Returns
    -------
    profile_data : dict[dict[int]]
        A dict keyed to cases containing a dict keyed to genes
        containing int. Genes without a value for a case map to None.
        An empty dict is returned if no genetic profile matches
        profile_filter, no case set matches case_set_filter, or the
        request failed.
    """
    genetic_profiles = get_genetic_profiles(study_id, profile_filter)
    if not genetic_profiles:
        return {}
    genetic_profile = genetic_profiles[0]

    if case_set_filter:
        case_set_ids = get_case_lists(study_id)
        case_set_id = next((x for x in case_set_ids
                            if case_set_filter in x), None)
        if case_set_id is None:
            logger.error('No case set matching %s found for study %s',
                         case_set_filter, study_id)
            return {}
    else:
        # based on looking at the cBioPortal, this is a common case_set_id
        case_set_id = study_id + '_all'

    data = {'cmd': 'getProfileData',
            'case_set_id': case_set_id,
            'genetic_profile_id': genetic_profile,
            'gene_list': ','.join(gene_list),
            'skiprows': -1}
    df = send_request(**data)
    if df is None:
        # send_request has already logged the failed request
        return {}

    # Every column other than the two gene identifier columns is a case
    case_list_df = [x for x in df.columns.tolist()
                    if x not in ['GENE_ID', 'COMMON']]
    profile_data = {case: {g: None for g in gene_list}
                    for case in case_list_df}
    # The gene column is the same for every case, so read it only once
    df_gene_list = df['COMMON'].tolist()
    for case in case_list_df:
        for g, cv in zip(df_gene_list, df[case].tolist()):
            if not pandas.isnull(cv):
                profile_data[case][g] = cv
    return profile_data


def get_num_sequenced(study_id):
    """Return number of sequenced tumors for given study.

    This is useful for calculating mutation statistics in terms of the
    prevalence of certain mutations within a type of cancer.

    The count is taken from the first case list of the study whose
    'case_list_id' contains 'sequenced' (case-insensitive).

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'paad_icgc'

    Returns
    -------
    num_case : int
        The number of sequenced tumors in the given study. 0 is returned if
        the request failed, the study has no case lists, or none of its case
        lists is a 'sequenced' one.
    """
    data = {'cmd': 'getCaseLists',
            'cancer_study_id': study_id}
    df = send_request(**data)
    if df is None or df.empty:
        return 0
    # na=False keeps the mask strictly boolean when an ID is missing
    row_filter = df['case_list_id'].str.contains('sequenced', case=False,
                                                 na=False)
    sequenced_case_ids = df[row_filter]['case_ids'].tolist()
    if not sequenced_case_ids:
        return 0
    # Whitespace-separated case IDs; split() ignores stray/trailing blanks
    return len(sequenced_case_ids[0].split())


def get_genetic_profiles(study_id, profile_filter=None):
    """Return all the genetic profiles (data sets) for a given study.

    Genetic profiles are different types of data for a given study. For
    instance the study 'cellline_ccle_broad' has profiles such as
    'cellline_ccle_broad_mutations' for mutations, 'cellline_ccle_broad_CNA'
    for copy number alterations, etc.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'paad_icgc'
    profile_filter : Optional[str]
        A string used to filter the profiles to return. It is matched
        case-insensitively against the 'genetic_alteration_type' of each
        profile, so a partial value such as "mutation" also works.
        Typical alteration types are:
        - MUTATION
        - MUTATION_EXTENDED
        - COPY_NUMBER_ALTERATION
        - MRNA_EXPRESSION
        - METHYLATION
        If not given, all profiles of the study are returned.

    Returns
    -------
    genetic_profiles : list[str]
        A list of genetic profiles available for the given study.
    """
    data = {'cmd': 'getGeneticProfiles',
            'cancer_study_id': study_id}
    df = send_request(**data)
    filtered = _filter_data_frame(df, ['genetic_profile_id'],
                                  'genetic_alteration_type', profile_filter)
    return list(filtered['genetic_profile_id'].values())


def get_cancer_studies(study_filter=None):
    """Return a list of cancer study identifiers, optionally filtered.

    There are typically multiple studies for a given type of cancer and
    a filter can be used to constrain the returned list.

    Parameters
    ----------
    study_filter : Optional[str]
        A string used to filter the study IDs to return; it is matched
        case-insensitively against the study IDs. Example: "paad"

    Returns
    -------
    study_ids : list[str]
        A list of study IDs.
        For instance "paad" as a filter would result in a list
        of study IDs with paad in their name like "paad_icgc", "paad_tcga",
        etc.
    """
    df = send_request(cmd='getCancerStudies')
    filtered = _filter_data_frame(df, ['cancer_study_id'],
                                  'cancer_study_id', study_filter)
    return list(filtered['cancer_study_id'].values())


def get_cancer_types(cancer_filter=None):
    """Return a list of cancer types, optionally filtered.

    Parameters
    ----------
    cancer_filter : Optional[str]
        A string used to filter cancer types. Its value is the name or
        part of the name of a type of cancer, matched case-insensitively.
        Example: "melanoma", "pancreatic", "non-small cell lung"

    Returns
    -------
    type_ids : list[str]
        A list of cancer types matching the filter.
        Example: for cancer_filter="pancreatic", the result includes
        "panet" (neuro-endocrine) and "paad" (adenocarcinoma)
    """
    df = send_request(cmd='getTypesOfCancer')
    filtered = _filter_data_frame(df, ['type_of_cancer_id'], 'name',
                                  cancer_filter)
    return list(filtered['type_of_cancer_id'].values())


def get_ccle_mutations(gene_list, cell_lines, mutation_type=None):
    """Return a dict of mutations in given genes and cell lines from CCLE.

    This is a specialized call to get_mutations tailored to CCLE cell lines.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols to get mutations in
    cell_lines : list[str]
        A list of CCLE cell line names to get mutations for.
    mutation_type : Optional[str]
        The type of mutation to filter to.
        mutation_type can be one of: missense, nonsense, frame_shift_ins,
        frame_shift_del, splice_site

    Returns
    -------
    mutations : dict
        The result from cBioPortal as a dict in the format
        {cell_line : {gene : [mutation1, mutation2, ...] }}
        Genes without any reported mutation map to an empty list.

        Example:
        {'LOXIMVI_SKIN': {'BRAF': ['V600E', 'I208V']},
        'SKMEL30_SKIN': {'BRAF': ['D287H', 'E275K']}}
    """
    mutations = {cl: {g: [] for g in gene_list} for cl in cell_lines}
    for cell_line in cell_lines:
        mutations_cl = get_mutations(ccle_study, gene_list,
                                     mutation_type=mutation_type,
                                     case_id=cell_line)
        per_gene = mutations[cell_line]
        # The two lists are aligned: one (gene, change) pair per mutation
        for gene, aa_change in zip(mutations_cl['gene_symbol'],
                                   mutations_cl['amino_acid_change']):
            per_gene[gene].append(str(aa_change))
    return mutations


def get_ccle_lines_for_mutation(gene, amino_acid_change):
    """Return cell lines with a given point mutation in a given gene.

    Checks which cell lines in CCLE have a particular point mutation
    in a given gene and return their names in a list.

    Parameters
    ----------
    gene : str
        The HGNC symbol of the mutated gene in whose product the amino
        acid change occurs. Example: "BRAF"
    amino_acid_change : str
        The amino acid change of interest. Example: "V600E"

    Returns
    -------
    cell_lines : list
        A list of the distinct CCLE cell lines in which the given mutation
        occurs.
    """
    # NOTE(review): the other getMutationData/getProfileData callers in this
    # module pass skiprows=-1 (adaptive comment skipping); confirm that a
    # fixed single header line is intended here.
    data = {'cmd': 'getMutationData',
            'case_set_id': ccle_study,
            'genetic_profile_id': ccle_study + '_mutations',
            'gene_list': gene,
            'skiprows': 1}
    df = send_request(**data)
    matching = df[df['amino_acid_change'] == amino_acid_change]
    return matching['case_id'].unique().tolist()


def get_ccle_cna(gene_list, cell_lines):
    """Return a dict of CNAs in given genes and cell lines from CCLE.

    CNA values correspond to the following alterations

    -2 = homozygous deletion

    -1 = hemizygous deletion

    0 = neutral / no change

    1 = gain

    2 = high level amplification

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols to get CNAs for.
    cell_lines : list[str]
        A list of CCLE cell line names to get CNAs for.

    Returns
    -------
    profile_data : dict[dict[int]]
        A dict keyed to cases containing a dict keyed to genes
        containing int. Only cell lines that are both requested and
        present in the CCLE data appear as keys.
    """
    profile_data = get_profile_data(ccle_study, gene_list,
                                    'COPY_NUMBER_ALTERATION', 'all')
    # Build the lookup set once instead of scanning the list per case
    requested = set(cell_lines)
    profile_data = dict((key, value) for key, value in profile_data.items()
                        if key in requested)
    return profile_data


def get_ccle_mrna(gene_list, cell_lines):
    """Return a dict of mRNA amounts in given genes and cell lines from CCLE.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols to get mRNA amounts for.
    cell_lines : list[str]
        A list of CCLE cell line names to get mRNA amounts for.

    Returns
    -------
    mrna_amounts : dict[dict[float]]
        A dict keyed to cell lines containing a dict keyed to genes
        containing float. A gene maps to None if the response has no value
        for it in that cell line. A cell line maps to None (instead of a
        dict) if it does not appear in the response or the request failed.
    """
    data = {'cmd': 'getProfileData',
            'case_set_id': ccle_study + '_mrna',
            'genetic_profile_id': ccle_study + '_mrna',
            'gene_list': ','.join(gene_list),
            'skiprows': -1}
    df = send_request(**data)
    if df is None:
        # send_request has already logged the failed request
        return {cl: None for cl in cell_lines}

    mrna_amounts = {}
    for cell_line in cell_lines:
        if cell_line not in df.columns:
            mrna_amounts[cell_line] = None
            continue
        amounts = {}
        for gene in gene_list:
            # Rows of this cell line's column that belong to the gene
            value_cell = df[cell_line][df['COMMON'] == gene]
            if value_cell.empty or pandas.isnull(value_cell.values[0]):
                amounts[gene] = None
            else:
                amounts[gene] = value_cell.values[0]
        mrna_amounts[cell_line] = amounts
    return mrna_amounts


def _filter_data_frame(df, data_col, filter_col, filter_str=None):
    """Return a filtered data frame as a dictionary."""
    if filter_str is not None:
        relevant_cols = data_col + [filter_col]
        df.dropna(inplace=True, subset=relevant_cols)
        row_filter = df[filter_col].str.contains(filter_str, case=False)
        data_list = df[row_filter][data_col].to_dict()
    else:
        data_list = df[data_col].to_dict()
    return data_list


# The helpers below are deactivated for the time being by wrapping them in a
# string literal; they can be reinstated once they are fully integrated.
'''

def _read_ccle_cna():
    fname = os.path.dirname(os.path.abspath(__file__)) + \
        '/../../data/ccle_CNA.txt'
    try:
        df = pandas.read_csv(fname, sep='\t')
    except Exception:
        df = None
    return df

ccle_cna_df = _read_ccle_cna()


def _read_ccle_mrna():
    fname = os.path.dirname(os.path.abspath(__file__)) + \
        '/../../data/ccle_expression_median.txt'
    try:
        df = pandas.read_csv(fname, sep='\t')
    except Exception:
        df = None
    return df

ccle_mrna_df = _read_ccle_mrna()


def _read_ccle_mutations():
    fname = os.path.dirname(os.path.abspath(__file__)) + \
        '/../../data/ccle_mutations_extended.txt'
    try:
        df = pandas.read_csv(fname, sep='\t', skiprows=2)
    except Exception:
        df = None
    return df

ccle_mutations_df = _read_ccle_mutations()
'''