# Source code for indra.databases.cbio_client

"""This is a client for the cBioPortal web service, with
documentation at https://docs.cbioportal.org/web-api-and-clients/
and Swagger definition at https://www.cbioportal.org/api/v2/api-docs.
Note that the client implements direct requests to the API instead of
adding an additional dependency to do so.
"""
# Names exported by ``from indra.databases.cbio_client import *``
__all__ = ["get_mutations", "get_case_lists", "get_profile_data",
           "get_num_sequenced", "get_genetic_profiles",
           "get_cancer_studies", "get_cancer_types", "get_ccle_mutations",
           "get_ccle_lines_for_mutation", "get_ccle_cna",
           "get_ccle_mrna"]

import json
import logging
import requests
from functools import lru_cache
from indra.databases import hgnc_client


# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Base URL that every endpoint passed to send_request is appended to
cbio_url = 'https://www.cbioportal.org/api'
# Study ID used by the CCLE-specific helper functions in this module
ccle_study = 'cellline_ccle_broad'


def send_request(method, endpoint, json_data=None):
    """Return the results of a web service request to cBio portal.

    Sends a web service request to the cBio portal with a specific endpoint,
    method, and JSON data structure, and returns the resulting JSON
    data structure on success. Results are cached, so repeating a request
    with the same method, endpoint and data does not hit the web service
    again.

    More information about the service is available here:
    https://www.cbioportal.org/api/v2/api-docs

    Parameters
    ----------
    method : str
        The HTTP method to use for the request.
        Example: 'get' or 'post'
    endpoint : str
        The endpoint to use for the request.
        Example: 'studies'
    json_data : Optional[Dict]
        The dict-like JSON data structure to send with the request.

    Returns
    -------
    JSON
        The JSON object returned by the web service call, or None if the
        request did not return with status code 200.
    """
    # The cached function needs hashable arguments, so the payload is passed
    # along as a string. Keys are sorted so that equal payloads always
    # produce the same cache key regardless of dict insertion order.
    if json_data:
        json_data_str = json.dumps(json_data, sort_keys=True)
    else:
        json_data_str = None
    return _send_request_cached(method, endpoint, json_data_str)


class _RequestFailed(Exception):
    """Internal signal that a request failed and must not be cached."""


@lru_cache(maxsize=1000)
def _fetch_json_cached(method, endpoint, json_data_str):
    """Run the request and return its JSON payload, caching successes only.

    A failed request raises instead of returning, which keeps the failure
    out of the cache so that a later identical call can try again.
    """
    if endpoint.startswith('/'):
        endpoint = endpoint[1:]
    json_data = json.loads(json_data_str) if json_data_str else {}
    # Look up requests.get, requests.post, ... from the method name
    request_fun = getattr(requests, method)
    full_url = cbio_url + '/' + endpoint
    res = request_fun(full_url, json=json_data)
    if res.status_code != 200:
        raise _RequestFailed(f'Request returned with code {res.status_code}: '
                             f'{res.text}')
    return res.json()


def _send_request_cached(method, endpoint, json_data_str=None):
    """The actual function running the request, using caching

    Parameters
    ----------
    method : str
        Name of the function in the requests module to call, e.g., 'get'.
    endpoint : str
        The endpoint relative to the base URL; one leading slash is ignored.
    json_data_str : Optional[str]
        The JSON payload serialized as a string, or None for no payload.

    Returns
    -------
    JSON
        The decoded JSON response, or None (after logging an error) if the
        response status code was not 200. Only successful responses are
        cached, so a temporary failure is retried on the next call.
    """
    try:
        return _fetch_json_cached(method, endpoint, json_data_str)
    except _RequestFailed as err:
        logger.error('%s', err)
        return None


# Keep the cache management hooks that lru_cache exposes available under
# the original function name.
_send_request_cached.cache_clear = _fetch_json_cached.cache_clear
_send_request_cached.cache_info = _fetch_json_cached.cache_info


def get_mutations(study_id, gene_list=None, mutation_type=None,
                  case_id=None):
    """Return mutations as a list of genes and list of amino acid changes.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'cellline_ccle_broad' or 'paad_icgc'
    gene_list : list[str]
        A list of genes with their HGNC symbols.
        Example: ['BRAF', 'KRAS']
    mutation_type : Optional[str]
        The type of mutation to filter to.
        mutation_type can be one of: missense, nonsense, frame_shift_ins,
        frame_shift_del, splice_site
    case_id : Optional[str]
        The case ID within the study to filter to.

    Returns
    -------
    mutations : dict
        A dict with three parallel lists under the keys 'gene_symbol',
        'amino_acid_change' and 'sample_id', with one entry per mutation.
        All three lists are empty if the study has no mutation profile.
    """
    genetic_profiles = get_genetic_profiles(study_id, 'mutation')
    if not genetic_profiles:
        # Same convention as get_profile_data: a study without a matching
        # profile yields an empty result rather than an IndexError.
        return {'gene_symbol': [], 'amino_acid_change': [], 'sample_id': []}
    genetic_profile = genetic_profiles[0]

    entrez_to_gene_symbol = get_entrez_mappings(gene_list)
    entrez_ids = list(entrez_to_gene_symbol)
    # Does this need to be parameterized?
    case_set_id = study_id + '_all'

    mutations = send_request('post',
                             f'molecular-profiles/{genetic_profile}/'
                             f'mutations/fetch',
                             {'sampleListId': case_set_id,
                              'entrezGeneIds': entrez_ids})

    if case_id:
        mutations = [m for m in mutations if m['sampleId'] == case_id]

    if mutation_type:
        # Case-insensitive substring match against the reported type
        mutations = [m for m in mutations if
                     (mutation_type.casefold()
                      in m['mutationType'].casefold())]

    mutations_dict = {
        # The mapping is keyed by Entrez IDs as strings
        'gene_symbol': [entrez_to_gene_symbol[str(m['entrezGeneId'])]
                        for m in mutations],
        'amino_acid_change': [m['proteinChange'] for m in mutations],
        'sample_id': [m['sampleId'] for m in mutations],
    }
    return mutations_dict
def get_entrez_mappings(gene_list):
    """Return a dict mapping Entrez IDs to the given HGNC gene symbols.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols. May be empty or None.

    Returns
    -------
    entrez_to_gene_symbol : dict
        A dict keyed by the Entrez ID found for each gene symbol, with the
        gene symbol as value. Symbols for which no HGNC ID or no Entrez ID
        is found are left out. Empty if gene_list is empty or None.
    """
    if not gene_list:
        return {}

    # Step 1: HGNC symbol -> HGNC ID -> Entrez ID
    symbol_to_entrez = {}
    for symbol in gene_list:
        hgnc_id = hgnc_client.get_hgnc_id(symbol)
        if hgnc_id is None:
            continue
        symbol_to_entrez[symbol] = hgnc_client.get_entrez_id(hgnc_id)

    # Step 2: invert the mapping so that results reported by Entrez ID
    # can be translated back into gene symbols
    entrez_to_gene_symbol = {}
    for symbol, entrez_id in symbol_to_entrez.items():
        if entrez_id is None or symbol is None:
            continue
        entrez_to_gene_symbol[entrez_id] = symbol
    return entrez_to_gene_symbol
def get_case_lists(study_id):
    """Return the IDs of all case sets defined for a study.

    Case sets are called sample lists in v2 of the API.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'cellline_ccle_broad' or 'paad_icgc'

    Returns
    -------
    case_set_ids : list[str]
        A list of case set IDs, e.g., ['cellline_ccle_broad_all',
        'cellline_ccle_broad_cna', ...]
    """
    endpoint = f'studies/{study_id}/sample-lists'
    sample_lists = send_request('get', endpoint)
    return [entry['sampleListId'] for entry in sample_lists]
def get_profile_data(study_id, gene_list, profile_filter,
                     case_set_filter=None):
    """Return dict of cases and genes and their respective values.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'cellline_ccle_broad' or 'paad_icgc'
    gene_list : list[str]
        A list of genes with their HGNC symbols.
        Example: ['BRAF', 'KRAS']
    profile_filter : str
        A string used to filter the profiles to return. Will be one of:
        - MUTATION
        - MUTATION_EXTENDED
        - COPY_NUMBER_ALTERATION
        - MRNA_EXPRESSION
        - METHYLATION
    case_set_filter : Optional[str]
        A string that specifies which case_set_id to use, based on a complete
        or partial match. A complete match is preferred; otherwise the first
        case set ID containing the string is used. If not provided, will
        look for study_id + '_all'

    Returns
    -------
    profile_data : dict[dict[int]]
        A dict keyed to cases (cell lines if using CCLE) in turn containing
        a dict keyed by genes, with values corresponding to the given
        profile (e.g., CNA, mutations). Empty if the study has no profile
        matching profile_filter or no case set matching case_set_filter.
    """
    genetic_profiles = get_genetic_profiles(study_id, profile_filter)
    if not genetic_profiles:
        return {}
    genetic_profile = genetic_profiles[0]

    if case_set_filter:
        # The case lists are only needed (and only requested) when there
        # is a filter to resolve against them.
        case_set_ids = get_case_lists(study_id)
        if case_set_filter in case_set_ids:
            case_set_id = case_set_filter
        else:
            partial_matches = [cs for cs in case_set_ids
                               if case_set_filter in cs]
            if not partial_matches:
                logger.warning('No case set matching %r found for study %s',
                               case_set_filter, study_id)
                return {}
            case_set_id = partial_matches[0]
    else:
        # based on looking at the cBioPortal, this is a common case_set_id
        case_set_id = study_id + '_all'

    entrez_to_gene_symbol = get_entrez_mappings(gene_list)
    entrez_ids = list(entrez_to_gene_symbol)
    res = send_request('post',
                       f'molecular-profiles/{genetic_profile}/'
                       f'molecular-data/fetch',
                       {'sampleListId': case_set_id,
                        'entrezGeneIds': entrez_ids})

    profile_data = {}
    # Each entry in the results contains something like
    # {'entrezGeneId': 673, 'molecularProfileId': 'cellline_ccle_broad_cna',
    #  'sampleId': '1321N1_CENTRAL_NERVOUS_SYSTEM',
    #  'studyId': 'cellline_ccle_broad', 'value': 1, ...}
    for sample in res:
        values_by_gene = profile_data.setdefault(sample['sampleId'], {})
        gene_symbol = entrez_to_gene_symbol[str(sample['entrezGeneId'])]
        values_by_gene[gene_symbol] = sample['value']
    return profile_data
def get_num_sequenced(study_id):
    """Return number of sequenced tumors for given study.

    This is useful for calculating mutation statistics in terms of the
    prevalence of certain mutations within a type of cancer.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'paad_icgc'

    Returns
    -------
    num_case : int
        The number of sequenced tumors in the given study
    """
    sequenced_samples = set()
    for case_list_id in get_case_lists(study_id):
        # Only case lists with 'sequenced' in their ID are counted
        if 'sequenced' not in case_list_id:
            continue
        sample_ids = send_request('get',
                                  f'/sample-lists/{case_list_id}/sample-ids')
        # A set is used so that a sample appearing in several lists
        # is counted once
        sequenced_samples.update(sample_ids)
    return len(sequenced_samples)
def get_genetic_profiles(study_id, profile_filter=None):
    """Return all the genetic profiles (data sets) for a given study.

    Genetic profiles are different types of data for a given study. For
    instance the study 'cellline_ccle_broad' has profiles such as
    'cellline_ccle_broad_mutations' for mutations, 'cellline_ccle_broad_CNA'
    for copy number alterations, etc.

    NOTE: In the v2 API, the genetic profiles are called molecular profiles.

    Parameters
    ----------
    study_id : str
        The ID of the cBio study.
        Example: 'paad_icgc'
    profile_filter : Optional[str]
        A string used to filter the profiles to return. It is matched
        case insensitively as a substring of each profile's molecular
        alteration type, which will be one of:
        - MUTATION
        - MUTATION_EXTENDED
        - COPY_NUMBER_ALTERATION
        - MRNA_EXPRESSION
        - METHYLATION

    Returns
    -------
    genetic_profiles : list[str]
        A list of genetic profiles available for the given study.
    """
    profiles = send_request('get', f'studies/{study_id}/molecular-profiles')
    profile_ids = []
    for profile in profiles:
        if profile_filter:
            alteration_type = profile['molecularAlterationType']
            if profile_filter.casefold() not in alteration_type.casefold():
                continue
        profile_ids.append(profile['molecularProfileId'])
    return profile_ids
def get_cancer_studies(study_filter=None):
    """Return a list of cancer study identifiers, optionally filtered.

    There are typically multiple studies for a given type of cancer and
    a filter can be used to constrain the returned list.

    Parameters
    ----------
    study_filter : Optional[str]
        A string used to filter the study IDs to return. The match is
        a case insensitive substring match on the study ID.
        Example: "paad"

    Returns
    -------
    study_ids : list[str]
        A list of study IDs.
        For instance "paad" as a filter would result in a list
        of study IDs with paad in their name like "paad_icgc", "paad_tcga",
        etc.
    """
    all_study_ids = [study['studyId']
                     for study in send_request('get', 'studies')]
    if not study_filter:
        return all_study_ids
    return [sid for sid in all_study_ids
            if study_filter.casefold() in sid.casefold()]
def get_cancer_types(cancer_filter=None):
    """Return a list of cancer types, optionally filtered.

    Parameters
    ----------
    cancer_filter : Optional[str]
        A string used to filter cancer types. Its value is the name or
        part of the name of a type of cancer, matched case insensitively.
        Example: "melanoma", "pancreatic", "non-small cell lung"

    Returns
    -------
    type_ids : list[str]
        A list of cancer types matching the filter.
        Example: for cancer_filter="pancreatic", the result includes
        "panet" (neuro-endocrine) and "paad" (adenocarcinoma)
    """
    type_ids = []
    for cancer_type in send_request('get', 'cancer-types'):
        if cancer_filter:
            if cancer_filter.casefold() not in cancer_type['name'].casefold():
                continue
        type_ids.append(cancer_type['cancerTypeId'])
    return type_ids
def get_ccle_mutations(gene_list, cell_lines, mutation_type=None):
    """Return a dict of mutations in given genes and cell lines from CCLE.

    This is a specialized call to get_mutations tailored to CCLE cell lines.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols to get mutations in
    cell_lines : list[str]
        A list of CCLE cell line names to get mutations for.
    mutation_type : Optional[str]
        The type of mutation to filter to.
        mutation_type can be one of: missense, nonsense, frame_shift_ins,
        frame_shift_del, splice_site

    Returns
    -------
    mutations : dict
        The result from cBioPortal as a dict in the format
        {cell_line : {gene : [mutation1, mutation2, ...] }}
        Every requested cell line and gene has an entry, with an empty
        list if no mutation was found.

        Example:
        {'LOXIMVI_SKIN': {'BRAF': ['V600E', 'I208V']},
        'SKMEL30_SKIN': {'BRAF': ['D287H', 'E275K']}}
    """
    mutations = {cl: {g: [] for g in gene_list} for cl in cell_lines}
    if not mutations:
        # No cell lines requested, so there is nothing to look up
        return mutations

    # Get the mutations for the whole study once and distribute them by
    # sample, instead of fetching and re-filtering the full list for each
    # cell line. This also means a cell line listed more than once does
    # not get its mutations appended repeatedly.
    all_mutations = get_mutations(ccle_study, gene_list,
                                  mutation_type=mutation_type)
    for gene, aa_change, sample_id in zip(all_mutations['gene_symbol'],
                                          all_mutations['amino_acid_change'],
                                          all_mutations['sample_id']):
        if sample_id in mutations:
            mutations[sample_id][gene].append(str(aa_change))
    return mutations
def get_ccle_lines_for_mutation(gene, amino_acid_change):
    """Return cell lines with a given point mutation in a given gene.

    Looks up the missense mutations of the gene in the CCLE study and
    collects the names of the cell lines in which the amino acid change
    matches the one given.

    Parameters
    ----------
    gene : str
        The HGNC symbol of the mutated gene in whose product the amino
        acid change occurs. Example: "BRAF"
    amino_acid_change : str
        The amino acid change of interest. Example: "V600E"

    Returns
    -------
    cell_lines : list
        A sorted list of CCLE cell lines in which the given mutation occurs,
        without duplicates.
    """
    mutations = get_mutations(ccle_study, [gene], 'missense')
    matching_lines = set()
    pairs = zip(mutations['amino_acid_change'], mutations['sample_id'])
    for observed_change, sample_id in pairs:
        if observed_change == amino_acid_change:
            matching_lines.add(sample_id)
    return sorted(matching_lines)
def get_ccle_cna(gene_list, cell_lines=None):
    """Return a dict of CNAs in given genes and cell lines from CCLE.

    CNA values correspond to the following alterations

    -2 = homozygous deletion

    -1 = hemizygous deletion

    0 = neutral / no change

    1 = gain

    2 = high level amplification

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols to get CNAs for
    cell_lines : Optional[list[str]]
        A list of CCLE cell line names to get CNAs for. If not given,
        all cell lines with data are returned.

    Returns
    -------
    profile_data : dict[dict[int]]
        A dict keyed to cases containing a dict keyed to genes
        containing int
    """
    cna_by_cell_line = get_profile_data(ccle_study, gene_list,
                                        'COPY_NUMBER_ALTERATION', 'all')
    if cell_lines is None:
        # No restriction: hand back a copy of everything that was found
        return dict(cna_by_cell_line)

    selected = {}
    for cell_line, values in cna_by_cell_line.items():
        if cell_line in cell_lines:
            selected[cell_line] = values
    return selected
def get_ccle_mrna(gene_list, cell_lines=None):
    """Return a dict of mRNA amounts in given genes and cell lines from CCLE.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC gene symbols to get mRNA amounts for.
    cell_lines : Optional[list[str]]
        A list of CCLE cell line names to get mRNA amounts for.

    Returns
    -------
    mrna_amounts : dict[dict[float]]
        A dict keyed to cell lines containing a dict keyed to genes
        containing float. If cell_lines is given, a cell line without any
        data maps to None, and a gene without data for a cell line maps to
        None within that cell line's dict.
    """
    profile_data = get_profile_data(ccle_study, gene_list,
                                    'MRNA_EXPRESSION', 'all')
    mrna_amounts = {cell_line: value
                    for cell_line, value in profile_data.items()
                    if cell_lines is None or cell_line in cell_lines}
    # This is to make sure that if cell_lines were specified then
    # we return None if there is no data for a given cell line
    # This matches the old behavior of the function
    if cell_lines:
        for cell_line in cell_lines:
            gene_amounts = mrna_amounts.get(cell_line)
            if gene_amounts is None:
                # Either there is no data for this cell line, or it was
                # listed more than once and already set to None; in both
                # cases there is no per-gene dict to fill in.
                mrna_amounts[cell_line] = None
                continue
            for gene in gene_list:
                gene_amounts.setdefault(gene, None)
    return mrna_amounts