Source code for indra.databases.hgnc_client

import os
import re
import logging
from collections import defaultdict

import requests
from typing import Set, Union
import xml.etree.ElementTree as ET
from functools import lru_cache

from indra.util import read_unicode_csv, UnicodeXMLTreeBuilder as UTB
from indra.resources import get_resource_path

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Base URL of the HGNC REST "fetch" endpoint; get_hgnc_entry appends
# 'hgnc_id/<id>' to it to retrieve a single entry.
hgnc_url = 'http://rest.genenames.org/fetch/'


def get_uniprot_id(hgnc_id):
    """Look up the UniProt ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to convert. This is the numeric ID passed as a string,
        not the HGNC gene symbol.

    Returns
    -------
    uniprot_id : str or None
        The UniProt ID mapped to the given HGNC ID, or None if there is
        no mapping or the mapped value is empty.
    """
    # Any falsy lookup result (missing key or empty string) becomes None.
    return uniprot_ids.get(hgnc_id) or None


def get_entrez_id(hgnc_id):
    """Look up the Entrez ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to convert. This is the numeric ID passed as a string,
        not the HGNC gene symbol.

    Returns
    -------
    entrez_id : str or None
        The Entrez ID mapped to the given HGNC ID, or None if there is
        no mapping or the mapped value is empty.
    """
    mapped = entrez_ids.get(hgnc_id)
    # Normalize a missing key and an empty string to the same None result.
    return mapped if mapped else None


def get_hgnc_from_entrez(entrez_id):
    """Look up the HGNC ID mapped to an Entrez ID.

    Parameters
    ----------
    entrez_id : str
        The Entrez ID to convert, a number passed as a string.

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID mapped to the given Entrez ID, or None if the Entrez
        ID is not in the reverse mapping.
    """
    return entrez_ids_reverse.get(entrez_id)


def get_ensembl_id(hgnc_id):
    """Look up the Ensembl ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to convert. This is the numeric ID passed as a string,
        not the HGNC gene symbol.

    Returns
    -------
    ensembl_id : str or None
        The Ensembl ID mapped to the given HGNC ID, or None if there is
        no mapping.
    """
    ensembl_id = ensembl_ids.get(hgnc_id)
    return ensembl_id


def get_hgnc_from_ensembl(ensembl_id):
    """Look up the HGNC ID mapped to an Ensembl ID.

    Parameters
    ----------
    ensembl_id : str
        The Ensembl ID to convert.

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID mapped to the given Ensembl ID, or None if the
        Ensembl ID is not in the reverse mapping.
    """
    hgnc_id = ensembl_ids_reverse.get(ensembl_id)
    return hgnc_id


def get_hgnc_name(hgnc_id):
    """Return the HGNC symbol for an HGNC ID.

    The locally loaded table is consulted first; only if the ID is not
    found there is the HGNC web service queried via get_hgnc_entry.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to convert.

    Returns
    -------
    hgnc_name : str or None
        The HGNC symbol for the given HGNC ID, or None if the ID is not
        in the local table and the web service returns no entry or an
        entry without a symbol.
    """
    # Fast path: the ID is known from the local resource table.
    if hgnc_id in hgnc_names:
        return hgnc_names[hgnc_id]

    # Fallback: ask the web service and pull the symbol out of the XML.
    entry = get_hgnc_entry(hgnc_id)
    if entry is None:
        return None
    symbol_tag = entry.find("result/doc/str[@name='symbol']")
    return None if symbol_tag is None else symbol_tag.text.strip()


def get_hgnc_id(hgnc_name):
    """Look up the HGNC ID for an HGNC symbol.

    Parameters
    ----------
    hgnc_name : str
        The HGNC symbol to convert. Example: BRAF

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID for the given symbol, or None if the symbol is not
        in the local table.
    """
    hgnc_id = hgnc_ids.get(hgnc_name)
    return hgnc_id


def get_current_hgnc_id(hgnc_name):
    """Return HGNC ID(s) for a symbol that may be current or outdated.

    Parameters
    ----------
    hgnc_name : str
        The HGNC symbol to convert, possibly an outdated symbol.

    Returns
    -------
    str or list of str or None
        If the symbol is a current one, its HGNC ID is returned as a
        string. Otherwise the previous-symbol table is consulted, which
        yields a single ID as a string, a list of IDs if the outdated
        symbol maps to several current entries, or None if the symbol is
        unknown.
    """
    # A current symbol takes precedence over any previous-symbol mapping.
    return get_hgnc_id(hgnc_name) or prev_sym_map.get(hgnc_name)


def get_hgnc_from_mouse(mgi_id):
    """Look up the HGNC ID mapped to an MGI mouse gene ID.

    Parameters
    ----------
    mgi_id : str
        The MGI ID to convert, with or without a leading 'MGI:' prefix.
        Example: "2444934"

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID mapped to the given MGI ID, or None if there is no
        mapping.
    """
    prefix = 'MGI:'
    # The table is keyed on bare IDs, so drop the prefix when present.
    has_prefix = bool(mgi_id) and mgi_id.startswith(prefix)
    key = mgi_id[len(prefix):] if has_prefix else mgi_id
    return mouse_map.get(key)


def get_hgnc_from_rat(rgd_id):
    """Look up the HGNC ID mapped to an RGD rat gene ID.

    Parameters
    ----------
    rgd_id : str
        The RGD ID to convert, with or without a leading 'RGD:' prefix.
        Example: "1564928"

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID mapped to the given RGD ID, or None if there is no
        mapping.
    """
    # Empty/None input and un-prefixed IDs are looked up unchanged.
    if not rgd_id or not rgd_id.startswith('RGD:'):
        return rat_map.get(rgd_id)
    # The table is keyed on bare IDs, so drop the 4-character prefix.
    return rat_map.get(rgd_id[4:])


def get_rat_id(hgnc_id):
    """Return the RGD rat ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to convert. Example: "1097"

    Returns
    -------
    rgd_id : str or None
        The first RGD ID in the rat table whose HGNC ID equals the given
        one, or None if no entry matches.
    """
    # The table maps RGD -> HGNC, so the reverse lookup scans its items.
    matches = (rgd for rgd, mapped in rat_map.items() if mapped == hgnc_id)
    return next(matches, None)


def get_mouse_id(hgnc_id):
    """Return the MGI mouse ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to convert. Example: "1097"

    Returns
    -------
    mgi_id : str or None
        The first MGI ID in the mouse table whose HGNC ID equals the
        given one, or None if no entry matches.
    """
    # The table maps MGI -> HGNC, so the reverse lookup scans its items.
    matches = (mgi for mgi, mapped in mouse_map.items() if mapped == hgnc_id)
    return next(matches, None)


@lru_cache(maxsize=1000)
def get_hgnc_entry(hgnc_id):
    """Return the HGNC entry for the given HGNC ID from the web service.

    Results are memoized with an LRU cache of up to 1000 entries keyed on
    the HGNC ID. The cache stores whatever is returned, so a None result
    for a non-200 response is cached as well; raised exceptions are not.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up.

    Returns
    -------
    xml_tree : ElementTree or None
        The parsed XML of the entry for the given HGNC ID, or None if the
        web service responded with a status code other than 200.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails at the network level, including when the
        service does not respond within the timeout.
    """
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server, which would hang every caller of this function.
    timeout_seconds = 30
    url = hgnc_url + 'hgnc_id/%s' % hgnc_id
    headers = {'Accept': '*/*'}
    res = requests.get(url, headers=headers, timeout=timeout_seconds)
    if res.status_code != 200:
        return None
    return ET.XML(res.content, parser=UTB())


def get_gene_type(hgnc_id: str) -> Union[str, None]:
    """Return the locus type of the gene with the given HGNC ID.

    See more under Locus type at
    https://www.genenames.org/help/symbol-report/#!/#tocAnchor-1-2

    Parameters
    ----------
    hgnc_id :
        The HGNC ID of the gene whose locus type is requested.

    Returns
    -------
    :
        The locus type of the given gene, or None if no locus type is
        recorded for the ID.
    """
    locus_type = gene_type.get(hgnc_id)
    return locus_type


def is_kinase(gene_name):
    """Check whether a gene name is in the list of kinases.

    Parameters
    ----------
    gene_name : str
        The HGNC gene symbol corresponding to the protein.

    Returns
    -------
    bool
        True if the given gene name is listed as a kinase, False
        otherwise.
    """
    listed = gene_name in kinases
    return listed


def is_transcription_factor(gene_name):
    """Check whether a gene name is in the list of transcription factors.

    Parameters
    ----------
    gene_name : str
        The HGNC gene symbol corresponding to the protein.

    Returns
    -------
    bool
        True if the given gene name is listed as a transcription factor,
        False otherwise.
    """
    listed = gene_name in tfs
    return listed


def is_phosphatase(gene_name):
    """Check whether a gene name is in the list of phosphatases.

    Parameters
    ----------
    gene_name : str
        The HGNC gene symbol corresponding to the protein.

    Returns
    -------
    bool
        True if the given gene name is listed as a phosphatase, False
        otherwise.
    """
    listed = gene_name in phosphatases
    return listed


def get_enzymes(hgnc_id: str) -> Set[str]:
    """Return the EC codes associated with the given HGNC ID.

    Parameters
    ----------
    hgnc_id :
        The HGNC ID to convert.

    Returns
    -------
    :
        The set of EC codes recorded for the HGNC ID, or an empty set if
        none are recorded.
    """
    try:
        return hgnc_to_enzymes[hgnc_id]
    except KeyError:
        # Unknown IDs yield a fresh empty set rather than None.
        return set()


def get_hgncs_from_enzyme(ec_code: str) -> Set[str]:
    """Return the HGNC IDs associated with a given enzyme.

    Parameters
    ----------
    ec_code :
        The EC code (e.g., 2.4.1.228)

    Returns
    -------
    :
        The set of HGNC identifiers recorded for the EC code, or an empty
        set if none are recorded.
    """
    if ec_code in enzyme_to_hgncs:
        return enzyme_to_hgncs[ec_code]
    # Unknown EC codes yield a fresh empty set rather than None.
    return set()


def get_hgnc_id_from_mgi_name(mgi_name: str) -> Union[str, None]:
    """Return the HGNC ID of the human gene homologous to a mouse gene.

    The mouse gene name provided as input is assumed to be an MGI
    official symbol.

    Parameters
    ----------
    mgi_name :
        The MGI symbol of a mouse gene.

    Returns
    -------
    :
        The HGNC ID of the corresponding human gene or None if not available.
    """
    # mgi_client is imported at call time rather than at module level.
    from indra.databases import mgi_client
    mouse_id = mgi_client.get_id_from_name(mgi_name)
    return get_hgnc_from_mouse(mouse_id) if mouse_id else None


def get_hgnc_name_from_mgi_name(mgi_name: str) -> Union[str, None]:
    """Return the HGNC symbol of the human gene homologous to a mouse gene.

    The mouse gene name provided as input is assumed to be an MGI
    official symbol.

    Parameters
    ----------
    mgi_name :
        The MGI symbol of a mouse gene.

    Returns
    -------
    :
        The HGNC symbol of the corresponding human gene or None if not
        available.
    """
    human_id = get_hgnc_id_from_mgi_name(mgi_name)
    # Without a human ID there is nothing to translate into a symbol.
    if not human_id:
        return None
    return get_hgnc_name(human_id)


def _read_hgnc_maps():
    """Build the HGNC lookup tables from the hgnc_entries.tsv resource file.

    The file is read as tab-separated rows with a header line that is
    skipped. Columns are accessed by position: 0 HGNC ID (its first five
    characters are stripped), 1 symbol, 2 description, 3 status,
    5 Entrez ID, 6 UniProt ID, 7 MGI IDs, 8 RGD IDs, 9 previous symbols,
    10 Ensembl ID, 11 gene type and 12 enzyme IDs. Multi-valued columns
    are split on ', '.

    Rows with status 'Symbol Withdrawn' are expected to carry a
    description of the form 'symbol withdrawn, see [HGNC:<id>]'; the
    withdrawn ID is then given the name of the referenced entry. A
    withdrawn row whose description does not match, or whose referenced
    entry has no name, is logged and left without a name instead of
    raising during module import.

    Returns
    -------
    tuple
        A 14-tuple of, in order: hgnc_names (ID -> symbol), hgnc_ids
        (symbol -> ID), hgnc_withdrawn (list of withdrawn IDs),
        uniprot_ids, entrez_ids, entrez_ids_reverse, mouse_map
        (MGI ID -> HGNC ID), rat_map (RGD ID -> HGNC ID), prev_sym_map
        (previous symbol -> ID or list of IDs), ensembl_ids,
        ensembl_ids_reverse, gene_types, hgnc_to_enzymes and
        enzyme_to_hgncs (both plain dicts of sets).
    """
    hgnc_file = get_resource_path("hgnc_entries.tsv")
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    # Compiled once, outside the row loop.
    withdrawn_pattern = re.compile(
        r'symbol withdrawn, see \[HGNC:(?: ?)(\d+)\]')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    entrez_ids_reverse = {}
    mouse_map = {}
    rat_map = {}
    prev_sym_map = {}
    ensembl_ids = {}
    ensembl_ids_reverse = {}
    hgnc_withdrawn_new_ids = {}
    gene_types = {}
    hgnc_to_enzymes = defaultdict(set)
    enzyme_to_hgncs = defaultdict(set)
    # Skip the header
    next(csv_rows)
    for row in csv_rows:
        hgnc_id = row[0][5:]
        hgnc_status = row[3]
        if hgnc_status in {'Approved', 'Entry Withdrawn'}:
            hgnc_name = row[1]
            hgnc_names[hgnc_id] = hgnc_name
            # Note that withdrawn entries don't overlap with approved
            # entries at this point so it's safe to add mappings for
            # withdrawn names
            hgnc_ids[hgnc_name] = hgnc_id
        elif hgnc_status == 'Symbol Withdrawn':
            hgnc_withdrawn.append(hgnc_id)
            match = withdrawn_pattern.match(row[2])
            if match is None:
                # re.match returns None on no match; don't let a single
                # unexpected description break the import of the module.
                logger.warning('Could not find replacement ID for '
                               'withdrawn HGNC ID %s in: %s',
                               hgnc_id, row[2])
            else:
                hgnc_withdrawn_new_ids[hgnc_id] = match.group(1)
        # Uniprot
        uniprot_id = row[6]
        if uniprot_id:
            uniprot_ids[hgnc_id] = uniprot_id
        # Entrez
        entrez_id = row[5]
        if entrez_id:
            entrez_ids[hgnc_id] = entrez_id
            entrez_ids_reverse[entrez_id] = hgnc_id
        # Mouse
        mgi_entry = row[7]
        if mgi_entry:
            for mgi_id in mgi_entry.split(', '):
                if mgi_id.startswith('MGI:'):
                    mgi_id = mgi_id[4:]
                mouse_map[mgi_id] = hgnc_id
        # Rat
        rgd_entry = row[8]
        if rgd_entry:
            for rgd_id in rgd_entry.split(', '):
                if rgd_id.startswith('RGD:'):
                    rgd_id = rgd_id[4:]
                rat_map[rgd_id] = hgnc_id
        # Previous symbols
        prev_sym_entry = row[9]
        if prev_sym_entry:
            for prev_sym in prev_sym_entry.split(', '):
                if prev_sym not in prev_sym_map:
                    # First time we see this symbol: store a plain string
                    prev_sym_map[prev_sym] = hgnc_id
                elif isinstance(prev_sym_map[prev_sym], list):
                    # Already mapped to several IDs: extend the list
                    prev_sym_map[prev_sym].append(hgnc_id)
                else:
                    # Second ID for this symbol: switch from a string to
                    # a list holding both IDs
                    prev_sym_map[prev_sym] = [prev_sym_map[prev_sym],
                                              hgnc_id]
        # Ensembl IDs
        ensembl_id = row[10]
        if ensembl_id:
            ensembl_ids[hgnc_id] = ensembl_id
            ensembl_ids_reverse[ensembl_id] = hgnc_id
        # Gene (locus) type
        gene_type = row[11]
        if gene_type:
            gene_types[hgnc_id] = gene_type
        # Enzyme IDs
        enzyme_entry = row[12]
        if enzyme_entry:
            for enzyme_id in enzyme_entry.split(", "):
                hgnc_to_enzymes[hgnc_id].add(enzyme_id)
                enzyme_to_hgncs[enzyme_id].add(hgnc_id)

    # Give each withdrawn ID the name of the entry it was replaced by.
    # This runs after the full pass so that the replacement's name is
    # available regardless of row order.
    for old_id, new_id in hgnc_withdrawn_new_ids.items():
        new_name = hgnc_names.get(new_id)
        if new_name is None:
            logger.warning('Withdrawn HGNC ID %s refers to HGNC ID %s, '
                           'which has no name', old_id, new_id)
            continue
        hgnc_names[old_id] = new_name

    return (
        hgnc_names, hgnc_ids, hgnc_withdrawn,
        uniprot_ids, entrez_ids, entrez_ids_reverse, mouse_map, rat_map,
        prev_sym_map, ensembl_ids, ensembl_ids_reverse, gene_types,
        dict(hgnc_to_enzymes), dict(enzyme_to_hgncs),
    )


# Load all HGNC lookup tables once at import time. The names are bound
# positionally to the tuple returned by _read_hgnc_maps, so the order here
# must match its return statement. Note that the table built there as
# `gene_types` is bound to the module-level name `gene_type`, which is the
# name get_gene_type reads.
(
    hgnc_names, hgnc_ids, hgnc_withdrawn, uniprot_ids, entrez_ids,
    entrez_ids_reverse, mouse_map, rat_map, prev_sym_map, ensembl_ids,
    ensembl_ids_reverse, gene_type,
    hgnc_to_enzymes, enzyme_to_hgncs,
) = _read_hgnc_maps()


def _read_kinases():
    """Return the kinase gene names listed in resources/kinases.tsv.

    The file is read as tab-separated rows; the first row is skipped and
    the second column of every remaining row is collected.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(here, os.pardir, 'resources', 'kinases.tsv')
    rows = read_unicode_csv(fname, delimiter='\t')
    # Index 0 is the first row, which is not part of the data.
    return [row[1] for idx, row in enumerate(rows) if idx > 0]


def _read_phosphatases():
    """Return the phosphatase names listed in resources/phosphatases.tsv.

    The file is read as tab-separated rows and the first column of every
    row is collected; no row is skipped.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(here, os.pardir, 'resources', 'phosphatases.tsv')
    # First column is phosphatase names
    # Second column is HGNC ids
    names = []
    for row in read_unicode_csv(fname, delimiter='\t'):
        names.append(row[0])
    return names


def _read_tfs():
    """Return the gene names listed in resources/transcription_factors.csv.

    The file is read with the reader's default delimiter; the first row
    is skipped and the second column of every remaining row is collected.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(here, os.pardir, 'resources',
                         'transcription_factors.csv')
    rows = read_unicode_csv(fname)
    # Index 0 is the first row, which is not part of the data.
    return [row[1] for idx, row in enumerate(rows) if idx > 0]


# Load the gene-name lists consulted by is_kinase, is_phosphatase and
# is_transcription_factor once at import time.
kinases = _read_kinases()
phosphatases = _read_phosphatases()
tfs = _read_tfs()