Source code for indra.databases.hgnc_client

import os
import re
import logging
from collections import defaultdict

import requests
from typing import Set, Union
import xml.etree.ElementTree as ET
from functools import lru_cache

from indra.util import read_unicode_csv, UnicodeXMLTreeBuilder as UTB
from indra.resources import get_resource_path

logger = logging.getLogger(__name__)


hgnc_url = 'http://rest.genenames.org/fetch/'


def get_uniprot_id(hgnc_id):
    """Look up the UniProt ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up. This is the numeric identifier passed as a
        string; it is not the HGNC gene symbol.

    Returns
    -------
    uniprot_id : str or None
        The UniProt ID stored for the given HGNC ID, or None if the HGNC ID
        is unknown or its stored value is empty.
    """
    # A missing key and an empty stored string are both reported as None.
    return uniprot_ids.get(hgnc_id) or None
def get_entrez_id(hgnc_id):
    """Look up the Entrez ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up. This is the numeric identifier passed as a
        string; it is not the HGNC gene symbol.

    Returns
    -------
    entrez_id : str or None
        The Entrez ID stored for the given HGNC ID, or None if the HGNC ID
        is unknown or its stored value is empty.
    """
    # A missing key and an empty stored string are both reported as None.
    return entrez_ids.get(hgnc_id) or None
def get_hgnc_from_entrez(entrez_id):
    """Look up the HGNC ID mapped to an Entrez ID.

    Parameters
    ----------
    entrez_id : str
        The Entrez ID to look up, a number passed as a string.

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID stored for the given Entrez ID, or None if the Entrez ID
        is not in the reverse lookup table.
    """
    return entrez_ids_reverse.get(entrez_id)
def get_ensembl_id(hgnc_id):
    """Look up the Ensembl ID mapped to an HGNC ID.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up. This is the numeric identifier passed as a
        string; it is not the HGNC gene symbol.

    Returns
    -------
    ensembl_id : str or None
        The Ensembl ID stored for the given HGNC ID, or None if there is no
        entry for it.
    """
    ensembl_id = ensembl_ids.get(hgnc_id)
    return ensembl_id
def get_hgnc_from_ensembl(ensembl_id):
    """Look up the HGNC ID mapped to an Ensembl ID.

    Parameters
    ----------
    ensembl_id : str
        The Ensembl ID to look up.

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID stored for the given Ensembl ID, or None if the Ensembl
        ID is not in the reverse lookup table.
    """
    hgnc_id = ensembl_ids_reverse.get(ensembl_id)
    return hgnc_id
def get_hgnc_name(hgnc_id):
    """Resolve an HGNC ID to its HGNC symbol.

    The local lookup table is consulted first. Only if the ID is absent from
    it is the entry fetched via ``get_hgnc_entry`` and the symbol read from
    the returned XML.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to resolve.

    Returns
    -------
    hgnc_name : str or None
        The HGNC symbol for the given ID, or None if the ID is not in the
        local table and either no entry could be fetched or the fetched
        entry contains no symbol element.
    """
    if hgnc_id in hgnc_names:
        return hgnc_names[hgnc_id]

    # Not available locally: fall back to the fetched entry.
    entry = get_hgnc_entry(hgnc_id)
    if entry is None:
        return None
    symbol_tag = entry.find("result/doc/str[@name='symbol']")
    if symbol_tag is None:
        return None
    return symbol_tag.text.strip()
def get_hgnc_id(hgnc_name):
    """Look up the HGNC ID mapped to an HGNC symbol.

    Parameters
    ----------
    hgnc_name : str
        The HGNC symbol to look up. Example: BRAF

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID stored for the given symbol, or None if the symbol is
        not in the lookup table.
    """
    hgnc_id = hgnc_ids.get(hgnc_name)
    return hgnc_id
def get_current_hgnc_id(hgnc_name):
    """Return HGNC ID(s) for a symbol that may be current or outdated.

    Parameters
    ----------
    hgnc_name : str
        The HGNC symbol to look up, possibly an outdated symbol.

    Returns
    -------
    str or list of str or None
        If the symbol resolves through ``get_hgnc_id``, that ID is returned
        as a string. Otherwise the previous-symbol table is consulted: a
        symbol recorded for exactly one ID yields that ID as a string, and a
        symbol recorded for several IDs yields a list of them. None is
        returned if the symbol is found in neither place.
    """
    # Current symbols take precedence; previous symbols are the fallback.
    return get_hgnc_id(hgnc_name) or prev_sym_map.get(hgnc_name)
def get_hgnc_from_mouse(mgi_id):
    """Look up the HGNC ID mapped to an MGI mouse gene ID.

    Parameters
    ----------
    mgi_id : str
        The MGI ID to look up. Example: "2444934". A leading ``MGI:`` prefix
        is accepted and removed before the lookup.

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID stored for the given MGI ID, or None if there is no
        entry for it.
    """
    # Keys of the mouse table are looked up without the 'MGI:' prefix.
    has_prefix = bool(mgi_id) and mgi_id.startswith('MGI:')
    lookup_key = mgi_id[4:] if has_prefix else mgi_id
    return mouse_map.get(lookup_key)
def get_hgnc_from_rat(rgd_id):
    """Look up the HGNC ID mapped to an RGD rat gene ID.

    Parameters
    ----------
    rgd_id : str
        The RGD ID to look up. Example: "1564928". A leading ``RGD:`` prefix
        is accepted and removed before the lookup.

    Returns
    -------
    hgnc_id : str or None
        The HGNC ID stored for the given RGD ID, or None if there is no
        entry for it.
    """
    # Keys of the rat table are looked up without the 'RGD:' prefix.
    has_prefix = bool(rgd_id) and rgd_id.startswith('RGD:')
    lookup_key = rgd_id[4:] if has_prefix else rgd_id
    return rat_map.get(lookup_key)
def get_rat_id(hgnc_id):
    """Find the RGD rat ID that maps to an HGNC ID.

    The rat table is keyed by RGD ID, so this scans its items in iteration
    order and stops at the first one whose value equals ``hgnc_id``.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up. Example: "1097"

    Returns
    -------
    rgd_id : str or None
        The first RGD ID found that maps to the given HGNC ID, or None if no
        RGD ID maps to it.
    """
    matches = (rgd_id for rgd_id, mapped in rat_map.items()
               if mapped == hgnc_id)
    return next(matches, None)
def get_mouse_id(hgnc_id):
    """Find the MGI mouse ID that maps to an HGNC ID.

    The mouse table is keyed by MGI ID, so this scans its items in iteration
    order and stops at the first one whose value equals ``hgnc_id``.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up. Example: "1097"

    Returns
    -------
    mgi_id : str or None
        The first MGI ID found that maps to the given HGNC ID, or None if no
        MGI ID maps to it.
    """
    matches = (mgi_id for mgi_id, mapped in mouse_map.items()
               if mapped == hgnc_id)
    return next(matches, None)
@lru_cache(maxsize=1000)
def get_hgnc_entry(hgnc_id):
    """Fetch the HGNC entry for an HGNC ID from the web service.

    Results are memoized per ``hgnc_id`` by ``lru_cache`` (up to 1000
    entries). This includes None results returned for non-200 responses.
    The HTTP request is issued without a timeout argument.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID whose entry should be fetched.

    Returns
    -------
    xml_tree : ElementTree or None
        The parsed XML of the response body, or None if the service
        responded with a status code other than 200.
    """
    entry_path = 'hgnc_id/%s' % hgnc_id
    response = requests.get(hgnc_url + entry_path,
                            headers={'Accept': '*/*'})
    if response.status_code != 200:
        return None
    return ET.XML(response.content, parser=UTB())
def get_gene_type(hgnc_id: str) -> Union[str, None]:
    """Return the locus type of the gene with the given HGNC ID.

    See more under Locus type at
    https://www.genenames.org/help/symbol-report/#!/#tocAnchor-1-2

    Parameters
    ----------
    hgnc_id :
        The HGNC ID of the gene whose locus type is requested.

    Returns
    -------
    :
        The locus type stored for the given gene, or None if there is no
        entry for the given HGNC ID.
    """
    # Note: at module level, ``gene_type`` is the HGNC ID -> type mapping.
    locus_type = gene_type.get(hgnc_id)
    return locus_type
def is_kinase(gene_name):
    """Tell whether a gene name is in the list of kinases.

    Parameters
    ----------
    gene_name : str
        The HGNC gene symbol corresponding to the protein.

    Returns
    -------
    bool
        True if the given gene name is among the loaded kinase names,
        False otherwise.
    """
    found = gene_name in kinases
    return found
def is_transcription_factor(gene_name):
    """Tell whether a gene name is in the list of transcription factors.

    Parameters
    ----------
    gene_name : str
        The HGNC gene symbol corresponding to the protein.

    Returns
    -------
    bool
        True if the given gene name is among the loaded transcription
        factor names, False otherwise.
    """
    found = gene_name in tfs
    return found
def is_phosphatase(gene_name):
    """Tell whether a gene name is in the list of phosphatases.

    Parameters
    ----------
    gene_name : str
        The HGNC gene symbol corresponding to the protein.

    Returns
    -------
    bool
        True if the given gene name is among the loaded phosphatase names,
        False otherwise.
    """
    found = gene_name in phosphatases
    return found
def get_enzymes(hgnc_id: str) -> Set[str]:
    """Return the EC codes associated with the given HGNC ID.

    Parameters
    ----------
    hgnc_id :
        The HGNC ID to look up.

    Returns
    -------
    :
        The set of EC codes stored for the given HGNC ID, or a new empty
        set if there is no entry. A stored set is returned as-is, not as
        a copy.
    """
    if hgnc_id in hgnc_to_enzymes:
        return hgnc_to_enzymes[hgnc_id]
    return set()
def get_hgncs_from_enzyme(ec_code: str) -> Set[str]:
    """Return the HGNC IDs associated with the given enzyme.

    Parameters
    ----------
    ec_code :
        The EC code (e.g., 2.4.1.228)

    Returns
    -------
    :
        The set of HGNC identifiers stored for the given EC code, or a new
        empty set if there is no entry. A stored set is returned as-is,
        not as a copy.
    """
    if ec_code in enzyme_to_hgncs:
        return enzyme_to_hgncs[ec_code]
    return set()
def get_hgnc_id_from_mgi_name(mgi_name: str) -> Union[str, None]:
    """Return the HGNC ID of the human gene homologous to a mouse gene.

    The mouse gene name provided as input is assumed to be an MGI official
    symbol. It is first resolved to an MGI ID through ``mgi_client`` and
    that ID is then mapped to an HGNC ID.

    Parameters
    ----------
    mgi_name :
        The MGI symbol of a mouse gene.

    Returns
    -------
    :
        The HGNC ID of the corresponding human gene or None if not
        available.
    """
    # Imported inside the function, as in the original implementation.
    from indra.databases import mgi_client

    mgi_id = mgi_client.get_id_from_name(mgi_name)
    if not mgi_id:
        return None
    return get_hgnc_from_mouse(mgi_id)
def get_hgnc_name_from_mgi_name(mgi_name: str) -> Union[str, None]:
    """Return the HGNC symbol of the human gene homologous to a mouse gene.

    The mouse gene name provided as input is assumed to be an MGI official
    symbol. It is resolved to an HGNC ID with ``get_hgnc_id_from_mgi_name``
    and that ID is then converted to a symbol with ``get_hgnc_name``.

    Parameters
    ----------
    mgi_name :
        The MGI symbol of a mouse gene.

    Returns
    -------
    :
        The HGNC symbol of the corresponding human gene or None if not
        available.
    """
    hgnc_id = get_hgnc_id_from_mgi_name(mgi_name)
    if not hgnc_id:
        return None
    return get_hgnc_name(hgnc_id)
def _read_hgnc_maps():
    """Build the HGNC lookup tables from the ``hgnc_entries.tsv`` resource.

    The first row of the file is skipped as a header. Every other row is
    read by fixed column position: 0 (HGNC ID, with its first five
    characters dropped), 1 (symbol), 2 (description), 3 (status),
    5 (Entrez ID), 6 (UniProt ID), 7 (MGI IDs), 8 (RGD IDs), 9 (previous
    symbols), 10 (Ensembl ID), 11 (gene type) and 12 (enzyme IDs).
    Multi-valued cells are split on ``', '``.

    Returns
    -------
    tuple
        In order: HGNC ID -> symbol, symbol -> HGNC ID, list of IDs with
        status 'Symbol Withdrawn', HGNC ID -> UniProt ID, HGNC ID -> Entrez
        ID, Entrez ID -> HGNC ID, MGI ID -> HGNC ID, RGD ID -> HGNC ID,
        previous symbol -> HGNC ID (str) or HGNC IDs (list), HGNC ID ->
        Ensembl ID, Ensembl ID -> HGNC ID, HGNC ID -> gene type, HGNC ID ->
        set of enzyme IDs, enzyme ID -> set of HGNC IDs.
    """
    rows = read_unicode_csv(get_resource_path("hgnc_entries.tsv"),
                            delimiter='\t', encoding='utf-8')
    names_by_id = {}
    ids_by_name = {}
    withdrawn_ids = []
    uniprot_by_id = {}
    entrez_by_id = {}
    id_by_entrez = {}
    id_by_mgi = {}
    id_by_rgd = {}
    ids_by_prev_symbol = {}
    ensembl_by_id = {}
    id_by_ensembl = {}
    replacement_ids = {}
    type_by_id = {}
    enzymes_by_id = defaultdict(set)
    ids_by_enzyme = defaultdict(set)

    def add_species_ids(target, cell, prefix, hgnc_id):
        # Map every ID listed in the cell (with its prefix removed, if
        # present) to the given HGNC ID.
        for species_id in cell.split(', '):
            if species_id.startswith(prefix):
                species_id = species_id[len(prefix):]
            target[species_id] = hgnc_id

    # Skip the header
    next(rows)
    for row in rows:
        hgnc_id = row[0][5:]
        status = row[3]
        if status == 'Symbol Withdrawn':
            # The description names the ID that replaces this one. A
            # description that does not match the pattern makes re.match
            # return None, so the attribute access below would raise.
            (replacement,) = re.match(
                r'symbol withdrawn, see \[HGNC:(?: ?)(\d+)\]',
                row[2]).groups()
            withdrawn_ids.append(hgnc_id)
            replacement_ids[hgnc_id] = replacement
        elif status in {'Approved', 'Entry Withdrawn'}:
            symbol = row[1]
            names_by_id[hgnc_id] = symbol
            # Note that withdrawn entries don't overlap with approved
            # entries at this point so it's safe to add mappings for
            # withdrawn names
            ids_by_name[symbol] = hgnc_id

        # Uniprot
        uniprot_cell = row[6]
        if uniprot_cell:
            uniprot_by_id[hgnc_id] = uniprot_cell

        # Entrez
        entrez_cell = row[5]
        if entrez_cell:
            entrez_by_id[hgnc_id] = entrez_cell
            id_by_entrez[entrez_cell] = hgnc_id

        # Mouse
        mgi_cell = row[7]
        if mgi_cell:
            add_species_ids(id_by_mgi, mgi_cell, 'MGI:', hgnc_id)

        # Rat
        rgd_cell = row[8]
        if rgd_cell:
            add_species_ids(id_by_rgd, rgd_cell, 'RGD:', hgnc_id)

        # Previous symbols
        prev_cell = row[9]
        if prev_cell:
            for prev_symbol in prev_cell.split(', '):
                if prev_symbol not in ids_by_prev_symbol:
                    # First ID seen for this symbol: store a plain string
                    ids_by_prev_symbol[prev_symbol] = hgnc_id
                    continue
                known = ids_by_prev_symbol[prev_symbol]
                if isinstance(known, list):
                    # Already a list of IDs, so just extend it
                    known.append(hgnc_id)
                else:
                    # Second ID for this symbol: switch from a string to
                    # a list holding both IDs
                    ids_by_prev_symbol[prev_symbol] = [known, hgnc_id]

        # Ensembl IDs
        ensembl_cell = row[10]
        if ensembl_cell:
            ensembl_by_id[hgnc_id] = ensembl_cell
            id_by_ensembl[ensembl_cell] = hgnc_id

        # Gene (locus) type
        type_cell = row[11]
        if type_cell:
            type_by_id[hgnc_id] = type_cell

        # Enzyme IDs
        enzyme_cell = row[12]
        if enzyme_cell:
            for enzyme_id in enzyme_cell.split(", "):
                enzymes_by_id[hgnc_id].add(enzyme_id)
                ids_by_enzyme[enzyme_id].add(hgnc_id)

    # Withdrawn symbols take the name registered for their replacement ID
    for withdrawn_id, replacement in replacement_ids.items():
        names_by_id[withdrawn_id] = names_by_id[replacement]

    return (
        names_by_id,
        ids_by_name,
        withdrawn_ids,
        uniprot_by_id,
        entrez_by_id,
        id_by_entrez,
        id_by_mgi,
        id_by_rgd,
        ids_by_prev_symbol,
        ensembl_by_id,
        id_by_ensembl,
        type_by_id,
        dict(enzymes_by_id),
        dict(ids_by_enzyme),
    )


# Module-level lookup tables, populated once when the module is imported.
(
    hgnc_names,
    hgnc_ids,
    hgnc_withdrawn,
    uniprot_ids,
    entrez_ids,
    entrez_ids_reverse,
    mouse_map,
    rat_map,
    prev_sym_map,
    ensembl_ids,
    ensembl_ids_reverse,
    gene_type,
    hgnc_to_enzymes,
    enzyme_to_hgncs,
) = _read_hgnc_maps()


def _read_kinases():
    """Return the second column of ``kinases.tsv``, skipping the first row."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    table_path = os.path.join(module_dir, os.pardir, 'resources',
                              'kinases.tsv')
    table_rows = list(read_unicode_csv(table_path, delimiter='\t'))
    return [entry[1] for entry in table_rows[1:]]


def _read_phosphatases():
    """Return the first column of every row of ``phosphatases.tsv``."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    table_path = os.path.join(module_dir, os.pardir, 'resources',
                              'phosphatases.tsv')
    table_rows = read_unicode_csv(table_path, delimiter='\t')
    # First column is phosphatase names
    # Second column is HGNC ids
    # Unlike the kinase and transcription factor tables, no row is skipped.
    return [entry[0] for entry in table_rows]


def _read_tfs():
    """Return the second column of ``transcription_factors.csv``, skipping
    the first row."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    table_path = os.path.join(module_dir, os.pardir, 'resources',
                              'transcription_factors.csv')
    table_rows = list(read_unicode_csv(table_path))
    return [entry[1] for entry in table_rows[1:]]


# Gene name lists backing is_kinase, is_phosphatase and
# is_transcription_factor.
kinases, phosphatases, tfs = (
    _read_kinases(),
    _read_phosphatases(),
    _read_tfs(),
)