Source code for indra.databases.mesh_client

import os
import re
import json
import requests
import itertools
from typing import List
from functools import lru_cache
from os.path import abspath, dirname, join, pardir
from indra.util import read_unicode_csv

# Base URL of the NLM MeSH service; the REST and SPARQL request URLs in this
# module are built by appending to it.
MESH_URL = 'https://id.nlm.nih.gov/mesh/'
# Resource files are looked up in the `resources` folder that sits one level
# above the folder containing this module.
HERE = dirname(abspath(__file__))
RESOURCES = join(HERE, pardir, 'resources')
# Tab-separated table of MeSH IDs, labels, terms and tree numbers
MESH_FILE = join(RESOURCES, 'mesh_id_label_mappings.tsv')
# Tab-separated table of supplementary MeSH entries (loaded only if present)
MESH_SUPP_FILE = join(RESOURCES, 'mesh_supp_id_label_mappings.tsv')
# Tab-separated tables mapping MeSH IDs to other namespaces and to CAS IDs
DB_MAPPINGS = join(RESOURCES, 'mesh_mappings.tsv')
CAS_MAPPINGS = join(RESOURCES, 'mesh_cas_mappings.tsv')


# Lookup tables filled in by _load_mesh_file at import time.
# MeSH ID -> label
mesh_id_to_name = {}
# label -> MeSH ID
mesh_name_to_id = {}
# term (from the terms column) -> [MeSH ID, label]
mesh_name_to_id_name = {}
# MeSH ID -> list of tree numbers (only filled from the non-supplementary file)
mesh_id_to_tree_numbers = {}
# supplementary MeSH ID -> list of IDs it is mapped to
mesh_supp_to_primary = {}


def _load_mesh_file(path, supplementary):
    """Fill the module-level MeSH lookup tables from a tab-separated file.

    Every row must have exactly four columns: ID, label, '|'-separated
    terms, and a final column whose meaning depends on `supplementary`.

    Parameters
    ----------
    path : str
        Path of the tab-separated resource file to read.
    supplementary : bool
        If True, the last column is a ','-separated list of IDs that the
        entry is mapped to and is stored in `mesh_supp_to_primary`. If False,
        the last column is a '|'-separated list of tree numbers and is stored
        in `mesh_id_to_tree_numbers`.
    """
    for row in read_unicode_csv(path, delimiter='\t'):
        mesh_id, mesh_label, mesh_terms_str, last_column = row
        if supplementary:
            mesh_supp_to_primary[mesh_id] = last_column.split(',')
        elif last_column:
            mesh_id_to_tree_numbers[mesh_id] = last_column.split('|')
        else:
            # Rare corner case: a non-supplementary entry without any tree
            # number (e.g., D005260, D008297) is outside the tree structure
            # and is not registered in any of the tables.
            continue
        mesh_id_to_name[mesh_id] = mesh_label
        mesh_name_to_id[mesh_label] = mesh_id
        if mesh_terms_str:
            for term in mesh_terms_str.split('|'):
                mesh_name_to_id_name[term] = [mesh_id, mesh_label]


# Populate the lookup tables as a side effect of importing this module.
_load_mesh_file(MESH_FILE, supplementary=False)
# The supplementary table is only loaded when its file exists on disk.
if os.path.exists(MESH_SUPP_FILE):
    _load_mesh_file(MESH_SUPP_FILE, supplementary=True)


def _load_db_mappings(db_mappings_path, cas_mappings_path):
    """Build unambiguous MeSH <-> external ID lookup tables.

    Rows from the general mappings file are processed first, followed by the
    rows of the CAS mappings file (whose namespace is always 'CAS').

    Parameters
    ----------
    db_mappings_path : str
        Path of a tab-separated file with six columns of which the second is
        the MeSH ID, the fourth the namespace and the fifth the ID in that
        namespace.
    cas_mappings_path : str
        Path of a tab-separated file with two columns: MeSH ID and CAS ID.

    Returns
    -------
    tuple of dict
        A dict from MeSH ID to a `(db_ns, db_id)` tuple and a dict from
        `(db_ns, db_id)` tuples to MeSH ID. A key that occurs in more than
        one row is dropped from the respective dict.
    """
    def iter_db_rows():
        for row in read_unicode_csv(db_mappings_path, delimiter='\t'):
            _, mesh_id, _, db_ns, db_id, _ = row
            yield mesh_id, db_ns, db_id

    def iter_cas_rows():
        for row in read_unicode_csv(cas_mappings_path, delimiter='\t'):
            mesh_id, cas_id = row
            yield mesh_id, 'CAS', cas_id

    def record_unique(table, ambiguous, key, value):
        # Keep a key only as long as it has been seen exactly once: the
        # second sighting removes it and later sightings are ignored.
        if key in table:
            ambiguous.add(key)
            del table[key]
        elif key not in ambiguous:
            table[key] = value

    forward, reverse = {}, {}
    forward_ambiguous, reverse_ambiguous = set(), set()
    for mesh_id, db_ns, db_id in itertools.chain(iter_db_rows(),
                                                 iter_cas_rows()):
        target = (db_ns, db_id)
        # Make sure we don't add any one-to-many mappings
        record_unique(forward, forward_ambiguous, mesh_id, target)
        # Make sure we don't add any one-to-many reverse mappings
        record_unique(reverse, reverse_ambiguous, target, mesh_id)
    return forward, reverse


# Built once at import time: mesh_to_db maps a MeSH ID to a (db_ns, db_id)
# tuple and db_to_mesh maps a (db_ns, db_id) tuple back to a MeSH ID.
mesh_to_db, db_to_mesh = _load_db_mappings(DB_MAPPINGS, CAS_MAPPINGS)


@lru_cache(maxsize=1000)
def get_mesh_name_from_web(mesh_id):
    """Get the MESH label for the given MESH ID using the NLM REST API.

    Results (including None for failed lookups that produced a response) are
    memoized per MESH ID by the `lru_cache` decorator.

    Parameters
    ----------
    mesh_id : str
        MESH Identifier, e.g. 'D003094'.

    Returns
    -------
    str
        Label for the MESH ID, or None if the query failed or no label was
        found.
    """
    url = MESH_URL + mesh_id + '.json'
    resp = requests.get(url)
    if resp.status_code != 200:
        return None
    try:
        # Parsing happens inside the try block because a 200 response whose
        # body is not valid JSON makes resp.json() raise a ValueError
        # subclass, which should count as a failed query.
        return resp.json()['label']['@value']
    except (KeyError, IndexError, TypeError, ValueError):
        return None
def get_mesh_name(mesh_id, offline=False):
    """Get the MESH label for the given MESH ID.

    The label is first looked up in the table loaded from `indra/resources`;
    only if it is missing there (and `offline` is False) is the NLM REST API
    queried.

    Parameters
    ----------
    mesh_id : str
        MESH Identifier, e.g. 'D003094'.
    offline : bool
        If True, never query the NLM REST API and rely on the local table
        only. Default is False (REST API queries are allowed).

    Returns
    -------
    str
        Label for the MESH ID, or None if the query failed or no label was
        found.
    """
    local_name = mesh_id_to_name.get(mesh_id)
    if local_name is None and not offline:
        # Not available locally, so ask NLM
        return get_mesh_name_from_web(mesh_id)
    return local_name
def get_mesh_id_name(mesh_term, offline=False):
    """Get the MESH ID and name for the given MESH term.

    The term is looked up first among the labels and then among the other
    terms loaded from `indra/resources`; only if both lookups fail (and
    `offline` is False) is the NLM REST API queried.

    Parameters
    ----------
    mesh_term : str
        MESH Descriptor or Concept name, e.g. 'Breast Cancer'.
    offline : bool
        If True, never query the NLM REST API and rely on the local tables
        only. Default is False (REST API queries are allowed).

    Returns
    -------
    tuple of strs
        A 2-tuple of the form `(id, name)` with the ID of the descriptor
        corresponding to the MESH label, and the descriptor name (which may
        not exactly match the name provided as an argument if it is a Concept
        name). If the term is empty, the query failed, or no descriptor
        corresponding to the name was found, returns (None, None).
    """
    not_found = (None, None)
    if not mesh_term:
        return not_found

    # Exact match on a label
    descriptor_id = mesh_name_to_id.get(mesh_term)
    if descriptor_id is not None:
        return descriptor_id, mesh_term

    # Match on one of the additional terms of an entry
    entry = mesh_name_to_id_name.get(mesh_term)
    if entry is not None:
        descriptor_id, descriptor_name = entry
        if descriptor_id is not None:
            return descriptor_id, descriptor_name

    # Look up the MESH mapping from NLM if we don't have it locally
    return not_found if offline else get_mesh_id_name_from_web(mesh_term)
@lru_cache(maxsize=1000)
def submit_sparql_query(query_body):
    """Run a SPARQL query against the MeSH endpoint and return the result.

    The module-level `mesh_rdf_prefixes` are prepended to the query body.
    Results are memoized per query body by the `lru_cache` decorator.

    Parameters
    ----------
    query_body : str
        The SPARQL query without prefix declarations.

    Returns
    -------
    dict or None
        The parsed JSON response, or None if the response status was not 200
        or the response body could not be parsed.
    """
    response = requests.get(
        MESH_URL + 'sparql',
        params={
            'query': '%s\n%s' % (mesh_rdf_prefixes, query_body),
            'format': 'JSON',
            'inference': 'true',
        },
    )
    if response.status_code == 200:
        try:
            # Parsing can raise exceptions if we got no usable response
            return response.json()
        except Exception:
            return None
    return None
def _escape_sparql_regex_literal(text):
    """Escape text so it matches literally inside a quoted SPARQL REGEX."""
    # Step 1: neutralize regular expression metacharacters so that, e.g.,
    # parentheses or a '+' in a term are matched as plain characters.
    pattern = re.sub(r'([\\|.?*+(){}\[\]^$])', r'\\\1', text)
    # Step 2: escape for the single-quoted SPARQL string literal in which the
    # pattern is embedded (backslashes first, then quotes and line breaks).
    return (pattern.replace('\\', '\\\\')
                   .replace("'", "\\'")
                   .replace('\n', '\\n')
                   .replace('\r', '\\r'))


def get_mesh_id_name_from_web(mesh_term):
    """Get the MESH ID and name for the given MESH term using the NLM REST API.

    The term is matched case-insensitively and in full against descriptor and
    concept labels. It is escaped before being placed in the query so that
    quotes and regular expression metacharacters in the term are matched
    literally rather than breaking or altering the query.

    Parameters
    ----------
    mesh_term : str
        MESH Descriptor or Concept name, e.g. 'Breast Cancer'.

    Returns
    -------
    tuple of strs
        Returns a 2-tuple of the form `(id, name)` with the ID of the
        descriptor corresponding to the MESH label, and the descriptor name
        (which may not exactly match the name provided as an argument if it is
        a Concept name). If the term is empty, the query failed, or no
        descriptor corresponding to the name was found, returns a tuple of
        (None, None).
    """
    if not mesh_term:
        return None, None
    escaped_term = _escape_sparql_regex_literal(mesh_term)
    query_body = """
        SELECT ?d ?dName ?c ?cName
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE {
          ?d a meshv:Descriptor .
          ?d meshv:concept ?c .
          ?d rdfs:label ?dName .
          ?c rdfs:label ?cName
          FILTER (REGEX(?dName,'^%s$','i') || REGEX(?cName,'^%s$','i'))
        }
        ORDER BY ?d
    """ % (escaped_term, escaped_term)
    mesh_json = submit_sparql_query(query_body)
    if mesh_json is None:
        return None, None
    try:
        # Choose the first entry (should usually be only one)
        first_binding = mesh_json['results']['bindings'][0]
        id_uri = first_binding['d']['value']
        name = first_binding['dName']['value']
    except (KeyError, IndexError, TypeError):
        return None, None

    # Strip the MESH prefix off the ID URI
    match = re.match('http://id.nlm.nih.gov/mesh/([A-Za-z0-9]*)', id_uri)
    if match is None:
        # An unexpected URI is treated like any other failed lookup instead
        # of relying on an assert, which is stripped when running with -O.
        return None, None
    return match.groups()[0], name
def mesh_isa(mesh_id1, mesh_id2):
    """Return True if a tree number of the first ID extends one of the second.

    Uses the tree numbers available from the local resource files.

    Parameters
    ----------
    mesh_id1 : str
        The MeSH ID that is potentially the more specific one.
    mesh_id2 : str
        The MeSH ID that is potentially the more general one.

    Returns
    -------
    bool
        True if any tree number of `mesh_id1` starts with any tree number of
        `mesh_id2`, otherwise False.
    """
    child_numbers = get_mesh_tree_numbers(mesh_id1)
    parent_numbers = get_mesh_tree_numbers(mesh_id2)
    return any(
        child.startswith(parent)
        for child, parent in itertools.product(child_numbers, parent_numbers)
    )


def mesh_isa_web(mesh_id1, mesh_id2):
    """Return True if the second ID is a broader descriptor of the first.

    Uses a SPARQL query that follows `meshv:broaderDescriptor` links
    transitively starting from `mesh_id1`.

    Parameters
    ----------
    mesh_id1 : str
        The MeSH ID whose broader descriptors are queried.
    mesh_id2 : str
        The MeSH ID to look for among the broader descriptors.

    Returns
    -------
    bool
        True if `mesh_id2` is among the returned descriptors; False if it is
        not, if the query failed, or if the response could not be processed.
    """
    query_body = """
        SELECT DISTINCT ?o
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE {
          mesh:%s meshv:broaderDescriptor+ ?o .
        }
        """ % mesh_id1
    response = submit_sparql_query(query_body)
    if response is None:
        return False
    try:
        for binding in response['results']['bindings']:
            uri = binding['o']['value']
            # Strip the MESH prefix off the ID URI
            matched = re.match('http://id.nlm.nih.gov/mesh/([A-Za-z0-9]*)',
                               uri)
            if matched.group(1) == mesh_id2:
                return True
    except Exception:
        # Any problem while processing the response counts as "not found"
        return False
    return False
def get_mesh_tree_numbers(mesh_id):
    """Return MeSH tree IDs associated with a MeSH ID from the resource file.

    IDs starting with 'C' are treated as supplementary concepts: they are
    first mapped to their primary terms and the tree numbers of all of these
    are pooled (without duplicates).

    Parameters
    ----------
    mesh_id : str
        The MeSH ID whose tree IDs should be returned.

    Returns
    -------
    list[str]
        A list of MeSH tree IDs.
    """
    # Handle primary terms (and missing IDs) with a direct lookup
    if not mesh_id or not mesh_id.startswith('C'):
        return mesh_id_to_tree_numbers.get(mesh_id, [])

    # Handle supplementary concepts via their primary mappings
    pooled = set()
    for primary_id in get_primary_mappings(mesh_id):
        pooled |= set(mesh_id_to_tree_numbers.get(primary_id, []))
    return list(pooled)
def get_mesh_tree_numbers_from_web(mesh_id):
    """Return MeSH tree IDs associated with a MeSH ID from the web.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID whose tree IDs should be returned.

    Returns
    -------
    list[str]
        A list of MeSH tree IDs. The list is empty if the query failed or
        the response could not be processed.
    """
    query_body = """
        SELECT DISTINCT ?tn
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE { mesh:%s meshv:treeNumber ?tn }
        """ % mesh_id
    response = submit_sparql_query(query_body)
    if response is None:
        return []
    try:
        # Each binding holds a tree number URI; keep only the part after the
        # MeSH prefix.
        return [
            re.match('http://id.nlm.nih.gov/mesh/([A-Z0-9.]*)',
                     binding['tn']['value']).groups()[0]
            for binding in response['results']['bindings']
        ]
    except Exception:
        return []
def has_tree_prefix(mesh_id, tree_prefix):
    """Return True if the given MeSH ID has the given tree prefix.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID to check.
    tree_prefix : str
        The prefix to look for at the start of the ID's tree numbers.

    Returns
    -------
    bool
        True if at least one tree number of the MeSH ID starts with the
        prefix, otherwise False.
    """
    for tree_number in get_mesh_tree_numbers(mesh_id):
        if tree_number.startswith(tree_prefix):
            return True
    return False
def is_disease(mesh_id):
    """Return True if the given MeSH ID is a disease.

    This is decided by checking for a tree number starting with 'C'.
    """
    return has_tree_prefix(mesh_id, tree_prefix='C')
def is_molecular(mesh_id):
    """Return True if the given MeSH ID is a chemical or drug (incl protein).

    This is decided by checking for a tree number starting with 'D'.
    """
    return has_tree_prefix(mesh_id, tree_prefix='D')
def is_enzyme(mesh_id):
    """Return True if the given MeSH ID is an enzyme.

    This is decided by checking for a tree number starting with 'D08'.
    """
    return has_tree_prefix(mesh_id, tree_prefix='D08')
def is_protein(mesh_id):
    """Return True if the given MeSH ID is a protein.

    This is decided by checking for a tree number starting with 'D12'.
    """
    return has_tree_prefix(mesh_id, tree_prefix='D12')
def get_go_id(mesh_id):
    """Return a GO ID corresponding to the given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        MeSH ID to map to GO

    Returns
    -------
    str
        The GO ID corresponding to the given MeSH ID, or None if not
        available (including when the ID maps to a namespace other than GO).
    """
    mapping = get_db_mapping(mesh_id)
    return mapping[1] if mapping and mapping[0] == 'GO' else None
def get_mesh_id_from_go_id(go_id):
    """Return a MeSH ID corresponding to the given GO ID.

    Parameters
    ----------
    go_id : str
        GO ID to map to MeSH

    Returns
    -------
    str
        The MeSH ID corresponding to the given GO ID, or None if not
        available.
    """
    return get_mesh_id_from_db_id(db_ns='GO', db_id=go_id)
def get_db_mapping(mesh_id):
    """Return mapping to another name space for a MeSH ID, if it exists.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID whose mapping is to be returned.

    Returns
    -------
    tuple or None
        A tuple consisting of a DB namespace and ID for the mapping or None
        if not available.
    """
    mapping = mesh_to_db.get(mesh_id, None)
    return mapping
def get_mesh_id_from_db_id(db_ns, db_id):
    """Return a MeSH ID mapped from another namespace and ID.

    Parameters
    ----------
    db_ns : str
        A namespace corresponding to db_id.
    db_id : str
        An ID in the given namespace.

    Returns
    -------
    str or None
        The MeSH ID corresponding to the given namespace and ID if
        available, otherwise None.
    """
    lookup_key = (db_ns, db_id)
    return db_to_mesh.get(lookup_key)
def get_primary_mappings(db_id: str) -> List[str]:
    """Return the list of primary terms a supplementary term is mapped to.

    See https://www.nlm.nih.gov/mesh/xml_data_elements.html#HeadingMappedTo.

    Parameters
    ----------
    db_id :
        A supplementary MeSH ID.

    Returns
    -------
    :
        The list of primary MeSH terms that the supplementary concept is
        heading-mapped to, or an empty list if the ID has no known mapping.
    """
    try:
        return mesh_supp_to_primary[db_id]
    except KeyError:
        return []
# SPARQL PREFIX declarations that submit_sparql_query places in front of
# every query body before sending it to the MeSH SPARQL endpoint.
mesh_rdf_prefixes = """ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#> PREFIX mesh: <http://id.nlm.nih.gov/mesh/> PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/> PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/> PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/> """