import os
import re
import json
import requests
import itertools
from typing import List
from functools import lru_cache
from os.path import abspath, dirname, join, pardir
from indra.util import read_unicode_csv
# Base URL of the NLM MeSH service; REST and SPARQL request URLs are built
# by appending to it
MESH_URL = 'https://id.nlm.nih.gov/mesh/'
# Resource tables are resolved relative to the location of this module
HERE = dirname(abspath(__file__))
RESOURCES = join(HERE, pardir, 'resources')
MESH_FILE = join(RESOURCES, 'mesh_id_label_mappings.tsv')
MESH_SUPP_FILE = join(RESOURCES, 'mesh_supp_id_label_mappings.tsv')
DB_MAPPINGS = join(RESOURCES, 'mesh_mappings.tsv')
CAS_MAPPINGS = join(RESOURCES, 'mesh_cas_mappings.tsv')
# Lookup tables filled in by _load_mesh_file:
# MeSH ID -> label
mesh_id_to_name = {}
# label -> MeSH ID
mesh_name_to_id = {}
# alternative term -> [MeSH ID, label]
mesh_name_to_id_name = {}
# MeSH ID -> list of tree numbers
mesh_id_to_tree_numbers = {}
# supplementary MeSH ID -> list of primary MeSH IDs it is mapped to
mesh_supp_to_primary = {}
def _load_mesh_file(path, supplementary):
    """Fill the module-level MeSH lookup tables from a TSV resource file.

    Parameters
    ----------
    path : str
        Path to a tab-separated file with exactly four columns per row:
        MeSH ID, label, pipe-separated alternative terms, and a fourth
        column whose meaning depends on `supplementary`.
    supplementary : bool
        If True, the fourth column is split on commas and stored as the
        primary IDs the entry is mapped to. If False, it is split on pipes
        and stored as the tree numbers of the entry.
    """
    for row in read_unicode_csv(path, delimiter='\t'):
        mesh_id, mesh_label, mesh_terms_str, last_column = row
        if supplementary:
            mesh_supp_to_primary[mesh_id] = last_column.split(',')
        elif last_column:
            mesh_id_to_tree_numbers[mesh_id] = last_column.split('|')
        else:
            # This is a rare corner case where an entry is outside the
            # tree structure, e.g., D005260, D008297; such rows are
            # skipped entirely
            continue
        mesh_id_to_name[mesh_id] = mesh_label
        mesh_name_to_id[mesh_label] = mesh_id
        if mesh_terms_str:
            for term in mesh_terms_str.split('|'):
                mesh_name_to_id_name[term] = [mesh_id, mesh_label]
# The primary MeSH table is loaded unconditionally at import time
_load_mesh_file(MESH_FILE, supplementary=False)
# The supplementary table is only loaded if its resource file exists
if os.path.exists(MESH_SUPP_FILE):
    _load_mesh_file(MESH_SUPP_FILE, supplementary=True)
def _load_db_mappings(db_mappings_path, cas_mappings_path):
    """Build one-to-one mappings between MeSH IDs and other namespaces.

    Parameters
    ----------
    db_mappings_path : str
        Path to a tab-separated file with six columns per row, of which the
        second is the MeSH ID, the fourth the target namespace and the
        fifth the target ID.
    cas_mappings_path : str
        Path to a tab-separated file with two columns per row: MeSH ID and
        CAS ID. These rows are recorded under the 'CAS' namespace.

    Returns
    -------
    tuple of dicts
        A dict from MeSH ID to a `(db_ns, db_id)` tuple, and a dict from
        `(db_ns, db_id)` tuples to MeSH ID. A key that occurs on more than
        one row is left out of the respective dict.
    """
    def _db_rows():
        rows = read_unicode_csv(db_mappings_path, delimiter='\t')
        for _, mesh_id, _, db_ns, db_id, _ in rows:
            yield mesh_id, db_ns, db_id

    def _cas_rows():
        rows = read_unicode_csv(cas_mappings_path, delimiter='\t')
        for mesh_id, cas_id in rows:
            yield mesh_id, 'CAS', cas_id

    def _add_unique(table, ambiguous, key, value):
        # A key seen more than once is dropped and never re-added
        if key in ambiguous:
            return
        if key in table:
            ambiguous.add(key)
            del table[key]
        else:
            table[key] = value

    forward, forward_ambiguous = {}, set()
    reverse, reverse_ambiguous = {}, set()
    for mesh_id, db_ns, db_id in itertools.chain(_db_rows(), _cas_rows()):
        target = (db_ns, db_id)
        # Make sure we don't add any one-to-many mappings
        _add_unique(forward, forward_ambiguous, mesh_id, target)
        # Make sure we don't add any one-to-many reverse mappings
        _add_unique(reverse, reverse_ambiguous, target, mesh_id)
    return forward, reverse
# Build the MeSH <-> external namespace mapping tables at import time
mesh_to_db, db_to_mesh = _load_db_mappings(DB_MAPPINGS, CAS_MAPPINGS)
@lru_cache(maxsize=1000)
def get_mesh_name_from_web(mesh_id):
    """Get the MESH label for the given MESH ID using the NLM REST API.

    Results, including None for failed lookups, are memoized by the
    `lru_cache` decorator for up to 1000 distinct IDs.

    Parameters
    ----------
    mesh_id : str
        MESH Identifier, e.g. 'D003094'.

    Returns
    -------
    str
        Label for the MESH ID, or None if the query failed or no label was
        found.
    """
    url = MESH_URL + mesh_id + '.json'
    resp = requests.get(url)
    if resp.status_code != 200:
        return None
    try:
        # Decoding is done inside the try block so that a response body that
        # is not valid JSON (a ValueError subclass is raised) results in
        # None, as documented, rather than an exception
        return resp.json()['label']['@value']
    except (KeyError, IndexError, TypeError, ValueError):
        return None
def get_mesh_name(mesh_id, offline=False):
    """Get the MESH label for the given MESH ID.

    The local mappings table is consulted first; only if the ID is missing
    there (and `offline` is False) is the NLM REST API queried.

    Parameters
    ----------
    mesh_id : str
        MESH Identifier, e.g. 'D003094'.
    offline : bool
        If True, never query the NLM REST API and rely only on the local
        mappings table. Default is False (REST API queries are allowed).

    Returns
    -------
    str
        Label for the MESH ID, or None if the query failed or no label was
        found.
    """
    local_name = mesh_id_to_name.get(mesh_id)
    if local_name is None and not offline:
        # Look up the MESH mapping from NLM if we don't have it locally
        return get_mesh_name_from_web(mesh_id)
    return local_name
def get_mesh_id_name(mesh_term, offline=False):
    """Get the MESH ID and name for the given MESH term.

    The term is first looked up as a label, then as an alternative term, in
    the local mapping tables. If neither matches and `offline` is False, the
    NLM web service is queried.

    Parameters
    ----------
    mesh_term : str
        MESH Descriptor or Concept name, e.g. 'Breast Cancer'.
    offline : bool
        If True, never query the NLM web service and rely only on the local
        mapping tables. Default is False (web queries are allowed).

    Returns
    -------
    tuple of strs
        A 2-tuple of the form `(id, name)` with the ID of the matching
        descriptor and the descriptor name (which may differ from the given
        term if that term is a Concept name). If the query failed, or no
        descriptor corresponding to the name was found, `(None, None)` is
        returned.
    """
    nothing = (None, None)
    if not mesh_term:
        return nothing
    # Exact match on a label
    descriptor_id = mesh_name_to_id.get(mesh_term)
    if descriptor_id is not None:
        return descriptor_id, mesh_term
    # Match on an alternative term
    descriptor_id, descriptor_name = \
        mesh_name_to_id_name.get(mesh_term, nothing)
    if descriptor_id is not None:
        return descriptor_id, descriptor_name
    if not offline:
        # Look up the MESH mapping from NLM if we don't have it locally
        return get_mesh_id_name_from_web(mesh_term)
    return None, None
@lru_cache(maxsize=1000)
def submit_sparql_query(query_body):
    """Submit a SPARQL query to the NLM MeSH endpoint and return the result.

    The prefix declarations in `mesh_rdf_prefixes` are prepended to the
    query body before it is sent. Results, including None for failed
    queries, are memoized by the `lru_cache` decorator.

    Parameters
    ----------
    query_body : str
        The SPARQL query without prefix declarations.

    Returns
    -------
    The decoded JSON response, or None if the response status was not 200
    or the response could not be decoded as JSON.
    """
    full_query = '%s\n%s' % (mesh_rdf_prefixes, query_body)
    resp = requests.get(
        MESH_URL + 'sparql',
        params={'query': full_query, 'format': 'JSON', 'inference': 'true'},
    )
    if resp.status_code == 200:
        try:
            # Parsing can raise if the response carried no usable content
            return resp.json()
        except Exception:
            return None
    return None
def _sparql_regex_literal(text):
    """Escape text for verbatim matching in a quoted SPARQL REGEX pattern."""
    # Neutralize regular expression metacharacters so that the text is
    # matched literally rather than interpreted as a pattern
    pattern = re.sub(r'([\\.^$*+?()\[\]{}|])', r'\\\1', str(text))
    # Escape for the enclosing single-quoted SPARQL string literal:
    # backslashes first, then the quote and raw line breaks
    return (pattern.replace('\\', '\\\\').replace("'", "\\'")
            .replace('\n', '\\n').replace('\r', '\\r'))


def get_mesh_id_name_from_web(mesh_term):
    """Get the MESH ID and name for the given MESH term using the NLM REST API.

    The term is matched case-insensitively and as a whole against
    descriptor and concept labels. It is escaped before being placed in the
    query so that quotes or regular expression metacharacters in the term
    cannot alter the query.

    Parameters
    ----------
    mesh_term : str
        MESH Descriptor or Concept name, e.g. 'Breast Cancer'.

    Returns
    -------
    tuple of strs
        Returns a 2-tuple of the form `(id, name)` with the ID of the
        descriptor corresponding to the MESH label, and the descriptor name
        (which may not exactly match the name provided as an argument if it is
        a Concept name). If the query failed, or no descriptor corresponding to
        the name was found, returns a tuple of (None, None).
    """
    escaped_term = _sparql_regex_literal(mesh_term)
    query_body = """
SELECT ?d ?dName ?c ?cName
FROM <http://id.nlm.nih.gov/mesh>
WHERE {
?d a meshv:Descriptor .
?d meshv:concept ?c .
?d rdfs:label ?dName .
?c rdfs:label ?cName
FILTER (REGEX(?dName,'^%s$','i') || REGEX(?cName,'^%s$','i'))
}
ORDER BY ?d
""" % (escaped_term, escaped_term)
    mesh_json = submit_sparql_query(query_body)
    if mesh_json is None:
        return None, None
    try:
        # Choose the first entry (should usually be only one)
        first_binding = mesh_json['results']['bindings'][0]
        id_uri = first_binding['d']['value']
        name = first_binding['dName']['value']
    except (KeyError, IndexError, TypeError):
        return None, None
    # Strip the MESH prefix off the ID URI
    m = re.match('http://id.nlm.nih.gov/mesh/([A-Za-z0-9]*)', id_uri)
    if m is None:
        # An unexpected URI counts as a failed lookup; an assert would be
        # stripped when Python runs with optimizations enabled
        return None, None
    return m.group(1), name
def mesh_isa(mesh_id1, mesh_id2):
    """Return True if one MeSH ID is under another according to tree numbers.

    The check uses the tree numbers returned by `get_mesh_tree_numbers` and
    succeeds if any tree number of the first ID starts with any tree number
    of the second ID.

    Parameters
    ----------
    mesh_id1 : str
        The MeSH ID of the candidate child term.
    mesh_id2 : str
        The MeSH ID of the candidate parent term.

    Returns
    -------
    bool
        True if a tree number of `mesh_id1` has a tree number of `mesh_id2`
        as a prefix, otherwise False.
    """
    child_tree_numbers = get_mesh_tree_numbers(mesh_id1)
    parent_tree_numbers = get_mesh_tree_numbers(mesh_id2)
    return any(child.startswith(parent)
               for child in child_tree_numbers
               for parent in parent_tree_numbers)
def mesh_isa_web(mesh_id1, mesh_id2):
    """Return True if one MeSH ID has another as a broader descriptor.

    The broader descriptors of the first ID are obtained with a SPARQL
    query submitted through `submit_sparql_query`.

    Parameters
    ----------
    mesh_id1 : str
        The MeSH ID whose broader descriptors are queried.
    mesh_id2 : str
        The MeSH ID looked for among the broader descriptors.

    Returns
    -------
    bool
        True if `mesh_id2` is among the returned broader descriptors. False
        otherwise, including when the query fails or its result cannot be
        processed.
    """
    query_body = """
SELECT DISTINCT ?o
FROM <http://id.nlm.nih.gov/mesh>
WHERE {
mesh:%s meshv:broaderDescriptor+ ?o .
}
""" % mesh_id1
    response = submit_sparql_query(query_body)
    if response is None:
        return False
    uri_pattern = 'http://id.nlm.nih.gov/mesh/([A-Za-z0-9]*)'
    try:
        for binding in response['results']['bindings']:
            # Strip the MESH prefix off the ID URI
            broader_id = re.match(uri_pattern, binding['o']['value']).group(1)
            if mesh_id2 == broader_id:
                return True
    except Exception:
        return False
    return False
def get_mesh_tree_numbers(mesh_id):
    """Return MeSH tree IDs associated with a MeSH ID from the resource file.

    IDs starting with 'C' are treated as supplementary concepts: they are
    first mapped to primary terms and the tree numbers of all mapped primary
    terms are collected.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID whose tree IDs should be returned.

    Returns
    -------
    list[str]
        A list of MeSH tree IDs; empty if none are known.
    """
    # Handle primary terms (and empty IDs)
    if not mesh_id or not mesh_id.startswith('C'):
        return mesh_id_to_tree_numbers.get(mesh_id, [])
    # Handle supplementary concepts
    collected = set()
    for primary_id in get_primary_mappings(mesh_id):
        collected.update(mesh_id_to_tree_numbers.get(primary_id, []))
    return list(collected)
def get_mesh_tree_numbers_from_web(mesh_id):
    """Return MeSH tree IDs associated with a MeSH ID from the web.

    The tree numbers are obtained with a SPARQL query submitted through
    `submit_sparql_query`.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID whose tree IDs should be returned.

    Returns
    -------
    list[str]
        A list of MeSH tree IDs; empty if the query failed or its result
        could not be processed.
    """
    query_body = """
SELECT DISTINCT ?tn
FROM <http://id.nlm.nih.gov/mesh>
WHERE {
mesh:%s meshv:treeNumber ?tn
}
""" % mesh_id
    response = submit_sparql_query(query_body)
    if response is None:
        return []
    uri_pattern = 'http://id.nlm.nih.gov/mesh/([A-Z0-9.]*)'
    try:
        # Strip the MESH prefix off each tree number URI
        return [
            re.match(uri_pattern, binding['tn']['value']).groups()[0]
            for binding in response['results']['bindings']
        ]
    except Exception:
        return []
def has_tree_prefix(mesh_id, tree_prefix):
    """Return True if the given MeSH ID has the given tree prefix.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID to check.
    tree_prefix : str
        The prefix to look for at the start of the ID's tree numbers.

    Returns
    -------
    bool
        True if at least one tree number of the ID starts with the prefix.
    """
    for tree_number in get_mesh_tree_numbers(mesh_id):
        if tree_number.startswith(tree_prefix):
            return True
    return False
def is_disease(mesh_id):
    """Return True if the given MeSH ID is a disease.

    A MeSH ID counts as a disease if one of its tree numbers starts
    with 'C'.
    """
    disease_prefix = 'C'
    return has_tree_prefix(mesh_id, disease_prefix)
def is_molecular(mesh_id):
    """Return True if the given MeSH ID is a chemical or drug (incl protein).

    A MeSH ID counts as molecular if one of its tree numbers starts
    with 'D'.
    """
    molecular_prefix = 'D'
    return has_tree_prefix(mesh_id, molecular_prefix)
def is_enzyme(mesh_id):
    """Return True if the given MeSH ID is an enzyme.

    A MeSH ID counts as an enzyme if one of its tree numbers starts
    with 'D08'.
    """
    enzyme_prefix = 'D08'
    return has_tree_prefix(mesh_id, enzyme_prefix)
def is_protein(mesh_id):
    """Return True if the given MeSH ID is a protein.

    A MeSH ID counts as a protein if one of its tree numbers starts
    with 'D12'.
    """
    protein_prefix = 'D12'
    return has_tree_prefix(mesh_id, protein_prefix)
def get_go_id(mesh_id):
    """Return a GO ID corresponding to the given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        MeSH ID to map to GO

    Returns
    -------
    str
        The GO ID corresponding to the given MeSH ID, or None if the MeSH ID
        has no mapping or is mapped to a namespace other than GO.
    """
    mapping = get_db_mapping(mesh_id)
    if not mapping or mapping[0] != 'GO':
        return None
    return mapping[1]
def get_mesh_id_from_go_id(go_id):
    """Return a MeSH ID corresponding to the given GO ID.

    Parameters
    ----------
    go_id : str
        GO ID to map to MeSH

    Returns
    -------
    str
        The MeSH ID corresponding to the given GO ID, or None if not
        available.
    """
    # Delegate to the generic reverse lookup with the GO namespace
    return get_mesh_id_from_db_id(db_ns='GO', db_id=go_id)
def get_db_mapping(mesh_id):
    """Return mapping to another name space for a MeSH ID, if it exists.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID whose mapping is to be returned.

    Returns
    -------
    tuple or None
        A tuple consisting of a DB namespace and ID for the mapping or None
        if not available.
    """
    try:
        return mesh_to_db[mesh_id]
    except KeyError:
        return None
def get_mesh_id_from_db_id(db_ns, db_id):
    """Return a MeSH ID mapped from another namespace and ID.

    Parameters
    ----------
    db_ns : str
        A namespace corresponding to db_id.
    db_id : str
        An ID in the given namespace.

    Returns
    -------
    str or None
        The MeSH ID corresponding to the given namespace and ID if available,
        otherwise None.
    """
    # The reverse mapping table is keyed by (namespace, ID) tuples
    lookup_key = (db_ns, db_id)
    return db_to_mesh.get(lookup_key)
def get_primary_mappings(db_id: str) -> List[str]:
    """Return the list of primary terms a supplementary term is mapped to.

    See https://www.nlm.nih.gov/mesh/xml_data_elements.html#HeadingMappedTo.

    Parameters
    ----------
    db_id :
        A supplementary MeSH ID.

    Returns
    -------
    :
        The list of primary MeSH terms that the supplementary concept
        is heading-mapped to; empty if the ID has no known mapping.
    """
    try:
        return mesh_supp_to_primary[db_id]
    except KeyError:
        return []
# Namespace prefix declarations that submit_sparql_query prepends to every
# query body, so that queries can use short prefixes such as mesh: and meshv:
mesh_rdf_prefixes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
"""