Source code for indra.databases.bioregistry_client

"""This module implements a client for using namespace and identifier
information from the Bioregistry (bioregistry.io)."""
import re
from indra.resources import load_resource_json

# Explicit INDRA namespace -> Bioregistry prefix mappings. These are needed
# for INDRA namespaces whose Bioregistry prefix cannot be found through the
# curated synonyms or by simply lower-casing the namespace. Lookups in
# get_bioregistry_prefix consult this table before anything else.
bioregistry_overrides = {
    'UP': 'uniprot',
    'UPPRO': 'uniprot.chain',
    'UPISO': 'uniprot.isoform',
    'UPLOC': 'uniprot.location',
    'REFSEQ_PROT': 'refseq',
    'PF': 'pfam',
    'IP': 'interpro',
    'NONCODE': 'noncodev4.rna',
    'LNCRNADB': 'rnacentral',
    'MIRBASEM': 'mirbase.mature',
    # NOTE: EGID and NCBI both map to the same Bioregistry prefix
    'EGID': 'ncbigene',
    'NCBI': 'ncbigene',
    'HGNC_GROUP': 'hgnc.genegroup',
    'LINCS': 'lincs.smallmolecule',
    'PUBCHEM': 'pubchem.compound',
    'CHEMBL': 'chembl.compound',
    'CVCL': 'cellosaurus',
    'HMS-LINCS': 'hms.lincs.compound',
    'NXPFA': 'nextprot.family',
    'TAXONOMY': 'ncbitaxon',
}


# Bioregistry prefix -> INDRA namespace, obtained by inverting the table
# above. Where several INDRA namespaces share one prefix, the entry listed
# last above wins, so 'ncbigene' maps back to 'NCBI' rather than 'EGID'.
bioregistry_reverse_overrides = {v: k for k, v in bioregistry_overrides.items()}


def get_ns_from_bioregistry(bioregistry_prefix):
    """Map a Bioregistry prefix to the corresponding INDRA namespace.

    Parameters
    ----------
    bioregistry_prefix : str
        The Bioregistry prefix to look up.

    Returns
    -------
    str or None
        The INDRA namespace for the prefix, or None if the prefix is not
        a key of the loaded registry.
    """
    # A prefix unknown to the registry has no INDRA namespace
    if bioregistry_prefix not in registry:
        return None
    # An explicit reverse override takes precedence; without one the INDRA
    # namespace is just the upper-cased prefix
    return (bioregistry_reverse_overrides.get(bioregistry_prefix)
            or bioregistry_prefix.upper())
def get_ns_id_from_bioregistry(bioregistry_prefix, bioregistry_id):
    """Convert a Bioregistry prefix and ID into an INDRA namespace and ID.

    Parameters
    ----------
    bioregistry_prefix : str
        The Bioregistry prefix.
    bioregistry_id : str
        The identifier within that prefix.

    Returns
    -------
    tuple
        The INDRA namespace and ID. Both elements are None if no INDRA
        namespace can be determined for the prefix.
    """
    namespace = get_ns_from_bioregistry(bioregistry_prefix)
    if not namespace:
        return None, None
    entry = registry[bioregistry_prefix]
    banana = entry.get('banana')
    # Without a banana the identifier is used unchanged
    if not banana:
        return namespace, bioregistry_id
    # The separator between banana and identifier defaults to the standard
    # colon unless a different one is given in the registry entry
    separator = entry.get('banana_peel', ':')
    return namespace, '%s%s%s' % (banana, separator, bioregistry_id)
def get_ns_id_from_bioregistry_curie(bioregistry_curie):
    """Return the INDRA namespace and ID for a Bioregistry CURIE.

    Parameters
    ----------
    bioregistry_curie : str
        A CURIE of the form ``prefix:identifier``. Only the first colon
        is treated as the separator, so the identifier part may itself
        contain colons.

    Returns
    -------
    tuple
        The INDRA namespace and ID. Both elements are None if the string
        contains no colon (and therefore has no prefix) or if no INDRA
        namespace can be determined for the prefix.
    """
    # partition splits at the first colon only, and unlike unpacking the
    # result of split it cannot fail when the colon is missing
    prefix, separator, identifier = bioregistry_curie.partition(':')
    if not separator:
        # Not a CURIE: report it the same way as an unknown prefix
        return None, None
    return get_ns_id_from_bioregistry(prefix, identifier)
def get_bioregistry_prefix(db_ns):
    """Look up the Bioregistry prefix for an INDRA namespace.

    Parameters
    ----------
    db_ns : str
        The INDRA namespace.

    Returns
    -------
    str or None
        The matching Bioregistry prefix, or None if there is no match.
    """
    # Explicit overrides are consulted first, then the curated synonyms
    for mapping in (bioregistry_overrides, synonym_reverse):
        prefix = mapping.get(db_ns)
        if prefix is not None:
            return prefix
    # As a last resort, the lower-cased namespace may itself be a valid
    # prefix; if it is not, there is no match
    lowered = db_ns.lower()
    return lowered if lowered in registry else None
def get_bioregistry_curie(db_ns, db_id):
    """Build the Bioregistry CURIE for an INDRA namespace and ID.

    Parameters
    ----------
    db_ns : str
        The INDRA namespace.
    db_id : str
        The identifier within that namespace.

    Returns
    -------
    str or None
        The CURIE as ``prefix:identifier``, or None if no Bioregistry
        prefix is found for the namespace.
    """
    prefix = get_bioregistry_prefix(db_ns)
    if not prefix:
        return None
    entry = registry[prefix]
    banana = entry.get('banana')
    peel = entry.get('banana_peel', ':')
    if banana and db_id.startswith(banana):
        # Drop the banana together with its separator (banana peel). The
        # peel is usually one character long but can be an empty string.
        # NOTE: only the banana itself is checked; the peel is removed by
        # length without verifying that it actually follows the banana.
        skip = len(banana) + len(peel)
        db_id = db_id[skip:]
    return '%s:%s' % (prefix, db_id)
def get_bioregistry_url(db_ns, db_id):
    """Build the Bioregistry URL for an INDRA namespace and ID.

    Parameters
    ----------
    db_ns : str
        The INDRA namespace.
    db_id : str
        The identifier within that namespace.

    Returns
    -------
    str or None
        The bioregistry.io URL for the corresponding CURIE, or None if no
        CURIE can be constructed.
    """
    curie = get_bioregistry_curie(db_ns, db_id)
    # The URL is the site root followed by the CURIE
    return 'https://bioregistry.io/%s' % curie if curie else None
def ensure_prefix_if_needed(db_ns, db_id):
    """Return the ID with its namespace prefix (banana) added if required.

    Parameters
    ----------
    db_ns : str
        The INDRA namespace.
    db_id : str
        The identifier within that namespace.

    Returns
    -------
    str
        The identifier starting with the banana and separator registered
        for the namespace. The ID is returned unchanged if the namespace
        has no Bioregistry prefix, if the prefix has no banana, or if the
        ID already starts with banana and separator.
    """
    prefix = get_bioregistry_prefix(db_ns)
    # Unknown namespaces cannot need a banana
    if not prefix:
        return db_id
    entry = registry[prefix]
    banana = entry.get('banana')
    banana_peel = entry.get('banana_peel', ':')
    if not banana:
        return db_id
    # Leave IDs alone that already carry the banana and separator
    if db_id.startswith(f'{banana}{banana_peel}'):
        return db_id
    return f'{banana}{banana_peel}{db_id}'
def _load_bioregistry():
    """Load the Bioregistry resource file and derive lookup helpers.

    Each registry entry that has a 'pattern' gets an additional
    'pattern_compiled' key holding the pre-compiled regular expression.
    If the entry has a banana, the compiled pattern expects the banana
    and its separator (banana peel) in front of the identifier.

    Returns
    -------
    tuple
        The registry dict loaded from 'bioregistry.json' (with compiled
        patterns added in place) and a dict mapping each curated synonym
        to the prefix of the entry that lists it.
    """
    registry = load_resource_json('bioregistry.json')
    synonym_reverse = {}
    for prefix, entry in registry.items():
        # If there is a pattern we make a pre-compiled version of it
        # for faster matching.
        if 'pattern' in entry:
            pattern = entry['pattern']
            # If there is a banana, we need to add it to the pattern.
            # A missing or empty banana is treated as no banana, in line
            # with how the lookup functions in this module test for it.
            banana = entry.get('banana')
            if banana:
                banana_peel = entry.get('banana_peel', ':')
                # Banana and peel are literal text, so they are escaped
                # to keep regex metacharacters in them from being
                # interpreted. The pattern's own leading anchor is
                # dropped since the combined pattern is anchored anew.
                pattern = '^%s%s%s' % (
                    re.escape(banana),
                    re.escape(banana_peel),
                    pattern[1:] if pattern.startswith('^') else pattern,
                )
            entry['pattern_compiled'] = re.compile(pattern)
        # If the same synonym is listed by several entries, the entry
        # iterated last wins.
        for synonym in entry.get('synonyms', []):
            synonym_reverse[synonym] = prefix
    return registry, synonym_reverse


# Loaded once at import time; the functions in this module read these
# module-level tables.
registry, synonym_reverse = _load_bioregistry()