Source code for indra.statements.validate

"""This module implements a number of functions that can be used to
validate INDRA Statements. The available functions include ones that
raise custom exceptions derived from ValueError if an invalidity is
found. These come with a helpful error message that can be caught
and printed to learn about the specific issue. Another set of functions
do not raise exceptions, rather, return True or False depending on whether
the given input is valid or invalid.

For validating namespaces and identifiers, there are two validators available,
one that uses data from identifiers.org and another for Bioregistry.
"""

import re
import logging
from indra.statements import *
from indra.databases import bioregistry_client, identifiers


logger = logging.getLogger(__name__)


# These are namespaces that can appear in db_refs but are actually not
# representing grounding.
non_grounding = {
    'TEXT', 'TEXT_NORM'
}


text_ref_patterns = {
    'PMID': re.compile(r'(\d+)'),
    'PMCID': re.compile(r'PMC(\d+)'),
    # See https://www.crossref.org/blog/dois-and-matching-regular-expressions/
    # here I added a-z to allow non-capitalized DOI text, we could
    # change this is we want strict capital letters in the DOI
    'DOI':  re.compile(r'^10.\d{4,9}/[-._;()/:A-Za-z0-9]+'),
}


[docs]class UnknownNamespace(ValueError): def __str__(self): return f'Unknown namespace: {self.args[0]}'
[docs]class MissingIdentifier(ValueError): """Raised when the identifier is None.""" def __str__(self): return f'Missing identifier for {self.args[0]}'
[docs]class InvalidIdentifier(ValueError): """Raised when the identifier doesn't match the pattern.""" def __str__(self): return f'Invalid identifier: {self.args[1]} for {self.args[0]} ' \ f'pattern {self.args[2]}'
[docs]class UnknownIdentifier(ValueError): """Raise when the database is neither registered with identifiers.org or manually added to the :data:`indra.databases.identifiers.non_registry` list. """ def __str__(self): return f'Unknown identifier: {self.args[0]}:{self.args[1]}'
[docs]class InvalidTextRefs(ValueError): pass
[docs]class InvalidContext(ValueError): pass
[docs]class InvalidAgent(ValueError): pass
[docs]class InvalidStatement(ValueError): pass
[docs]class IdentifiersValidator: """A class that can be used to validate INDRA Statements.""" @staticmethod def assert_valid_ns(db_ns): if db_ns in non_grounding or db_ns in identifiers.non_registry: return identifiers_ns = \ identifiers.identifiers_mappings.get(db_ns, db_ns.lower()) if identifiers_ns in identifiers.identifiers_registry: return raise UnknownNamespace(db_ns) @staticmethod def assert_valid_id(db_ns, db_id): if db_id is None: raise MissingIdentifier(db_ns, None) if db_ns in non_grounding or db_ns in identifiers.non_registry: return identifiers_ns = \ identifiers.identifiers_mappings.get(db_ns, db_ns.lower()) if identifiers_ns in identifiers.identifiers_registry: pattern = identifiers.identifiers_registry[ identifiers_ns]['pattern_compiled'] if pattern.match(db_id): return else: raise InvalidIdentifier(db_ns, db_id, pattern.pattern) else: raise UnknownIdentifier(db_ns, db_id)
[docs]class BioregistryValidator: """A class that can be used to validate INDRA Statements.""" @staticmethod def assert_valid_ns(db_ns): if db_ns in non_grounding: return prefix = bioregistry_client.bioregistry_overrides.get(db_ns, db_ns.lower()) if prefix in bioregistry_client.registry: return raise UnknownNamespace(db_ns) @staticmethod def assert_valid_id(db_ns, db_id): if db_id is None: raise MissingIdentifier(db_ns, None) if db_ns in non_grounding: return prefix = bioregistry_client.bioregistry_overrides.get(db_ns, db_ns.lower()) if prefix in bioregistry_client.registry: pattern = bioregistry_client.registry[prefix].get( 'pattern_compiled') if not pattern or pattern.match(db_id): return else: raise InvalidIdentifier(db_ns, db_id, pattern.pattern) else: raise UnknownIdentifier(db_ns, db_id)
default_validator = IdentifiersValidator()
[docs]def validate_ns(db_ns, validator=default_validator): """Return True if the given namespace is known. Parameters ---------- db_ns : str The namespace. Returns ------- bool True if the given namepsace is known, otherwise False. """ try: assert_valid_ns(db_ns, validator=validator) return True except ValueError: return False
[docs]def assert_valid_ns(db_ns, validator=default_validator): """Raise UnknownNamespace error if the given namespace is unknown. Parameters ---------- db_ns : str The namespace. """ return validator.assert_valid_ns(db_ns)
[docs]def validate_id(db_ns, db_id, validator=default_validator): """Return True if the given ID is valid in the given namespace. Parameters ---------- db_ns : str The namespace. db_id : str The ID. Returns ------- bool True if the given ID is valid in the given namespace. """ try: assert_valid_id(db_ns, db_id, validator=validator) return True except ValueError: return False
[docs]def assert_valid_id(db_ns, db_id, validator=default_validator): """Raise InvalidIdentifier error if the ID is invalid in the given namespace. Parameters ---------- db_ns : str The namespace. db_id : str The ID. """ return validator.assert_valid_id(db_ns, db_id)
[docs]def validate_db_refs(db_refs, validator=default_validator): """Return True if all the entries in the given db_refs are valid. Parameters ---------- db_refs : dict A dict of database references, typically part of an INDRA Agent. Returns ------- bool True if all the entries are valid, else False. """ try: assert_valid_db_refs(db_refs, validator=validator) return True except ValueError: return False
[docs]def assert_valid_db_refs(db_refs, validator=default_validator): """Raise InvalidIdentifier error if any of the entries in the given db_refs are invalid. Parameters ---------- db_refs : dict A dict of database references, typically part of an INDRA Agent. """ for db_ns, db_id in db_refs.items(): assert_valid_ns(db_ns, validator=validator) assert_valid_id(db_ns, db_id, validator=validator)
[docs]def assert_valid_agent(agent, validator=default_validator): """Raise InvalidAgent is there is an invalidity in the Agent. Parameters ---------- agent : indra.statements.Agent The agent to check. """ if agent is None: return if agent.name is None: raise InvalidAgent('Agent missing name') assert_valid_db_refs(agent.db_refs, validator=validator)
[docs]def validate_agent(agent, validator=default_validator): """Return False if is there is an invalidity in the Agent, otherwise True. Parameters ---------- agent : indra.statements.Agent The agent to check. Returns ------- bool True if the agent is valid, False otherwise. """ try: assert_valid_agent(agent, validator=validator) return True except ValueError: return False
[docs]def assert_valid_statement_semantics(stmt): """Raise InvalidStatement error if the given statement is invalid. Parameters ---------- statement : indra.statements.Statement The statement to check. """ if all(a is None for a in stmt.agent_list()): raise InvalidStatement('Statement with all None agents') if isinstance(stmt, Complex): if any(m is None for m in stmt.members): raise InvalidStatement('Complex with None agent.') elif isinstance(stmt, Conversion): if any(o is None for o in stmt.obj_from): raise InvalidStatement('Conversion with None reactant.') if any(o is None for o in stmt.obj_to): raise InvalidStatement('Conversion with None product.') elif isinstance(stmt, RegulateActivity): if stmt.subj is None: raise InvalidStatement('Regulation missing subject.') if stmt.obj is None: raise InvalidStatement('Regulation missing object.') elif isinstance(stmt, RegulateAmount): if stmt.obj is None: raise InvalidStatement('Regulation missing object.') elif isinstance(stmt, (Gap, Gef)): if any(a is None for a in stmt.agent_list()): raise InvalidStatement('Gap/Gef missing agent.') elif isinstance(stmt, Modification): if stmt.sub is None: raise InvalidStatement('Modification missing substrate.') elif isinstance(stmt, Translocation): if stmt.from_location is None and stmt.to_location is None: raise InvalidStatement('Translocation with no locations.')
[docs]def validate_statement(stmt, validator=default_validator): """Return True if all the groundings in the given statement are valid. Parameters ---------- stmt : indra.statements.Statement An INDRA Statement to validate. Returns ------- bool True if all the db_refs entries of the Agents in the given Statement are valid, else False. """ try: assert_valid_statement(stmt, validator=validator) return True # Some deeper validation checks in statements raise TypeErrors except (ValueError, TypeError): return False
[docs]def assert_valid_statement(stmt, validator=default_validator): """Raise an error if there is anything invalid in the given statement. Parameters ---------- stmt : indra.statements.Statement An INDRA Statement to validate. """ assert_valid_statement_semantics(stmt) for agent in stmt.real_agent_list(): assert_valid_agent(agent, validator=validator) for ev in stmt.evidence: assert_valid_evidence(ev, validator=validator)
[docs]def assert_valid_statements(stmts, validator=default_validator): """Raise an error of any of the given statements is invalid. Parameters ---------- stmts : list[indra.statements.Statement] A list of INDRA Statements to validate. """ for stmt in stmts: assert_valid_statement(stmt, validator=validator)
[docs]def assert_valid_text_refs(text_refs): """Raise an InvalidTextRefs error if the given text refs are invalid.""" for ns in text_refs: if ns != ns.upper(): raise InvalidTextRefs(ns) for ns, pattern in text_ref_patterns.items(): if ns in text_refs: if text_refs[ns] is None: raise InvalidTextRefs(f'{ns}:{text_refs[ns]}') elif not re.match(pattern, text_refs[ns]): raise InvalidTextRefs(f'{ns}:{text_refs[ns]}')
[docs]def validate_text_refs(text_refs): """Return True if the given text refs are valid.""" try: assert_valid_text_refs(text_refs) return True except ValueError: return False
[docs]def assert_valid_pmid_text_refs(evidence): """Return True if the pmid attribute is consistent with text refs""" tr_pmid = evidence.text_refs.get('PMID') if tr_pmid is not None: if evidence.pmid != tr_pmid: raise InvalidTextRefs(f'Evidence pmid {evidence.pmid} doesn\'t ' f'match text refs pmid {tr_pmid}')
[docs]def assert_valid_bio_context(context, validator=default_validator): """Raise InvalidContext error if the given bio-context is invalid. Parameters ---------- context : indra.statements.BioContext The context object to validate. """ # We shouldn't make a context if the attributes are all None if all(getattr(context, attr) is None for attr in BioContext.attrs): raise InvalidContext('All context attributes are None.') # Here we check db_refs validity for attr in BioContext.attrs: val = getattr(context, attr) if val is None: continue if val is not None and not isinstance(val, RefContext): raise InvalidContext(f'Invalid context entry for {attr}') assert_valid_db_refs(val.db_refs, validator=validator)
[docs]def assert_valid_context(context, validator=default_validator): """Raise InvalidContext error if the given context is invalid. Parameters ---------- context : indra.statements.Context The context object to validate. """ if context is None: return elif isinstance(context, BioContext): assert_valid_bio_context(context, validator=validator) elif isinstance(context, WorldContext): return
[docs]def assert_valid_evidence(evidence, validator=default_validator): """Raise an error if the given evidence is invalid. Parameters ---------- evidence : indra.statements.Evidence The evidence object to validate. """ assert_valid_pmid_text_refs(evidence) assert_valid_text_refs(evidence.text_refs) assert_valid_context(evidence.context, validator=validator)
[docs]def validate_evidence(evidence, validator=default_validator): """Return False if the given evidence is invalid, otherwise True. Parameters ---------- evidence : indra.statements.Evidence The evidence object to validate. Returns ------- bool True if the evidence is valid, otherwise False. """ try: assert_valid_evidence(evidence, validator=validator) return True except ValueError: return False