# Source code for indra.tools.fix_invalidities

# Names exported by ``from indra.tools.fix_invalidities import *``; each
# entry corresponds to a function defined in this module.
__all__ = ['fix_invalidities', 'fix_invalidities_db_refs',
           'fix_invalidities_agent', 'fix_invalidities_context',
           'fix_invalidities_stmt', 'fix_invalidities_evidence']

import re
import copy
from typing import List, Mapping
from indra.databases.identifiers import ensure_prefix_if_needed as \
    ensure_prefix_if_needed_identifiers
from indra.databases.identifiers import identifiers_registry
from indra.databases.bioregistry_client import ensure_prefix_if_needed as \
    ensure_prefix_if_needed_bioregistry
from indra.statements.validate import text_ref_patterns
from indra.statements import Evidence, Statement, Agent, BioContext, \
    Translocation



[docs]
def fix_invalidities(stmts: List[Statement]) -> List[Statement]:
    """Return the given Statements with their invalidities fixed.

    Every retained Statement is fixed in place. A Statement is left out of
    the result when it has a known issue for which there is no fix, e.g.,
    a Translocation statement missing both location parameters.

    Parameters
    ----------
    stmts :
        A list of INDRA Statements.

    Returns
    -------
    :
        A new list containing the retained (and fixed) statements, in
        their original order.
    """
    kept = []
    for stmt in stmts:
        # A Translocation with neither a source nor a target location
        # cannot be repaired, so it is filtered out rather than fixed.
        unfixable = isinstance(stmt, Translocation) and \
            not (stmt.from_location or stmt.to_location)
        if not unfixable:
            fix_invalidities_stmt(stmt)
            kept.append(stmt)
    return kept




[docs]
def fix_invalidities_stmt(stmt: Statement):
    """Fix invalidities of a single INDRA Statement in place.

    All Evidence objects of the Statement are fixed first, followed by
    each Agent returned by the Statement's ``real_agent_list()``.

    Parameters
    ----------
    stmt :
        The INDRA Statement to fix. It is modified in place and nothing
        is returned.
    """
    for evidence in stmt.evidence:
        fix_invalidities_evidence(evidence)
    for real_agent in stmt.real_agent_list():
        fix_invalidities_agent(real_agent)




[docs]
def fix_invalidities_evidence(ev: Evidence):
    """Fix invalidities of a single INDRA Evidence in place.

    The following fixes are applied, in this order:

    - text_refs entries whose value is None are removed and keys that are
      not upper case are replaced by their upper-case form,
    - a ``pmid`` attribute or a PMID text ref not matching the PMID
      pattern is discarded,
    - if only one of ``pmid`` and the PMID text ref is set, its value is
      copied to the other,
    - DOI and PMC text refs not matching their pattern are removed,
    - the context, if any, is fixed via ``fix_invalidities_context``.

    Parameters
    ----------
    ev :
        The INDRA Evidence to fix. It is modified in place and nothing
        is returned.
    """
    refs = ev.text_refs

    # Iterate over a snapshot since keys are removed/renamed in the loop
    for key, value in copy.deepcopy(refs).items():
        if value is None:
            refs.pop(key, None)
            continue
        if key.isupper():
            continue
        refs.pop(key)
        refs[key.upper()] = value

    # Discard PMIDs that do not match the expected pattern
    if ev.pmid and not re.match(text_ref_patterns['PMID'], ev.pmid):
        ev.pmid = None
    ref_pmid = refs.get('PMID')
    if ref_pmid and not re.match(text_ref_patterns['PMID'], ref_pmid):
        refs.pop('PMID', None)

    # Make the pmid attribute and the PMID text ref agree when only one
    # of the two is available
    ref_pmid = refs.get('PMID')
    if ev.pmid is None and ref_pmid is not None:
        ev.pmid = ref_pmid
    elif ref_pmid is None and ev.pmid is not None:
        refs['PMID'] = ev.pmid

    # Remaining pattern-checked text refs
    for ref_key in ('DOI', 'PMC'):
        if ref_key in refs and \
                not re.match(text_ref_patterns[ref_key], refs[ref_key]):
            refs.pop(ref_key, None)

    if ev.context is not None:
        fix_invalidities_context(ev.context)




[docs]
def fix_invalidities_agent(agent: Agent):
    """Fix invalidities of a single INDRA Agent in place.

    The Agent's ``db_refs`` attribute is replaced by the dict returned
    from ``fix_invalidities_db_refs``.

    Parameters
    ----------
    agent :
        The INDRA Agent to fix. It is modified in place and nothing is
        returned.
    """
    fixed_refs = fix_invalidities_db_refs(agent.db_refs)
    agent.db_refs = fixed_refs




[docs]
def fix_invalidities_db_refs(db_refs: Mapping[str, str]) -> Mapping[str, str]:
    """Return a fixed version of a db_refs grounding dict.

    The input mapping is never modified; all fixes are applied to a new
    dict which is then returned.

    Parameters
    ----------
    db_refs :
        A grounding dict keyed by name space. Entries whose value is None
        are dropped.

    Returns
    -------
    :
        A new dict with known invalid keys and values removed, renamed or
        normalized.
    """
    # Drop missing values first, working on a copy: this guarantees that
    # every string operation below is safe (a None PUBCHEM value would
    # otherwise raise) and that the caller's mapping is left untouched.
    db_refs = {k: v for k, v in db_refs.items()
               if v is not None}

    if 'PUBCHEM' in db_refs and \
            db_refs['PUBCHEM'].startswith('CID'):
        db_refs['PUBCHEM'] = \
            db_refs['PUBCHEM'].replace('CID:', '').strip()

    # Iterate over a snapshot since entries are added/removed in the loop
    for k, v in copy.deepcopy(db_refs).items():
        if k == 'CHEMBL' and not v.startswith('CHEMBL'):
            db_refs[k] = 'CHEMBL%s' % v
        elif k == 'ECCODE':
            db_refs['ECCODE'] = db_refs['ECCODE'].replace('.-', '')
        elif k == 'UNIPROT':
            db_refs.pop(k)
            # This is really a location
            if v.startswith('SL-'):
                db_refs['UPLOC'] = v
            # Otherwise we just fix the invalid key
            else:
                db_refs['UP'] = v
        elif k == 'UP':
            # Sometimes we have two IDs separated by a comma. We keep the
            # first one and base all further decisions on it, not on the
            # raw value, so that e.g. 'P12345-2,Q99999' yields a clean
            # isoform ID.
            up_id = v.split(',')[0]
            # There are cases where this is an empty string
            if not up_id.strip():
                db_refs.pop('UP', None)
            # This is really a location
            elif up_id.startswith('SL-'):
                db_refs.pop('UP', None)
                db_refs['UPLOC'] = up_id
            # There are cases where an isoform is under the UP key, we
            # standardize these. Note that the elif here is important to
            # avoid matching SL- here
            elif '-' in up_id:
                db_refs['UP'] = up_id.split('-')[0]
                db_refs['UPISO'] = up_id
            # Only rewrite the entry if the comma split changed the value
            elif ',' in v:
                db_refs['UP'] = up_id
        elif k == 'UAZ':
            db_refs.pop('UAZ')
            if v.startswith('CVCL'):
                db_refs['CVCL'] = v
        elif k == 'TAXONOMY' and v == '-1':
            db_refs.pop('TAXONOMY', None)
        elif k == 'LINCS' and re.match(r'\d+-\d+', v):
            db_refs['HMS-LINCS'] = db_refs.pop('LINCS')
        elif k == 'CVCL' and re.match(r'^[A-Z0-9]{4}$', v):
            db_refs['CVCL'] = 'CVCL_%s' % v
        elif k == 'CO':
            db_refs['CL'] = 'CL:%s' % db_refs.pop('CO')
        elif k == 'FPLX' and '-' in v:
            db_refs['FPLX'] = v.replace('-', '_')
        elif k == 'DRUGBANK' and v.startswith('DBSALT'):
            db_refs['DRUGBANK.SALT'] = db_refs.pop('DRUGBANK')
        # For MGI and RGD some sources added names as IDs that are invalid
        # and not easily fixable without reverse lookups so we rather
        # remove these.
        elif k == 'MGI' and not re.match(
                identifiers_registry['mgi']['pattern'], v):
            db_refs.pop('MGI', None)
        elif k == 'RGD' and not re.match(
                identifiers_registry['rgd']['pattern'], v):
            db_refs.pop('RGD', None)
        # These were left over from RLIMS-P where they denote MESH IDs so
        # we can remove and replace these.
        elif k == 'CTD':
            if 'MESH' in db_refs:
                db_refs.pop('CTD')
            else:
                db_refs['MESH'] = db_refs.pop('CTD')
        else:
            # No key-specific fix applies, so the value is passed through
            # both prefix helpers in sequence: the identifiers-based one
            # first, then the Bioregistry-based one on its result.
            new_val = ensure_prefix_if_needed_identifiers(k, v)
            new_val = ensure_prefix_if_needed_bioregistry(k, new_val)
            db_refs[k] = new_val
    return db_refs




[docs]
def fix_invalidities_context(context: BioContext):
    """Fix invalidities of a single INDRA BioContext in place.

    For each of the species, cell line, disease, cell type, organ and
    location entries that is not None, the entry's ``db_refs`` is replaced
    by the dict returned from ``fix_invalidities_db_refs``.

    Parameters
    ----------
    context :
        The INDRA BioContext to fix. Its entries are modified in place
        and nothing is returned.
    """
    candidates = (context.species, context.cell_line, context.disease,
                  context.cell_type, context.organ, context.location)
    present = [entry for entry in candidates if entry is not None]
    for ref_context in present:
        ref_context.db_refs = fix_invalidities_db_refs(ref_context.db_refs)