Source code for indra.tools.fix_invalidities

# Public names exported by ``from indra.tools.fix_invalidities import *``.
__all__ = ['fix_invalidities', 'fix_invalidities_db_refs',
           'fix_invalidities_agent', 'fix_invalidities_context',
           'fix_invalidities_stmt', 'fix_invalidities_evidence']

import re
import copy
from typing import List, Mapping
from indra.databases.identifiers import ensure_prefix_if_needed as \
    ensure_prefix_if_needed_identifiers
from indra.databases.identifiers import identifiers_registry
from indra.databases.bioregistry_client import ensure_prefix_if_needed as \
    ensure_prefix_if_needed_bioregistry
from indra.statements.validate import text_ref_patterns
from indra.statements import Evidence, Statement, Agent, BioContext, \
    Translocation


def fix_invalidities(stmts: List[Statement]) -> List[Statement]:
    """Repair known invalidities across a list of Statements.

    Each retained Statement is fixed in place. A Statement is left out
    of the result when it has a known problem that cannot be repaired;
    currently this is a Translocation with neither a ``from_location``
    nor a ``to_location``.

    Parameters
    ----------
    stmts :
        A list of INDRA Statements.

    Returns
    -------
    :
        A new list holding the (in-place fixed) Statements that were kept.
    """
    kept = []
    for statement in stmts:
        # A Translocation without any location cannot be repaired.
        unfixable = isinstance(statement, Translocation) and not (
            statement.from_location or statement.to_location)
        if not unfixable:
            fix_invalidities_stmt(statement)
            kept.append(statement)
    return kept
def fix_invalidities_stmt(stmt: Statement):
    """Repair a single INDRA Statement in place.

    Every Evidence attached to the Statement is fixed first, followed by
    every Agent returned by ``stmt.real_agent_list()``.

    Parameters
    ----------
    stmt :
        The INDRA Statement to fix; it is modified in place.
    """
    for evidence in stmt.evidence:
        fix_invalidities_evidence(evidence)
    for real_agent in stmt.real_agent_list():
        fix_invalidities_agent(real_agent)
def fix_invalidities_evidence(ev: Evidence):
    """Repair a single INDRA Evidence in place.

    The following steps are applied in order:

    1. ``text_refs`` entries with a None value are removed and the
       remaining keys are upper-cased.
    2. ``ev.pmid`` and ``text_refs['PMID']`` are discarded if they do not
       match the PMID pattern, then whichever of the two is missing is
       filled in from the other.
    3. ``text_refs['DOI']`` and ``text_refs['PMC']`` are removed if they
       do not match their patterns.
    4. The Evidence's context, if any, is fixed.

    Parameters
    ----------
    ev :
        The INDRA Evidence to fix; it is modified in place.
    """
    def _mismatch(ref_type, ref_value):
        # True if the value does not match the pattern for its type.
        return re.match(text_ref_patterns[ref_type], ref_value) is None

    refs = ev.text_refs

    # Work from a snapshot because keys are removed/added while looping.
    snapshot = copy.deepcopy(refs)
    for key in snapshot:
        value = snapshot[key]
        if value is None:
            refs.pop(key, None)
            continue
        if key.isupper():
            continue
        refs.pop(key)
        refs[key.upper()] = value

    # Throw away PMIDs that are present but malformed.
    if ev.pmid and _mismatch('PMID', ev.pmid):
        ev.pmid = None
    pmid_ref = refs.get('PMID')
    if pmid_ref and _mismatch('PMID', pmid_ref):
        refs.pop('PMID', None)

    # Make the pmid attribute and the PMID text ref agree where only one
    # of them is available.
    pmid_ref = refs.get('PMID')
    if ev.pmid is None:
        if pmid_ref is not None:
            ev.pmid = pmid_ref
    elif pmid_ref is None:
        refs['PMID'] = ev.pmid

    for ref_type in ('DOI', 'PMC'):
        if ref_type in refs and _mismatch(ref_type, refs[ref_type]):
            refs.pop(ref_type, None)

    if ev.context is not None:
        fix_invalidities_context(ev.context)
def fix_invalidities_agent(agent: Agent):
    """Repair a single INDRA Agent in place.

    The Agent's ``db_refs`` is replaced with the fixed grounding dict
    returned by ``fix_invalidities_db_refs``.

    Parameters
    ----------
    agent :
        The INDRA Agent to fix; it is modified in place.
    """
    fixed_refs = fix_invalidities_db_refs(agent.db_refs)
    agent.db_refs = fixed_refs
def fix_invalidities_db_refs(db_refs: Mapping[str, str]) -> Mapping[str, str]:
    """Return a fixed version of a db_refs grounding dict.

    Entries whose value is None are dropped, and a set of known invalid
    namespace/identifier conventions is rewritten or removed. The input
    mapping is never modified; a new dict is always returned.

    Parameters
    ----------
    db_refs :
        A grounding dict mapping namespace names to identifiers.

    Returns
    -------
    :
        A new dict with the known invalidities fixed.
    """
    # Drop None values first. This produces a fresh dict (so the caller's
    # mapping is not mutated) and guarantees that every string operation
    # below, including the PUBCHEM cleanup, only ever sees non-None values.
    db_refs = {k: v for k, v in db_refs.items() if v is not None}
    pubchem_id = db_refs.get('PUBCHEM')
    if pubchem_id is not None and pubchem_id.startswith('CID'):
        db_refs['PUBCHEM'] = pubchem_id.replace('CID:', '').strip()
    # Iterate over a snapshot since entries are added/removed in the loop
    for k, v in list(db_refs.items()):
        if k == 'CHEMBL' and not v.startswith('CHEMBL'):
            db_refs[k] = 'CHEMBL%s' % v
        elif k == 'ECCODE':
            db_refs['ECCODE'] = db_refs['ECCODE'].replace('.-', '')
        elif k == 'UNIPROT':
            db_refs.pop(k)
            # This is really a location
            if v.startswith('SL-'):
                db_refs['UPLOC'] = v
            # Otherwise we just fix the invalid key
            else:
                db_refs['UP'] = v
        elif k == 'UP':
            # There are cases where this is an empty string
            if not v.strip():
                db_refs.pop('UP', None)
                continue
            # Sometimes we have two IDs separated by a comma, in which
            # case only the first one is kept. All further checks are
            # made on that first ID so that the discarded part cannot
            # leak into the UP or UPISO values.
            up_id = v
            if ',' in v:
                up_id = v.split(',')[0]
                db_refs['UP'] = up_id
            if up_id.startswith('SL-'):
                db_refs['UPLOC'] = db_refs.pop('UP')
            # There are cases where an isoform is under the UP key, we
            # standardize these. Note that the elif here is important to
            # avoid matching SL- here
            elif '-' in up_id:
                db_refs['UP'] = up_id.split('-')[0]
                db_refs['UPISO'] = up_id
        elif k == 'UAZ':
            # The UAZ key is always removed; its value is only retained
            # (under CVCL) if it looks like a CVCL identifier.
            db_refs.pop('UAZ')
            if v.startswith('CVCL'):
                db_refs['CVCL'] = v
        elif k == 'TAXONOMY' and v == '-1':
            db_refs.pop('TAXONOMY', None)
        elif k == 'LINCS' and re.match(r'\d+-\d+', v):
            db_refs['HMS-LINCS'] = db_refs.pop('LINCS')
        elif k == 'CVCL' and re.match(r'^[A-Z0-9]{4}$', v):
            db_refs['CVCL'] = 'CVCL_%s' % v
        elif k == 'CO':
            db_refs['CL'] = 'CL:%s' % db_refs.pop('CO')
        elif k == 'FPLX' and '-' in v:
            db_refs['FPLX'] = v.replace('-', '_')
        elif k == 'DRUGBANK' and v.startswith('DBSALT'):
            db_refs['DRUGBANK.SALT'] = db_refs.pop('DRUGBANK')
        # For MGI and RGD some sources added names as IDs that are invalid
        # and not easily fixable without reverse lookups so we rather
        # remove these.
        elif k == 'MGI' and not re.match(
                identifiers_registry['mgi']['pattern'], v):
            db_refs.pop('MGI', None)
        elif k == 'RGD' and not re.match(
                identifiers_registry['rgd']['pattern'], v):
            db_refs.pop('RGD', None)
        # These were left over from RLIMS-P where they denote MESH IDs so
        # we can remove and replace these.
        elif k == 'CTD':
            if 'MESH' in db_refs:
                db_refs.pop('CTD')
            else:
                db_refs['MESH'] = db_refs.pop('CTD')
        else:
            # Any entry not handled above is passed through both prefix
            # helpers in turn (identifiers registry first, then
            # bioregistry); judging by their names these presumably add a
            # namespace prefix to the ID where one is required.
            new_val = ensure_prefix_if_needed_identifiers(k, v)
            new_val = ensure_prefix_if_needed_bioregistry(k, new_val)
            db_refs[k] = new_val
    return db_refs
def fix_invalidities_context(context: BioContext):
    """Repair a single INDRA BioContext in place.

    For each of the species, cell line, disease, cell type, organ and
    location entries that is set (i.e., not None), the entry's ``db_refs``
    is replaced with its fixed version.

    Parameters
    ----------
    context :
        The INDRA BioContext to fix; its entries are modified in place.
    """
    attribute_names = ('species', 'cell_line', 'disease', 'cell_type',
                       'organ', 'location')
    # Read all entries up front, then fix the ones that are present.
    ref_contexts = [getattr(context, name) for name in attribute_names]
    for ref_context in ref_contexts:
        if ref_context is None:
            continue
        ref_context.db_refs = fix_invalidities_db_refs(ref_context.db_refs)