Source code for indra.sources.hypothesis.processor

import re
import logging
from indra.statements import BioContext, RefContext
from indra.ontology.bio import bio_ontology
from indra.ontology.standardize import \
    standardize_db_refs

logger = logging.getLogger(__name__)


[docs]class HypothesisProcessor:
    """Processes hypothes.is annotations into INDRA Statements or groundings.

    Parameters
    ----------
    annotations : list[dict]
        A list of annotations fetched from hypothes.is in JSON-deserialized
        form represented as a list of dicts.
    reader : Union[None, str, Callable[[str],Processor]]
        A handle for a function which takes a single str argument
        (text to process) and returns a processor object with a statements
        attribute containing INDRA Statements. By default, the REACH reader's
        process_text function is used with default parameters. Note that
        if the function requires extra parameters other than the input text,
        functools.partial can be used to set those.
    grounder : Optional[function]
        A handle for a function which takes a positional str argument (entity
        text to ground) and an optional context key word argument and returns
        a list of objects matching the structure of gilda.grounder.ScoredMatch.
        By default, Gilda's ground function is used for grounding.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements extracted from the given annotations.
    groundings : dict
        A dict of entity text keys with an associated dict of grounding
        references.
    """
    def __init__(self, annotations, reader=None, grounder=None):
        self.annotations = annotations
        self.statements = []
        self.groundings = {}
        if reader is None or reader == 'reach':
            from indra.sources import reach
            self.reader = reach.process_text
        elif reader == 'bel':
            from indra.sources import bel
            self.reader = bel.process_bel_stmt
        else:
            self.reader = reader
        if grounder is None:
            from gilda import ground
            self.grounder = ground
        else:
            self.grounder = grounder

[docs]    def extract_statements(self):
        """Sets statements attribute to list of extracted INDRA Statements."""
        for annotation in self.annotations:
            tags = annotation.get('tags')
            # Allow no tags or indra as a tag
            if not tags or 'indra' in tags:
                stmts = self.stmts_from_annotation(annotation)
                if stmts:
                    self.statements += stmts

[docs]    def extract_groundings(self):
        """Sets groundings attribute to list of extracted groundings."""
        for annotation in self.annotations:
            tags = annotation.get('tags')
            if tags and 'gilda' in tags:
                groundings = self.groundings_from_annotation(annotation)
                if groundings:
                    for txt, refs in groundings.items():
                        if txt in self.groundings and \
                                (self.groundings[txt] != refs):
                            logger.info(
                                'There is already a curation for %s: %s, '
                                'overwriting with %s' % (txt,
                                                         str(groundings[txt]),
                                                         str(refs)))

                        self.groundings[txt] = refs

[docs]    @staticmethod
    def groundings_from_annotation(annotation):
        """Return a dict of groundings from a single annotation."""
        text = annotation.get('text')
        if not text:
            return {}
        parts = [t for t in text.split('\n') if t]
        groundings = {}
        for entry in parts:
            grounding = parse_grounding_entry(entry)
            if grounding:
                groundings.update(grounding)
        return groundings

[docs]    def stmts_from_annotation(self, annotation):
        """Return a list of Statements extracted from a single annotation."""
        text = annotation.get('text')
        if not text:
            return []
        parts = [t for t in text.split('\n') if t]
        text = parts[0]
        rp = self.reader(text)
        if not rp or not rp.statements:
            logger.warning('Could not extract any statements from %s'
                           % text)
            return []

        contexts = {}
        # We assume that all other parts are related to context
        for part in parts[1:]:
            context_dict = parse_context_entry(part, self.grounder, text)
            if context_dict:
                contexts.update(context_dict)
        bio_context = BioContext(**contexts) if contexts else None
        text_refs = get_text_refs(annotation['uri'])
        # In case we got multiple statements out, we apply the same
        # annotations to each
        for stmt in rp.statements:
            # There is expected to be exactly one evidence in all cases
            # but this is still a good way to work with it
            for ev in stmt.evidence:
                ev.source_api = 'hypothes.is'
                ev.text = text
                ev.text_refs = text_refs
                if 'PMID' in text_refs:
                    ev.pmid = text_refs['PMID']
                ev.annotations['hypothes.is'] = annotation
                ev.context = bio_context
        return rp.statements


[docs]def parse_context_entry(entry, grounder, sentence=None):
    """Return a dict of context type and object processed from an entry."""
    match = re.match(r'(.*): (.*)', entry)
    if not match:
        return None
    context_type, context_txt = match.groups()
    if context_type not in allowed_contexts:
        logger.warning('Unknown context type %s' % context_type)
        return None

    terms = grounder(context_txt, context=sentence)
    if not terms:
        logger.warning('Could not ground %s context: %s'
                       % (context_type, context_txt))
    db_refs = {}
    if terms:
        db_refs = standardize_db_refs({terms[0].term.db:
                                       terms[0].term.id})
    db_refs['TEXT'] = context_txt
    standard_name = None
    if terms:
        standard_name = bio_ontology.get_name(terms[0].term.db,
                                              terms[0].term.id)
    name = standard_name if standard_name else context_txt
    context = RefContext(name=name, db_refs=db_refs)
    return {allowed_contexts[context_type]: context}


[docs]def parse_grounding_entry(entry):
    """Return a dict representing single grounding curation entry string."""
    entry = entry.strip()
    # We now try to match the standard pattern for grounding curation
    match = re.match(r'^\[(.*)\] -> ([^ ]+)$', entry)
    # We log any instances of curations that don't match the pattern
    if not match:
        logger.warning('"%s" does not match the grounding curation '
                       'pattern.' % entry)
        return None
    txt, dbid_str = match.groups()
    # We now get a dict of curated mappings to return
    try:
        dbid_entries = [entry.split(':', maxsplit=1)
                        for entry in dbid_str.split('|')]
        dbids = {k: v for k, v in dbid_entries}
    except Exception as e:
        logger.warning('Could not interpret DB IDs: %s for %s' %
                       (dbid_str, txt))
        return None
    return {txt: dbids}


[docs]def get_text_refs(url):
    """Return the parsed out text reference dict from an URL."""
    text_refs = {'URL': url}
    match = re.match(r'https://www.ncbi.nlm.nih.gov/pubmed/(\d+)', url)
    if match:
        text_refs['PMID'] = match.groups()[0]
    match = re.match(r'https://www.ncbi.nlm.nih.gov/pmc/articles/(PMC\d+)/',
                     url)
    if match:
        text_refs['PMCID'] = match.groups()[0]
    match = re.match(r'https://www.biorxiv.org/content/([^v]+)v', url)
    if match:
        text_refs['DOI'] = match.groups()[0]
    return text_refs


allowed_contexts = {
    'Location': 'location',
    'Cell line': 'cell_line',
    'Cell type': 'cell_type',
    'Organ': 'organ',
    'Disease': 'disease',
    'Species': 'species'
}