"""This module exposes functions that annotate websites (including
PubMed and PubMedCentral pages, or any other text-based website) with INDRA
Statements through hypothes.is. Features include reading the content of the
website 'de-novo', and generating new INDRA Statements for annotation, and
fetching existing statements for a paper from the INDRA DB and using
those for annotation."""
import logging
import requests
from indra.sources import indra_db_rest
from indra.literature import pubmed_client
from indra.pipeline import AssemblyPipeline
from indra.statements import stmts_from_json
from indra.sources.hypothesis import upload_statement_annotation
logger = logging.getLogger(__name__)
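
# Both functions below take a text_refs dict following the INDRA Evidence
# text_refs format, i.e., a dict keyed by reference namespace. A minimal
# sketch (the identifiers below are placeholders, not real papers):
#
#     text_refs = {'PMID': '12345678', 'PMCID': 'PMC1234567'}
#     text_refs = {'URL': 'https://en.wikipedia.org/wiki/Sonic_hedgehog'}
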
def annotate_paper_from_db(text_refs, assembly_pipeline=None):
"""Upload INDRA Statements as annotations for a given paper based on content
for that paper in the INDRA DB.
Parameters
----------
text_refs : dict
A dict of text references, following the same format as
the INDRA Evidence text_refs attribute.
assembly_pipeline : Optional[json]
A list of pipeline steps (typically filters) that are applied
before uploading statements to hypothes.is as annotations.
"""
ref_priority = ['TRID', 'PMCID', 'PMID']
for ref_ns in ref_priority:
ref_id = text_refs.get(ref_ns)
if ref_id:
break
else:
logger.info('Could not find appropriate text refs')
return
ip = indra_db_rest.get_statements_for_paper([(ref_ns.lower(), ref_id)])
stmts = ip.statements
    # Restrict each statement's evidence to entries from this specific paper
for stmt in stmts:
stmt.evidence = [ev for ev in stmt.evidence if
ev.text_refs.get(ref_ns) == ref_id]
if assembly_pipeline:
ap = AssemblyPipeline(assembly_pipeline)
stmts = ap.run(stmts)
logger.info('Uploading %d statements to hypothes.is' % len(stmts))
for stmt in stmts:
upload_statement_annotation(stmt, annotate_agents=True)
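
# A minimal usage sketch for annotate_paper_from_db, assuming hypothes.is
# credentials are configured for indra.sources.hypothesis. The PMID is a
# placeholder; map_grounding and filter_grounded_only refer to INDRA
# assemble_corpus functions typically usable as AssemblyPipeline steps:
#
#     annotate_paper_from_db(
#         {'PMID': '12345678'},
#         assembly_pipeline=[{'function': 'map_grounding'},
#                            {'function': 'filter_grounded_only'}])
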
def read_and_annotate(text_refs, text_extractor=None,
text_reader=None, assembly_pipeline=None):
"""Read a paper/website and upload annotations derived from it to
hypothes.is.
Parameters
----------
text_refs : dict
A dict of text references, following the same format as
the INDRA Evidence text_refs attribute.
text_extractor : Optional[function]
A function which takes the raw content of a website (e.g., HTML)
and extracts clean text from it to prepare for machine reading.
        This is only used if the text_refs entry is a URL (e.g., a Wikipedia
        page); it is not used for PMID or PMCID text_refs, whose content can
        be pre-processed and machine-read directly. Default: None.
Example: html2text.HTML2Text().handle
text_reader : Optional[function]
A function which takes a single text string argument (the
text extracted from a given resource), runs reading on it, and
returns a list of INDRA Statement objects. Due to complications with
the PMC NXML format, this option only supports URL or PMID resources
as input in text_refs. Default: None. In the
default case, the INDRA REST API is called with an appropriate
endpoint that runs Reach and processes its output into INDRA
Statements.
assembly_pipeline : Optional[json]
A list of assembly pipeline steps that are applied before uploading
statements to hypothes.is as annotations.
Example: [{'function': 'map_grounding'}]
"""
api_url = 'http://api.indra.bio:8000/reach/'
    ref_priority = (['PMCID', 'PMID', 'URL'] if not text_reader
                    else ['PMID', 'URL'])
for ref_ns in ref_priority:
ref_id = text_refs.get(ref_ns)
if ref_id:
break
else:
logger.info('Could not find appropriate text refs')
return
logger.info('Selected the following paper ID: %s:%s' % (ref_ns, ref_id))
    # Get the text content and then read the text
if ref_ns == 'PMCID':
res = requests.post(api_url + 'process_pmc', json={'pmc_id': ref_id})
stmts = stmts_from_json(res.json().get('statements'))
elif ref_ns == 'PMID':
abstract = pubmed_client.get_abstract(ref_id)
if not abstract:
logger.info('Could not get abstract from PubMed')
return
logger.info('Got abstract')
if text_reader:
stmts = text_reader(abstract)
else:
res = requests.post(api_url + 'process_text', json={'text': abstract})
stmts = stmts_from_json(res.json().get('statements'))
elif ref_ns == 'URL':
site_content = requests.get(ref_id).text
if not site_content:
logger.info('Could not get content from website')
return
if text_extractor:
text = text_extractor(site_content)
logger.info('Extracted text of length %d from site content' %
len(text))
else:
text = site_content
if text_reader:
stmts = text_reader(text)
else:
res = requests.post(api_url + 'process_text', json={'text': text})
stmts = stmts_from_json(res.json().get('statements'))
else:
return
logger.info('Got %d statements from reading' % len(stmts))
if not stmts:
return
if assembly_pipeline:
ap = AssemblyPipeline(assembly_pipeline)
stmts = ap.run(stmts)
logger.info('Uploading %d statements to hypothes.is' % len(stmts))
for stmt in stmts:
for ev in stmt.evidence:
if ref_ns == 'PMID':
ev.pmid = ref_id
ev.text_refs[ref_ns] = ref_id
upload_statement_annotation(stmt, annotate_agents=True)
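
# A minimal usage sketch for read_and_annotate on a website, assuming the
# optional html2text package is available; the URL is illustrative, and
# html2text.HTML2Text().handle is the extractor suggested in the docstring:
#
#     import html2text
#     read_and_annotate(
#         {'URL': 'https://en.wikipedia.org/wiki/Ras_GTPase'},
#         text_extractor=html2text.HTML2Text().handle,
#         assembly_pipeline=[{'function': 'map_grounding'}])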