Source code for indra.literature.adeft_tools

"""
This file provides several functions helpful for acquiring texts for Adeft
disambiguation.

It offers the ability to get text content for articles containing a
particular gene. This is useful for aquiring training texts for genes
genes that do not appear in a defining pattern with a problematic shortform.

General XML processing is also provided that allows for extracting
text from a source that may be either of Elsevier XML, NLM XML or raw text.
This is helpful because it avoids having to know in advance the source of
text content from the database.
"""

import re
import time
import logging

from indra.literature import pubmed_client, pmc_client, elsevier_client

logger = logging.getLogger(__name__)

# the elsevier_client will log messages that it is safe to ignore
el = logging.getLogger('indra.literature.elsevier_client')
el.setLevel(logging.WARNING)



[docs]
def get_text_content_for_gene(hgnc_name):
    """Get articles that have been annotated to contain gene in entrez

    Parameters
    ----------
    hgnc_name : str
       HGNC name for gene

    Returns
    -------
    text_content : list of str
        xmls of fulltext if available otherwise abstracts for all articles
        that haven been annotated in entrez to contain the given gene
    """
    pmids = pubmed_client.get_ids_for_gene(hgnc_name)
    return get_text_content_for_pmids(pmids)




[docs]
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
    """
    pmc_pmids = set(pmc_client.filter_pmids(pmids, source_type='fulltext'))

    pmc_ids = []
    for pmid in pmc_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmc_ids.append(pmc_id)
        else:
            pmc_pmids.discard(pmid)

    pmc_xmls = []
    failed = set()
    for pmc_id in pmc_ids:
        if pmc_id is not None:
            pmc_xmls.append(pmc_client.get_xml_s3(pmc_id))
        else:
            failed.add(pmid)
        time.sleep(0.5)

    remaining_pmids = set(pmids) - pmc_pmids | failed
    abstracts = []
    for pmid in remaining_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        abstracts.append(abstract)
        time.sleep(0.5)

    return [text_content for source in (pmc_xmls, abstracts)
            for text_content in source if text_content is not None]




[docs]
def universal_extract_paragraphs(xml):
    """Extract paragraphs from xml that could be from  different sources

    First try to parse the xml as if it came from elsevier. if we do not
    have valid elsevier xml this will throw an exception. the text extraction
    function in the pmc client may not throw an exception when parsing elsevier
    xml, silently processing the xml incorrectly

    Parameters
    ----------
    xml : str
       Either an NLM xml, Elsevier xml or plaintext

    Returns
    -------
    paragraphs : str
        Extracted plaintext paragraphs from NLM or Elsevier XML
    """
    try:
        paragraphs = elsevier_client.extract_paragraphs(xml)
    except Exception:
        paragraphs = None
    if paragraphs is None:
        try:
            paragraphs = pmc_client.extract_paragraphs(xml)
        except Exception:
            paragraphs = [xml]
    return paragraphs




[docs]
def filter_paragraphs(paragraphs, contains=None):
    """Filter paragraphs to only those containing one of a list of strings

    Parameters
    ----------
    paragraphs : list of str
        List of plaintext paragraphs from an article

    contains : str or list of str
        Exclude paragraphs not containing this string as a token, or
        at least one of the strings in contains if it is a list

    Returns
    -------
    str
        Plaintext consisting of all input paragraphs containing at least
        one of the supplied tokens.
    """
    if contains is None:
        pattern = ''
    else:
        if isinstance(contains, str):
            contains = [contains]
        pattern = '|'.join(r'[^\w]%s[^\w]' % shortform
                           for shortform in contains)
    paragraphs = [p for p in paragraphs if re.search(pattern, p)]
    return '\n'.join(paragraphs) + '\n'




[docs]
def universal_extract_text(xml, contains=None):
    """Extract plaintext from xml that could be from different sources

    Parameters
    ----------
    xml : str
        Either an NLM xml, Elsevier xml, or plaintext

    contains : str or list of str
         Exclude paragraphs not containing this string, or at least one
         of the strings in contains if it is a list

    Returns
    -------
    str
        The concatentation of all paragraphs in the input xml, excluding
        paragraphs not containing one of the tokens in the list contains.
        Paragraphs are separated by new lines.
    """
    paragraphs = universal_extract_paragraphs(xml)
    return filter_paragraphs(paragraphs, contains)