# Source code for indra.sources.medscan.api

from collections import defaultdict

from .processor import *


# Module-level logger named after this module.
# NOTE(review): `logging` is not imported explicitly in this file; presumably
# it is made available by the star import from .processor -- confirm.
logger = logging.getLogger(__name__)


def process_directory_statements_sorted_by_pmid(directory_name):
    """Process a directory of CSXML files and group the statements by pmid.

    The directory is handed to `process_directory` in lazy mode, and the
    resulting statements are consumed one at a time through the processor's
    `iter_statements` method. The processor is documented as normalizing the
    files' character encoding to utf-8 before extraction; that logic lives in
    the processor and is not part of this function.

    Parameters
    ----------
    directory_name : str
        The name of a directory filled with csxml files to process

    Returns
    -------
    pmid_dict : collections.defaultdict
        A dictionary mapping pmids to a list of statements corresponding to
        that pmid. The key is taken from the first evidence of each statement
        (`statement.evidence[0].pmid`); further evidences are not consulted.
        Because this is a `defaultdict(list)`, looking up a pmid that is not
        present inserts and returns an empty list.
    """
    pmid_dict = defaultdict(list)
    # Lazy mode: statements are produced while iterating rather than being
    # collected on the processor up front.
    mp = process_directory(directory_name, lazy=True)

    for statement in mp.iter_statements():
        # Only the first evidence determines which pmid bucket is used.
        pmid = statement.evidence[0].pmid
        pmid_dict[pmid].append(statement)
    return pmid_dict


def process_directory(directory_name, lazy=False):
    """Process a directory of CSXML files into a MedscanProcessor.

    A fresh `MedscanProcessor` is created and its `process_directory` method
    is called with the given arguments. The processor is documented as
    normalizing the files' character encodings to utf-8 before extracting
    INDRA statements; that logic lives in the processor, not in this wrapper.

    Parameters
    ----------
    directory_name : str
        The name of a directory filled with csxml files to process
    lazy : bool
        If True, the statements will not be generated immediately, but rather
        a generator will be formulated, and statements can be retrieved by
        using `iter_statements`. If False, the `statements` attribute will be
        populated immediately. Default is False.

    Returns
    -------
    mp : indra.sources.medscan.processor.MedscanProcessor
        A MedscanProcessor populated with INDRA statements extracted from the
        csxml files
    """
    # One parent processor accumulates the extractions from every file in the
    # directory.
    mp = MedscanProcessor()
    mp.process_directory(directory_name, lazy)
    return mp


def process_file_sorted_by_pmid(file_name):
    """Process a CSXML file and group the resulting statements by pmid.

    The file is handed to `process_file` in lazy mode, and the statements are
    consumed one at a time through the processor's `iter_statements` method.

    Parameters
    ----------
    file_name : str
        A csxml file to process

    Returns
    -------
    s_dict : collections.defaultdict
        Dictionary mapping pmids to a list of statements corresponding to
        that pmid. The key is taken from the first evidence of each statement
        (`statement.evidence[0].pmid`); further evidences are not consulted.
        Because this is a `defaultdict(list)`, looking up a pmid that is not
        present inserts and returns an empty list.
    """
    s_dict = defaultdict(list)
    # Lazy mode: statements are produced while iterating rather than being
    # collected on the processor up front.
    mp = process_file(file_name, lazy=True)

    for statement in mp.iter_statements():
        # Only the first evidence determines which pmid bucket is used.
        pmid = statement.evidence[0].pmid
        s_dict[pmid].append(statement)
    return s_dict


def process_file(filename, interval=None, lazy=False):
    """Process a CSXML file for its relevant information.

    A fresh `MedscanProcessor` is created and its `process_csxml_file` method
    is called with the given arguments.

    Consider running the fix_csxml_character_encoding.py script in
    indra/sources/medscan to fix any encoding issues in the input file before
    processing.

    Parameters
    ----------
    filename : str
        The csxml file, containing Medscan XML, to process
    interval : (start, end) or None
        Select the interval of documents to read, starting with the
        `start`th document and ending before the `end`th document. If
        either is None, the value is considered undefined. If the value
        exceeds the bounds of available documents, it will simply be
        ignored.
    lazy : bool
        If True, the statements will not be generated immediately, but rather
        a generator will be formulated, and statements can be retrieved by
        using `iter_statements`. If False, the `statements` attribute will be
        populated immediately. Default is False.

    Returns
    -------
    mp : MedscanProcessor
        A MedscanProcessor object containing extracted statements
    """
    mp = MedscanProcessor()
    # Arguments are forwarded positionally, in the order the processor method
    # is called with in the original code.
    mp.process_csxml_file(filename, interval, lazy)
    return mp