Source code for indra.sources.rlimsp.api

# Names exported by ``from ... import *``. The last two entries are the
# deprecated wrappers defined at the bottom of this module.
__all__ = ['process_from_webservice',
           'process_jsonl_file',
           'process_jsonl_str',
           'process_from_json_file',
           'process_from_jsonish_str']

import os
import json
import logging
import requests
from multiprocessing import Pool

from .processor import RlimspProcessor


# Module-level logger; used below to emit the deprecation warnings.
logger = logging.getLogger(__name__)


# Base URL of the RLIMS-P REST endpoint. process_from_webservice appends
# '{source}/{id_type}/{id_val}' to it to build the request URL.
RLIMSP_URL = ('https://research.bioinformatics.udel.edu/itextmine/api/data/'
              'rlims/')


class RLIMSP_Error(Exception):
    """Error raised when a request to the RLIMS-P web service fails.

    Raised by process_from_webservice when the service responds with a status
    code other than 200.
    """



[docs]
def process_from_webservice(id_val, id_type='pmcid', source='pmc',
                            timeout=None):
    """Return an output from RLIMS-p for the given PubMed ID or PMC ID.

    The web service is documented at:
    https://research.bioinformatics.udel.edu/itextmine/api/.
    The /data/rlims URL endpoint is extended with three additional elements:
    /{collection}/{key}/{value} where collection is "medline" or "pmc", key is
    "pmid" or "pmcid", and value is a specific PMID or PMCID.

    Parameters
    ----------
    id_val : str
        A PMCID, with the prefix PMC, or PMID, with no prefix, of the paper to
        be "read". Corresponds to the "value" argument of the REST API.
    id_type : Optional[str]
        Either 'pmid' or 'pmcid'. The default is 'pmcid'. Corresponds to the
        "key" argument of the REST API.
    source : Optional[str]
        Either 'pmc' or 'medline', whether you want pmc fulltext or medline
        abstracts. Corresponds to the "collection" argument of the REST API.
    timeout : Optional[float or tuple]
        Passed through unchanged to ``requests.get``: how many seconds to wait
        for the server before giving up. The default, None, waits
        indefinitely, which is what this function did before the parameter
        existed.

    Returns
    -------
    :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
        An RlimspProcessor which contains a list of extracted INDRA Statements
        in its statements attribute.

    Raises
    ------
    RLIMSP_Error
        If the web service responds with a status code other than 200.
    """
    url = RLIMSP_URL + '%s/%s/%s' % (source, id_type, id_val)
    # Without a timeout a stalled server would block the caller forever, so
    # callers are given a way to bound the wait.
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise RLIMSP_Error("Bad status code: %d - %s"
                           % (resp.status_code, resp.reason))

    rp = RlimspProcessor(resp.json())
    rp.extract_statements()
    return rp




[docs]
def process_jsonl_file(filename, doc_id_type=None):
    """Process RLIMSP extractions from a bulk-download JSON-L file.

    Parameters
    ----------
    filename : str
        Path to the JSON-L file, i.e., a UTF-8 text file containing one JSON
        document per line. Blank lines are skipped.
    doc_id_type : Optional[str]
        In some cases the RLIMS-P paragraph info doesn't contain 'pmid' or
        'pmcid' explicitly, instead it contains a 'docId' key. This parameter
        allows defining what ID type 'docId' should be interpreted as. Its
        values should be 'pmid' or 'pmcid' or None if not used.

    Returns
    -------
    :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
        An RlimspProcessor which contains a list of extracted INDRA Statements
        in its statements attribute.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line of the file is not valid JSON.
    """
    # JSON text is UTF-8, so don't depend on the platform's default encoding.
    with open(filename, 'rt', encoding='utf-8') as f:
        # Iterate over the file lazily rather than materializing all raw
        # lines first, and skip blank lines (e.g., an extra empty line at the
        # end of the file), which json.loads would otherwise reject.
        json_list = [json.loads(line) for line in f if line.strip()]
    # The file handle is not needed anymore once every line has been parsed.
    rp = RlimspProcessor(json_list, doc_id_type=doc_id_type)
    rp.extract_statements()
    return rp


def process_line(line):
    """Parse one line of a JSON-L document.

    Parameters
    ----------
    line : str
        A single line of text expected to hold one JSON document.

    Returns
    -------
    object or None
        The decoded JSON value, or None if the line could not be decoded as
        JSON (for instance because it is empty).
    """
    try:
        parsed = json.loads(line)
    except json.JSONDecodeError:
        # Undecodable lines are reported as None instead of raising, so one
        # bad line doesn't abort the processing of all the others.
        parsed = None
    return parsed


[docs]
def process_jsonl_str(jsonl_str, doc_id_type=None, num_processes=None):
    """Process RLIMSP extractions from a JSON-L string.

    Lines that cannot be decoded as JSON (including blank lines) are skipped.

    Parameters
    ----------
    jsonl_str : str
        The contents of one of the JSON-L files you can find here:
        https://hershey.dbi.udel.edu/textmining/export
    doc_id_type : Optional[str]
        In some cases the RLIMS-P paragraph info doesn't contain 'pmid' or
        'pmcid' explicitly, instead it contains a 'docId' key. This parameter
        allows defining what ID type 'docId' should be interpreted as. Its
        values should be 'pmid' or 'pmcid' or None if not used.
    num_processes : Optional[int]
        The number of processes to use for parallel processing of the JSON-L
        lines. If None, the number of processes is set to the number of CPUs
        on the machine (or 1 if that number cannot be determined). If 1, no
        parallel processing is done. Otherwise the provided number of
        processes is used.

    Returns
    -------
    :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
        An RlimspProcessor which contains a list of extracted INDRA Statements
        in its statements attribute.
    """
    lines = jsonl_str.splitlines()
    if num_processes is None:
        # os.cpu_count() returns None when the CPU count can't be determined;
        # comparing None with an int below would raise a TypeError, so fall
        # back to serial processing in that case.
        num_processes = os.cpu_count() or 1
    # Starting worker processes only pays off when there is more than one
    # line to distribute; the parsed result is the same either way.
    if num_processes > 1 and len(lines) > 1:
        with Pool(num_processes) as pool:
            parsed = pool.map(process_line, lines)
    else:
        parsed = [process_line(line) for line in lines]
    # process_line yields None for lines that could not be decoded; drop them.
    json_list = [obj for obj in parsed if obj is not None]
    rp = RlimspProcessor(json_list, doc_id_type=doc_id_type)
    rp.extract_statements()
    return rp



# DEPRECATED functions


[docs]
def process_from_json_file(filename, doc_id_type=None):
    """DEPRECATED: use process_jsonl_file instead.

    Logs a deprecation warning and then delegates to process_jsonl_file with
    the same arguments, returning its result.
    """
    deprecation_msg = ('process_from_json_file is deprecated. Use '
                       'process_jsonl_file instead.')
    logger.warning(deprecation_msg)
    rp = process_jsonl_file(filename, doc_id_type=doc_id_type)
    return rp




[docs]
def process_from_jsonish_str(jsonish_str, doc_id_type=None):
    """DEPRECATED: use process_jsonl_str instead.

    Logs a deprecation warning and then delegates to process_jsonl_str,
    returning its result. The num_processes argument of process_jsonl_str is
    not exposed here, so its default is used.
    """
    # The message must name this function exactly as callers spell it, so
    # they can find the deprecated call sites in their own code.
    logger.warning('process_from_jsonish_str is deprecated. Use '
                   'process_jsonl_str instead.')
    return process_jsonl_str(jsonish_str, doc_id_type=doc_id_type)