# Public API of this module: the current entry points first, followed by the
# deprecated aliases that are kept for backward compatibility.
__all__ = [
    'process_from_webservice',
    'process_jsonl_file',
    'process_jsonl_str',
    'process_from_json_file',
    'process_from_jsonish_str',
]
import json
import logging
import requests
from .processor import RlimspProcessor
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the RLIMS-P REST endpoint; the request path
# "{collection}/{key}/{value}" is appended to it.
RLIMSP_URL = (
    'https://research.bioinformatics.udel.edu'
    '/itextmine/api/data/rlims/'
)
class RLIMSP_Error(Exception):
    """Raised when the RLIMS-P web service responds with a non-200 status."""
def process_from_webservice(id_val, id_type='pmcid', source='pmc'):
    """Query the RLIMS-p web service for one paper and process the result.

    The web service is documented at:
    https://research.bioinformatics.udel.edu/itextmine/api/.
    The request URL is the /data/rlims endpoint followed by three path
    elements, /{collection}/{key}/{value}, where collection is "medline" or
    "pmc", key is "pmid" or "pmcid", and value is a specific PMID or PMCID.

    Parameters
    ----------
    id_val : str
        The identifier of the paper to be "read": a PMCID (including the
        PMC prefix) or a PMID (no prefix). This is the "value" element of
        the REST API path.
    id_type : Optional[str]
        Either 'pmid' or 'pmcid'. Default: 'pmcid'. This is the "key"
        element of the REST API path.
    source : Optional[str]
        Either 'pmc' (fulltext) or 'medline' (abstracts). Default: 'pmc'.
        This is the "collection" element of the REST API path.

    Returns
    -------
    :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
        An RlimspProcessor which contains a list of extracted INDRA Statements
        in its statements attribute.

    Raises
    ------
    RLIMSP_Error
        If the web service responds with a status code other than 200.
    """
    url = '%s%s/%s/%s' % (RLIMSP_URL, source, id_type, id_val)
    response = requests.get(url)
    status = response.status_code
    if status == 200:
        # Successful response: hand the decoded JSON to the processor.
        processor = RlimspProcessor(response.json())
        processor.extract_statements()
        return processor
    raise RLIMSP_Error("Bad status code: %d - %s"
                       % (status, response.reason))
def process_jsonl_file(filename, doc_id_type=None):
    """Process RLIMSP extractions from a bulk-download JSON-L file.

    The file is read as UTF-8 text with one JSON document per line. Lines
    that are empty or contain only whitespace are skipped.

    Parameters
    ----------
    filename : str
        Path to the JSON-L file.
    doc_id_type : Optional[str]
        In some cases the RLIMS-P paragraph info doesn't contain 'pmid' or
        'pmcid' explicitly, instead it contains a 'docId' key. This parameter
        allows defining what ID type 'docId' should be interpreted as. Its
        values should be 'pmid' or 'pmcid' or None if not used.

    Returns
    -------
    :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
        An RlimspProcessor which contains a list of extracted INDRA Statements
        in its statements attribute.
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's locale-dependent default encoding.
    with open(filename, 'rt', encoding='utf-8') as f:
        # Iterate the file lazily rather than materializing all lines, and
        # ignore blank lines (e.g. trailing newlines) that json.loads would
        # otherwise reject.
        json_list = [json.loads(line) for line in f if line.strip()]
    rp = RlimspProcessor(json_list, doc_id_type=doc_id_type)
    rp.extract_statements()
    return rp
def process_jsonl_str(jsonl_str, doc_id_type=None):
    """Process RLIMSP extractions from a JSON-L string.

    The string is split into records on newline characters only (``\\n``,
    ``\\r\\n`` or ``\\r``). Records that are empty or contain only
    whitespace are skipped.

    Parameters
    ----------
    jsonl_str : str
        The contents of one of the JSON-L files you can find here:
        https://hershey.dbi.udel.edu/textmining/export
    doc_id_type : Optional[str]
        In some cases the RLIMS-P paragraph info doesn't contain 'pmid' or
        'pmcid' explicitly, instead it contains a 'docId' key. This parameter
        allows defining what ID type 'docId' should be interpreted as. Its
        values should be 'pmid' or 'pmcid' or None if not used.

    Returns
    -------
    :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
        An RlimspProcessor which contains a list of extracted INDRA Statements
        in its statements attribute.
    """
    # Split on real newlines only. str.splitlines() also breaks on characters
    # such as U+2028, U+2029 and U+0085, which may appear unescaped inside
    # JSON string values and would cut a record in half.
    lines = jsonl_str.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    # Ignore blank lines, which json.loads would otherwise reject.
    json_list = [json.loads(line) for line in lines if line.strip()]
    rp = RlimspProcessor(json_list, doc_id_type=doc_id_type)
    rp.extract_statements()
    return rp
# DEPRECATED functions
def process_from_json_file(filename, doc_id_type=None):
    """DEPRECATED: use process_jsonl_file instead.

    Logs a deprecation warning and forwards both arguments unchanged to
    process_jsonl_file, returning its result.
    """
    deprecation_msg = ('process_from_json_file is deprecated. '
                       'Use process_jsonl_file instead.')
    logger.warning(deprecation_msg)
    processor = process_jsonl_file(filename, doc_id_type=doc_id_type)
    return processor
def process_from_jsonish_str(jsonish_str, doc_id_type=None):
    """DEPRECATED: use process_jsonl_str instead.

    Logs a deprecation warning and forwards both arguments unchanged to
    process_jsonl_str, returning its result.
    """
    # The warning names this function as callers actually spell it
    # (process_from_jsonish_str), so the message can be matched to call sites.
    logger.warning('process_from_jsonish_str is deprecated. Use '
                   'process_jsonl_str instead.')
    return process_jsonl_str(jsonish_str, doc_id_type=doc_id_type)