Source code for indra.sources.reach.reach_api

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str, bytes
import os
import json
import logging
import tempfile
import requests
from indra.literature import id_lookup
import indra.literature.pmc_client as pmc_client
import indra.literature.pubmed_client as pubmed_client
from .processor import ReachProcessor
# Python 2 defines ``basestring``; on Python 3 merely referencing the name
# raises NameError, in which case ``str`` is used as the text base type.
# Only NameError is caught so that unrelated errors (e.g. KeyboardInterrupt)
# are not silently swallowed at import time.
try:
    basestring
except NameError:
    basestring = str

# Module-level logger used by every function in this module.
logger = logging.getLogger('reach')

# Offline reading is optional. At import time, try to import the Java bridge
# and instantiate a ReachReader; ``try_offline`` records whether that worked
# and is checked by the processing functions before reading offline.
try:
    # For offline reading
    from indra.java_vm import autoclass, JavaException
    from .reach_reader import ReachReader
    reach_reader = ReachReader()
    try_offline = True
except Exception:
    # Any exception raised above (not only a failed import) ends up here and
    # disables offline reading; ``reach_reader`` and ``JavaException`` may
    # then be undefined.
    logger.error('Could not import jnius, offline reading cannot be used.')
    try_offline = False

# URLs that text and NXML content are POSTed to when reading online.
reach_text_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/text'
reach_nxml_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/nxml'


def process_pmc(pmc_id, offline=False):
    """Return a ReachProcessor by processing a paper with a given PMC id.

    The full text is obtained through the PMC client, saved as
    ``<pmc_id>.nxml`` in the current working directory, and then processed
    from that file. If the full text is not available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if the full text could not be obtained.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None

    # Store the downloaded paper on disk; it is read back from this file
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))

    # The PMID looked up for this PMC ID is used as the citation
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=pmid, offline=offline)
def process_pubmed_abstract(pubmed_id, offline=False):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    The abstract is obtained through the Pubmed client. If that fails, None
    is returned. Every Evidence of every extracted Statement is tagged with
    ``epistemics['section_type'] = 'abstract'``.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None

    rp = process_text(abstract, citation=pubmed_id, offline=offline)
    # Nothing to annotate if processing failed or yielded no Statements
    if not rp or not rp.statements:
        return rp

    for stmt in rp.statements:
        for evidence in stmt.evidence:
            evidence.epistemics['section_type'] = 'abstract'
    return rp
def process_text(text, citation=None, offline=False):
    """Return a ReachProcessor by processing the given text.

    The raw JSON output is also written to ``reach_output.json`` in the
    current working directory before it is processed.

    Parameters
    ----------
    text : str
        The text to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. This is used when the text to be processed comes from
        a publication that is not otherwise identified. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if offline reading is not
        available, if the text could not be processed, or if the web
        service could not be reached or returned a non-200 status code.
    """
    if offline:
        if not try_offline:
            logger.error('Offline reading is not available.')
            return None
        api_ruler = reach_reader.get_api_ruler()
        if api_ruler is None:
            logger.error('Cannot read offline because the REACH ApiRuler '
                         'could not be instantiated.')
            return None
        try:
            result_map = api_ruler.annotateText(text, 'fries')
        except JavaException as e:
            logger.error('Could not process text.')
            logger.error(e)
            return None
        # Normalize to a byte string so both branches yield the same type
        json_str = result_map.get('resultJson')
        if not isinstance(json_str, bytes):
            json_str = json_str.encode('utf-8')
    else:
        data = {'text': text.encode('utf-8')}
        try:
            res = requests.post(reach_text_url, data)
        except requests.exceptions.RequestException as e:
            logger.error('Could not connect to REACH service:')
            logger.error(e)
            return None
        # An error response must not be saved and parsed as if it were
        # REACH output; this mirrors the check done for NXML content.
        if res.status_code != 200:
            logger.error('Could not process text via REACH service. '
                         'Status code: %d', res.status_code)
            return None
        # TODO: we could use res.json() here to get a dict
        # directly
        # This is a byte string
        json_str = res.content

    # At this point json_str is a byte string in both branches
    with open('reach_output.json', 'wb') as fh:
        fh.write(json_str)
    return process_json_str(json_str.decode('utf-8'), citation)
def process_nxml_str(nxml_str, citation=None, offline=False):
    """Return a ReachProcessor by processing the given NXML string.

    NXML is the format used by PubmedCentral for papers in the open
    access subset. When the web service is used, the raw JSON output is also
    written to ``reach_output.json`` in the current working directory.

    Parameters
    ----------
    nxml_str : str
        The NXML string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if offline reading is not
        available, if the NXML could not be processed, or if the web
        service could not be reached or returned a non-200 status code.
    """
    if offline:
        if not try_offline:
            logger.error('Offline reading is not available.')
            return None
        api_ruler = reach_reader.get_api_ruler()
        if api_ruler is None:
            logger.error('Cannot read offline because the REACH ApiRuler '
                         'could not be instantiated.')
            return None
        try:
            result_map = api_ruler.annotateNxml(nxml_str, 'fries')
        except JavaException as e:
            logger.error('Could not process NXML.')
            logger.error(e)
            return None
        # process_json_str expects text, so decode if bytes were returned
        json_str = result_map.get('resultJson')
        if isinstance(json_str, bytes):
            json_str = json_str.decode('utf-8')
        return process_json_str(json_str, citation)

    data = {'nxml': nxml_str}
    try:
        res = requests.post(reach_nxml_url, data)
    except requests.exceptions.RequestException as e:
        logger.error('Could not connect to REACH service:')
        logger.error(e)
        return None
    if res.status_code != 200:
        logger.error('Could not process NXML via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    json_str = res.text
    with open('reach_output.json', 'wb') as fh:
        fh.write(json_str.encode('utf-8'))
    return process_json_str(json_str, citation)
def process_nxml_file(file_name, citation=None, offline=False):
    """Return a ReachProcessor by processing the given NXML file.

    NXML is the format used by PubmedCentral for papers in the open
    access subset. The file is read as UTF-8 and its content is handed to
    process_nxml_str.

    Parameters
    ----------
    file_name : str
        The name of the NXML file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    with open(file_name, 'rb') as f:
        nxml_str = f.read().decode('utf-8')
    # Forward the caller's choice instead of always using the web service
    return process_nxml_str(nxml_str, citation, offline)
def process_json_file(file_name, citation=None):
    """Return a ReachProcessor by processing the given REACH json file.

    The output from the REACH parser is in this json format. This function is
    useful if the output is saved as a file and needs to be processed.
    For more information on the format, see: https://github.com/clulab/reach

    Parameters
    ----------
    file_name : str
        The name of the json file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. If an IOError occurs, an error is logged and None
        is returned.
    """
    try:
        with open(file_name, 'rb') as json_file:
            raw_bytes = json_file.read()
            return process_json_str(raw_bytes.decode('utf-8'), citation)
    except IOError:
        logger.error('Could not read file %s.' % file_name)
        return None
def process_json_str(json_str, citation=None):
    """Return a ReachProcessor by processing the given REACH json string.

    The output from the REACH parser is in this json format.
    For more information on the format, see: https://github.com/clulab/reach

    Parameters
    ----------
    json_str : str
        The json string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None is returned if the string is not valid JSON.
    """
    assert isinstance(json_str, basestring)

    # Hyphenated names are rewritten with underscores. The substitution is
    # applied to the whole string, in this order, before it is parsed.
    hyphenated_names = (
        'frame-id',
        'argument-label',
        'object-meta',
        'doc-id',
        'is-hypothesis',
        'is-negated',
        'is-direct',
        'found-by',
    )
    for name in hyphenated_names:
        json_str = json_str.replace(name, name.replace('-', '_'))

    try:
        json_dict = json.loads(json_str)
    except ValueError:
        logger.error('Could not decode JSON string.')
        return None

    rp = ReachProcessor(json_dict, citation)
    # Run each extraction step on the processor, in this fixed order
    extraction_steps = (
        'get_modifications',
        'get_complexes',
        'get_activation',
        'get_translocation',
        'get_regulate_amounts',
    )
    for step in extraction_steps:
        getattr(rp, step)()
    return rp