"""Methods for obtaining a reach processor containing indra statements.
Many file formats are supported. Many will run reach.
"""
import json
import logging
import requests
from indra.literature import id_lookup
import indra.literature.pmc_client as pmc_client
import indra.literature.pubmed_client as pubmed_client
from .processor import ReachProcessor
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

# Offline reading relies on the optional reader module. If importing it or
# constructing the reader fails for any reason, only reading through a web
# service remains available and try_offline records that.
try:
    from .reader import ReachReader, ReachOfflineReadingError, JavaException
    reach_reader = ReachReader()
except Exception as err:
    logger.warning('Could not import jnius, offline reading option will not '
                   'be available.')
    logger.debug(err)
    try_offline = False
else:
    try_offline = True

# Endpoints of the remote (Arizona) REACH web service.
reach_text_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/text'
reach_nxml_url = 'http://agathon.sista.arizona.edu:8080/odinweb/api/nxml'

# Endpoints of a REACH web service running on localhost.
local_text_url = 'http://localhost:8080/api/text'
local_nxml_url = 'http://localhost:8080/api/uploadFile'

# File that the raw REACH JSON output is written to unless overridden.
default_output_fname = 'reach_output.json'
def process_pmc(pmc_id, offline=False, url=None,
                output_fname=default_output_fname,
                organism_priority=None):
    """Return a ReachProcessor by processing a paper with a given PMC id.

    The full text is fetched through the PMC client, saved as
    ``<pmc_id>.nxml`` in the current working directory and then read with
    REACH. If the full text is not available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 8511698, PMC8511698
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # Fetch the article content; without it there is nothing to read
    logger.info('Loading %s from PMC' % pmc_id)
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None

    # Persist the NXML in the working folder so it can be read as a file
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_content.encode('utf-8'))

    # The PMID (when one can be found) is what the evidence of the
    # extracted Statements should refer to
    logger.info('Looking up PMID for %s' % pmc_id)
    pmid = id_lookup(pmc_id, 'pmcid').get('pmid')

    # Hand the saved file over to the NXML file reader
    logger.info('Processing %s with REACH' % pmc_id)
    return process_nxml_file(nxml_path, citation=pmid, offline=offline,
                             url=url, output_fname=output_fname,
                             organism_priority=organism_priority)
def process_pubmed_abstract(pubmed_id, offline=False, url=None,
                            output_fname=default_output_fname, **kwargs):
    """Return a ReachProcessor by processing an abstract with a given Pubmed id.

    The abstract is fetched through the Pubmed client and read with
    `process_text`. If the abstract cannot be obtained, None is returned.
    Every evidence of every extracted Statement is annotated with the
    section type 'abstract'.

    Parameters
    ----------
    pubmed_id : str
        The ID of a Pubmed article. The string may start with PMID but
        passing just the ID also works.
        Examples: 27168024, PMID27168024
        https://www.ncbi.nlm.nih.gov/pubmed/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists. Forwarded to `process_text`
        as one of the keyword arguments.
    **kwargs : keyword arguments
        All other keyword arguments are passed directly to `process_text`.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # No abstract means nothing to read
    abstract = pubmed_client.get_abstract(pubmed_id)
    if abstract is None:
        return None

    rp = process_text(abstract, citation=pubmed_id, offline=offline, url=url,
                      output_fname=output_fname, **kwargs)

    # Nothing to annotate if reading failed or produced no Statements
    if not rp or not rp.statements:
        return rp

    # Some applications need to know which section the text came from, so
    # each evidence is tagged as originating from the abstract.
    all_evidence = (ev for stmt in rp.statements for ev in stmt.evidence)
    for ev in all_evidence:
        ev.epistemics['section_type'] = 'abstract'
    return rp
def process_text(text, citation=None, offline=False, url=None,
                 output_fname=default_output_fname, timeout=None,
                 organism_priority=None):
    """Return a ReachProcessor by processing the given text.

    The raw REACH output is written to `output_fname` before it is
    processed. If REACH produced no output, None is returned and nothing
    is written.

    Parameters
    ----------
    text : str
        The text to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. This is used when the text to be processed comes from
        a publication that is not otherwise identified. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    timeout : Optional[float]
        This only applies when reading online (`offline=False`). Only wait for
        `timeout` seconds for the api to respond.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    if offline:
        reach_output = _read_content_offline(text, 'text')
    else:
        # The old and new service interfaces are the same for plain text,
        # so one shared function serves any URL
        service_url = reach_text_url if url is None else url
        reach_output = _read_text_service(text, service_url, timeout)

    if not reach_output:
        return None

    # Keep a copy of the raw output, then turn it into Statements
    with open(output_fname, 'wb') as out_file:
        out_file.write(reach_output)
    return process_json_str(reach_output.decode('utf-8'), citation=citation,
                            organism_priority=organism_priority)
def process_nxml_str(nxml_str, citation=None, offline=False,
                     url=None, output_fname=default_output_fname,
                     organism_priority=None):
    """Return a ReachProcessor by processing the given NXML string.

    NXML is the format used by PubmedCentral for papers in the open
    access subset.

    The raw REACH output is written to `output_fname` before it is
    processed. If REACH produced no output, None is returned and nothing
    is written.

    Parameters
    ----------
    nxml_str : str
        The NXML string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    import os
    import tempfile

    if offline:
        json_str = _read_content_offline(nxml_str, 'nxml')
    else:
        # Use the Arizona URL by default if not given
        if url is None:
            url = reach_nxml_url
        # Print warning but proceed with reading
        if url == reach_nxml_url:
            logger.warning('Remote REACH webservice might get stuck when ' +
                           'processing NXML. Running local instance of REACH' +
                           ' is recommended.')
            json_str = _read_nxml_str_service_old(nxml_str, url)
        # Otherwise we assume that the web service is more recent than the
        # Arizona one and requires the new protocol.
        else:
            # The new protocol uploads a file, so the string is staged on
            # disk first. A private temporary directory is used so that
            # nothing is left behind in the working directory and
            # concurrent calls cannot overwrite each other's file; the
            # base name of the uploaded file stays 'temp_file.nxml'.
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_path = os.path.join(tmp_dir, 'temp_file.nxml')
                with open(tmp_path, 'wb') as f:
                    f.write(nxml_str.encode('utf-8'))
                json_str = _read_nxml_file_service_new(tmp_path, url)

    if not json_str:
        return None

    # Keep a copy of the raw output, then turn it into Statements
    with open(output_fname, 'wb') as fh:
        fh.write(json_str)
    return process_json_str(json_str.decode('utf-8'), citation=citation,
                            organism_priority=organism_priority)
def process_nxml_file(file_name, citation=None, offline=False,
                      url=None, output_fname=default_output_fname,
                      organism_priority=None):
    """Return a ReachProcessor by processing the given NXML file.

    NXML is the format used by PubmedCentral for papers in the open
    access subset.

    The raw REACH output is written to `output_fname` before it is
    processed. If REACH produced no output, None is returned and nothing
    is written.

    Parameters
    ----------
    file_name : str
        The name of the NXML file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    uses_arizona_service = url is None or url == reach_nxml_url
    if offline:
        # Offline reading works on the decoded file content
        with open(file_name, 'rb') as nxml_file:
            nxml_content = nxml_file.read().decode('utf-8')
        reach_output = _read_content_offline(nxml_content, 'nxml')
    elif uses_arizona_service:
        # The Arizona service speaks the old protocol
        reach_output = _read_nxml_file_service_old(file_name,
                                                   url=reach_nxml_url)
    else:
        # Any other service is read with the new protocol
        reach_output = _read_nxml_file_service_new(file_name, url=url)

    if not reach_output:
        return None

    # Keep a copy of the raw output, then turn it into Statements
    with open(output_fname, 'wb') as out_file:
        out_file.write(reach_output)
    return process_json_str(reach_output.decode('utf-8'), citation=citation,
                            organism_priority=organism_priority)
def process_json_file(file_name, citation=None, organism_priority=None):
    """Return a ReachProcessor by processing the given REACH json file.

    The output from the REACH parser is in this json format. This function is
    useful if the output is saved as a file and needs to be processed.
    For more information on the format, see: https://github.com/clulab/reach

    If an IOError occurs, an error is logged and None is returned.

    Parameters
    ----------
    file_name : str
        The name of the json file to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    try:
        with open(file_name, 'rb') as json_file:
            raw_bytes = json_file.read()
        return process_json_str(raw_bytes.decode('utf-8'), citation=citation,
                                organism_priority=organism_priority)
    except IOError:
        logger.error('Could not read file %s.' % file_name)
        return None
def process_fries_json_group(group_prefix, citation=None,
                             organism_priority=None):
    """Return a ReachProcessor by processing a REACH fries output file group.

    When running REACH through its CLI, for each input file, it produces
    three output JSON files when using the fries output format. These three
    files jointly constitute the output, so they have to be combined to be
    processed. For instance, one might have PMC9582577.uaz.entities.json,
    PMC9582577.uaz.events.json, PMC9582577.uaz.sentences.json.

    Parameters
    ----------
    group_prefix : str
        The prefix for the group of output files, e.g., PMC9582577.uaz
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    merged = {}
    # One file per section; each is loaded under its section name
    for section in ('entities', 'events', 'sentences'):
        section_path = '%s.%s.json' % (group_prefix, section)
        with open(section_path, 'r') as section_file:
            merged[section] = json.load(section_file)
    # The combined content is serialized back to a JSON string so that the
    # replacements done by process_json_str are applied to it as well
    merged_str = json.dumps(merged)
    return process_json_str(merged_str, citation=citation,
                            organism_priority=organism_priority)
def process_json_str(json_str, citation=None, organism_priority=None):
    """Return a ReachProcessor by processing the given REACH json string.

    The output from the REACH parser is in this json format.
    For more information on the format, see: https://github.com/clulab/reach

    If the string cannot be decoded as JSON, None is returned.

    Parameters
    ----------
    json_str : str
        The json string to be processed.
    citation : Optional[str]
        A PubMed ID passed to be used in the evidence for the extracted INDRA
        Statements. Default: None
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    reach_json = _preprocess_json_str(json_str)
    if reach_json is None:
        return None
    rp = ReachProcessor(reach_json, pmid=citation,
                        organism_priority=organism_priority)
    # Run every extractor, in this fixed order
    extractors = (
        rp.get_modifications,
        rp.get_complexes,
        rp.get_activation,
        rp.get_translocation,
        rp.get_regulate_amounts,
        rp.get_conversion,
    )
    for extract in extractors:
        extract()
    return rp
def process_agents_from_entities(file_name, organism_priority=None,
                                 with_coordinates=False):
    """Return INDRA Agents extracted from all entities, even ones not
    appearing in Statements.

    Parameters
    ----------
    file_name : str
        The name of the json file to be processed.
    organism_priority : Optional[list of str]
        A list of Taxonomy IDs providing prioritization among organisms
        when choosing protein grounding. If not given, the default behavior
        takes the first match produced by Reach, which is prioritized to be
        a human protein if such a match exists.
    with_coordinates : Optional[bool]
        If True, the Agents will be returned in a tuple with their
        coordinates. Default: False

    Returns
    -------
    list[Agent] :
        A list of INDRA Agents processed from all extracted entities. The
        list is empty if the file content could not be decoded as JSON.
    """
    with open(file_name, 'rb') as fh:
        json_str = fh.read().decode('utf-8')
    json_dict = _preprocess_json_str(json_str)
    # _preprocess_json_str returns None (after logging the error) when the
    # content is not valid JSON; in that case there are no entities to
    # extract Agents from, so the processor is not constructed at all.
    if json_dict is None:
        return []
    rp = ReachProcessor(json_dict, organism_priority=organism_priority)
    if with_coordinates:
        return rp.get_agents_from_entities_with_coords()
    return rp.get_agents_from_entities()
def _preprocess_json_str(json_str):
fields = ['frame-id', 'argument-label', 'object-meta',
'doc-id', 'is-hypothesis', 'is-negated',
'is-direct', 'found-by']
for field in fields:
json_str = json_str.replace(field, field.replace('-', '_'))
try:
json_dict = json.loads(json_str)
except ValueError as e:
logger.error('Could not decode JSON string.')
logger.exception(e)
return None
return json_dict
def _read_content_offline(content, content_type='text'):
    """Return a json string by processing the given content with the
    offline REACH reader.

    None is returned (after logging the reason) if offline reading is not
    available, if the REACH ApiRuler could not be obtained, if reading
    raised a Java exception, or if no result was retrieved.

    Parameters
    ----------
    content : str
        The text to be processed.
    content_type : str
        Whether the content is a regular text ('text') or NXML ('nxml').

    Returns
    -------
    json_str : bytes
        The json string produced by REACH reader.

    Raises
    ------
    ValueError
        If `content_type` is neither 'text' nor 'nxml'.
    """
    if not try_offline:
        logger.error('Offline reading is not available.')
        return None

    try:
        ruler = reach_reader.get_api_ruler()
    except ReachOfflineReadingError as err:
        logger.error(err)
        logger.error('Cannot read offline because the REACH ApiRuler '
                     'could not be instantiated.')
        return None

    try:
        if content_type == 'text':
            annotations = ruler.annotateText(content, 'fries')
        elif content_type == 'nxml':
            annotations = ruler.annotateNxml(content, 'fries')
        else:
            raise ValueError('Invalid content_type: %s' % content_type)
    except JavaException as err:
        logger.error('Could not process %s.' % content_type)
        logger.error(err)
        return None

    # REACH version < 1.3.3 stores the output under 'resultJson', while
    # REACH version >= 1.3.3 stores it under 'result'
    output = annotations.get('resultJson') or annotations.get('result')
    if output is None:
        logger.warning('No results retrieved')
        return None
    # Callers expect bytes, so anything else is encoded
    return output if isinstance(output, bytes) else output.encode('utf-8')
def _read_text_service(text, url=reach_text_url, timeout=None):
    """Return a json string by processing the given text with online REACH API.

    None is returned (after logging the reason) if the request could not
    be made or if the service answered with a status code other than 200.

    Parameters
    ----------
    text : str
        The text to be processed.
    url : Optional[str]
        URL for REACH service. By default, Arizona REACH web service is called.
    timeout : Optional[float]
        Only wait for `timeout` seconds for the api to respond.

    Returns
    -------
    json_str : bytes
        The json string returned by REACH API.
    """
    params = {'text': text.encode('utf-8')}
    try:
        res = requests.post(url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        logger.error('Could not connect to REACH service:')
        logger.error(e)
        return None
    # Same check as the NXML service readers: the body of an error response
    # is not REACH output and must not be handed on as if it were.
    if res.status_code != 200:
        logger.error('Could not process text via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    # TODO: we could use res.json() here to get a dict directly
    # The content is a byte string
    return res.content
def _read_nxml_file_service_old(nxml_file, url=reach_nxml_url):
    """Return a json string by processing the given NXML file with remote
    REACH webservice.

    The file is read as UTF-8 text and its content is passed on to
    `_read_nxml_str_service_old`.

    Parameters
    ----------
    nxml_file : str
        The name of the NXML file to be processed.
    url : Optional[str]
        URL for REACH service. By default, Arizona REACH web service is called.

    Returns
    -------
    json_str : bytes
        The json string returned by `_read_nxml_str_service_old`.
    """
    with open(nxml_file, 'r', encoding='utf8') as nxml_handle:
        nxml_content = nxml_handle.read()
    return _read_nxml_str_service_old(nxml_content, url=url)
def _read_nxml_str_service_old(nxml_str, url=reach_nxml_url):
    """Return a json string by processing the given NXML string with remote
    REACH webservice.

    None is returned (after logging the reason) if the request could not
    be made or if the service answered with a status code other than 200.

    Parameters
    ----------
    nxml_str : str
        The NXML string to be processed.
    url : Optional[str]
        URL for REACH service. By default, Arizona REACH web service is called.

    Returns
    -------
    json_str : bytes
        The json string returned by REACH API.
    """
    # The old protocol takes the NXML content as a form field
    data = {'nxml': nxml_str}
    try:
        res = requests.post(url, data)
    except requests.exceptions.RequestException as e:
        logger.error('Could not connect to REACH service:')
        logger.error(e)
        return None
    if res.status_code != 200:
        logger.error('Could not process NXML via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    # The content is a byte string
    return res.content
def _read_nxml_file_service_new(file_name, url=local_nxml_url):
    """Return a json string by processing the given NXML file with locally
    running instance of REACH webservice.

    None is returned (after logging the reason) if the request could not
    be made or if the service answered with a status code other than 200.

    Parameters
    ----------
    file_name : str
        The name of the NXML file to be processed.
    url : Optional[str]
        URL for REACH service. By default, localhost on port 8080 is called.

    Returns
    -------
    json_str : bytes
        The json string returned by REACH API.
    """
    # The new protocol takes the NXML as an uploaded file
    with open(file_name, 'rb') as f:
        try:
            res = requests.post(url, files={'file': f})
        except requests.exceptions.RequestException as e:
            logger.error('Could not connect to REACH service:')
            logger.error(e)
            return None
    if res.status_code != 200:
        logger.error('Could not process NXML via REACH service. '
                     'Status code: %d', res.status_code)
        return None
    # The content is a byte string
    return res.content