from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import re
import sys
import getopt
import xml.dom.minidom
import logging
import requests
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default base URL of the TRIPS web service. send_query appends the endpoint
# name directly to this string when no service_host is given.
base_url = 'http://trips.ihmc.us/parser/cgi/'
def send_query(text, service_endpoint='drum', query_args=None,
               service_host=None):
    """Send a query to the TRIPS web service.

    Parameters
    ----------
    text : str
        The text to be processed.
    service_endpoint : Optional[str]
        Selects the TRIPS/DRUM web service endpoint to use. Accepted values
        are "drum" (default), "drum-dev" (a nightly build), "cwms" (for use
        with more general knowledge extraction) and "cwmsreader".
    query_args : Optional[dict]
        A dictionary of arguments to be passed with the query. The
        dictionary is copied before the 'input' entry is added, so the
        caller's object is left unmodified.
    service_host : Optional[str]
        The server's base URL under which service_endpoint is an endpoint.
        The endpoint name is appended to this string as is, so it should
        end with a slash. By default, IHMC's public server is used.

    Returns
    -------
    html : str
        The HTML result returned by the web service. An empty string is
        returned if the endpoint name is invalid or if the response status
        code is not 200.
    """
    if service_endpoint not in ('drum', 'drum-dev', 'cwms', 'cwmsreader'):
        logger.error('Invalid service endpoint: %s', service_endpoint)
        return ''
    url = (service_host if service_host else base_url) + service_endpoint

    # Work on a copy so that adding 'input' never leaks into the caller's
    # dictionary (which may be reused for subsequent queries).
    params = dict(query_args) if query_args is not None else {}
    params['input'] = text

    res = requests.get(url, params=params, timeout=3600)
    if res.status_code != 200:
        logger.error('Problem with TRIPS query: status code %s',
                     res.status_code)
        return ''
    # Gets unicode content
    return res.text
def get_xml(html, content_tag='ekb', fail_if_empty=False):
    """Extract the content XML from the HTML output of the TRIPS web service.

    Only the first <content_tag>...</content_tag> element found in the HTML
    is used. Its content is collapsed onto a single line by stripping each
    line and concatenating the results. The attributes of the opening tag
    are kept only if they contain an 'xmlns' declaration.

    Parameters
    ----------
    html : str
        The HTML output from the TRIPS web service.
    content_tag : str
        The xml tag used to label the content. Default is 'ekb'.
    fail_if_empty : bool
        If True, and if the xml content found is an empty string, raise an
        exception. Default is False.

    Returns
    -------
    str
        The extraction knowledge base (e.g. EKB) XML that contains the event
        and term extractions, prefixed with an XML declaration. If no
        content is found, an empty <content_tag> element is returned.

    Raises
    ------
    AssertionError
        If fail_if_empty is True and no content was found.
    """
    pattern = r'<%(tag)s(.*?)>(.*?)</%(tag)s>' % {'tag': content_tag}
    matches = re.findall(pattern, html, re.MULTILINE | re.DOTALL)

    events_terms = ''
    meta = ''
    if matches:
        attrs, body = matches[0]
        events_terms = ''.join(line.strip() for line in body.splitlines())
        if 'xmlns' in attrs:
            meta = ' '.join(line.strip() for line in attrs.splitlines())
            # Stripping removes the separator between the tag name and the
            # first attribute when both are on the same line; restore it so
            # the header does not become e.g. '<ekbxmlns=...>'.
            if meta and not meta.startswith(' '):
                meta = ' ' + meta

    # Raised explicitly rather than via an assert statement so that the
    # check is not removed when Python runs with optimizations (-O).
    if fail_if_empty and events_terms == '':
        raise AssertionError(
            "Got empty string for events content from html:\n%s" % html)

    header = ('<?xml version="1.0" encoding="utf-8" standalone="yes"?><%s%s>'
              % (content_tag, meta))
    footer = '</%s>' % content_tag
    return header + events_terms + footer
def save_xml(xml_str, file_name, pretty=True):
    """Save the TRIPS EKB XML in a file.

    The output file is opened (and thereby truncated) before the XML is
    parsed. If pretty is True and xml_str is not well-formed, the parser's
    exception propagates to the caller; the file is still closed.

    Parameters
    ----------
    xml_str : str
        The TRIPS EKB XML string to be saved.
    file_name : str
        The name of the file to save the result in. If the file cannot be
        opened for writing, an error is logged and nothing is written.
    pretty : Optional[bool]
        If True, the XML is pretty printed.
    """
    try:
        fh = open(file_name, 'wt')
    except IOError:
        logger.error('Could not open %s for writing.', file_name)
        return
    # The context manager guarantees the handle is closed even if parsing
    # or writing raises.
    with fh:
        if pretty:
            xml_str = xml.dom.minidom.parseString(xml_str).toprettyxml()
        fh.write(xml_str)
if __name__ == '__main__':
    # Command line entry point: send a string (or the contents of a file)
    # to the default TRIPS endpoint and save the extracted XML to a file.
    # Defaults used when the corresponding option is not given.
    filemode = False
    text = 'Active BRAF phosphorylates MEK1 at Ser222.'
    outfile_name = 'braf_test.xml'
    infile_name = None
    try:
        opts, _ = getopt.getopt(sys.argv[1:], 's:f:o:h',
                                ['string=', 'file=', 'output=', 'help'])
    except getopt.GetoptError as err:
        # Report unknown options / missing arguments without a traceback.
        print(str(err))
        sys.exit(2)
    for o, p in opts:
        if o in ['-h', '--help']:
            print('String mode: python -m indra.sources.trips.client '
                  '--string "RAS binds GTP" --output text.xml')
            print('File mode: python -m indra.sources.trips.client '
                  '--file test.txt --output text.xml')
            sys.exit()
        elif o in ['-s', '--string']:
            text = p
        elif o in ['-f', '--file']:
            filemode = True
            infile_name = p
        elif o in ['-o', '--output']:
            outfile_name = p
    if filemode:
        try:
            fh = open(infile_name, 'rt')
        except IOError:
            print('Could not open %s.' % infile_name)
            # sys.exit rather than the site-provided exit(), with a
            # non-zero status to signal the failure.
            sys.exit(1)
        with fh:
            text = fh.read()
        print('Parsing contents of %s...' % infile_name)
    else:
        print('Parsing string: %s' % text)
    html = send_query(text)
    # Must not be named `xml`: this runs at module level, so that name would
    # replace the imported xml package that save_xml relies on.
    xml_str = get_xml(html)
    save_xml(xml_str, outfile_name)