import os
import logging
from copy import deepcopy
import indra.statements as ist
from indra.preassembler.grounding_mapper.gilda import ground_statements
logger = logging.getLogger(__name__)
class IsiProcessor(object):
"""Processes the output of the ISI reader.
Parameters
----------
reader_output : json
The output JSON of the ISI reader as a json object.
pmid : Optional[str]
The PMID to assign to the extracted Statements
extra_annotations : Optional[dict]
Annotations to be included with each extracted Statement
add_grounding : Optional[bool]
If True, Gilda is used as a service to ground the Agents in
the extracted Statements.
Attributes
----------
verbs : set[str]
A list of verbs that have appeared in the processed ISI output
statements : list[indra.statements.Statement]
Extracted statements
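
    Examples
    --------
    A minimal usage sketch; ``isi_output.json`` is a hypothetical file
    holding ISI reader output::

        import json
        with open('isi_output.json') as fh:
            ip = IsiProcessor(json.load(fh), pmid='12345')
        ip.get_statements()
        stmts = ip.statements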
"""
def __init__(self, reader_output, pmid=None, extra_annotations=None,
add_grounding=False):
self.reader_output = reader_output
self.pmid = pmid
self.extra_annotations = extra_annotations if \
extra_annotations is not None else {}
self.verbs = set()
self.statements = []
self.add_grounding = add_grounding
    def get_statements(self):
"""Process reader output to produce INDRA Statements."""
for k, v in self.reader_output.items():
for interaction in v['interactions']:
self._process_interaction(k, interaction, v['text'], self.pmid,
self.extra_annotations)
if self.add_grounding:
ground_statements(self.statements)
def _process_interaction(self, source_id, interaction, text, pmid,
extra_annotations):
"""Process an interaction JSON tuple from the ISI output, and adds up
to one statement to the list of extracted statements.
Parameters
----------
source_id : str
the JSON key corresponding to the sentence in the ISI output
interaction: the JSON list with subject/verb/object information
about the event in the ISI output
text : str
the text of the sentence
pmid : str
the PMID of the article from which the information was extracted
extra_annotations : dict
Additional annotations to add to the statement's evidence,
potentially containing metadata about the source. Annotations
with the key "interaction" will be overridden by the JSON
interaction tuple from the ISI output
"""
# Note: interaction[1] is a catalyst, but unused due to a lack of ways
# to represent it with Statements.
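        # The interaction is assumed, from the indexing here, to have the
        # shape [verb, catalyst, ..., subject, object].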
verb = interaction[0].lower()
subj = interaction[-2]
obj = interaction[-1]
        # Make ungrounded Agent objects for the subject and object;
        # grounding happens in get_statements, after all Statements have
        # been extracted
subj = self._make_agent(subj)
obj = self._make_agent(obj)
# Make an evidence object
annotations = deepcopy(extra_annotations)
if 'interaction' in extra_annotations:
logger.warning("'interaction' key of extra_annotations ignored" +
" since this is reserved for storing the raw ISI " +
"input.")
annotations['source_id'] = source_id
annotations['interaction'] = interaction
ev = ist.Evidence(source_api='isi',
pmid=pmid,
text=text.rstrip(),
annotations=annotations)
# Add the verb to the set of verbs.
self.verbs.add(verb)
statement = None
if verb in verb_to_statement_type:
statement_class = verb_to_statement_type[verb]
if statement_class == ist.Complex:
statement = ist.Complex([subj, obj], evidence=ev)
else:
statement = statement_class(subj, obj, evidence=ev)
if statement is not None:
# For Complex statements, the ISI reader produces two events:
# binds(A, B) and binds(B, A)
# We want only one Complex statement for each sentence, so check
# to see if we already have a Complex for this source_id with the
# same members
already_have = False
if type(statement) == ist.Complex:
            for old_s in self.statements:
                new_id = statement.evidence[0].source_id
                old_id = old_s.evidence[0].source_id
                if type(old_s) == ist.Complex and old_id == new_id:
old_statement_members = \
[m.db_refs['TEXT'] for m in old_s.members]
old_statement_members = sorted(old_statement_members)
new_statement_members = [m.db_refs['TEXT']
for m in statement.members]
new_statement_members = sorted(new_statement_members)
if old_statement_members == new_statement_members:
already_have = True
break
if not already_have:
self.statements.append(statement)
    @staticmethod
    def _make_agent(agent_str):
        """Make an ungrounded Agent object from a string specifying an
        entity.

        Parameters
        ----------
        agent_str : str
            A string specifying the agent.

        Returns
        -------
        agent : indra.statements.Agent
            An ungrounded Agent object referring to the specified text.
        """
        return ist.Agent(agent_str, db_refs={'TEXT': agent_str})
    def retain_molecular_complexes(self):
"""Filter the statements to Complexes between molecular entities."""
self.statements = [s for s in self.statements
if isinstance(s, ist.Complex) and
all(is_molecular(m) for m in s.members)]
def is_molecular(agent):
    """Return True if the given Agent is grounded as a molecular entity."""
    if agent is None:
        return False
    db_ns, _ = agent.get_grounding()
    # Namespaces corresponding to molecular entities: genes (HGNC),
    # proteins (UP, UPPRO), chemicals (CHEBI, PUBCHEM), and protein
    # families/complexes (FPLX)
    return db_ns in {'HGNC', 'UP', 'CHEBI', 'PUBCHEM', 'UPPRO', 'FPLX'}
# Load the mapping between ISI verb and INDRA statement type
def _build_verb_statement_mapping():
"""Build the mapping between ISI verb strings and INDRA statement classes.
Looks up the INDRA statement class name, if any, in a resource file,
and resolves this class name to a class.
Returns
-------
verb_to_statement_type : dict
Dictionary mapping verb name to an INDRA statment class
"""
path_this = os.path.dirname(os.path.abspath(__file__))
map_path = os.path.join(path_this, 'isi_verb_to_indra_statement_type.tsv')
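    # The resource file is assumed to be a two-column TSV with a header
    # row, mapping an ISI verb to an INDRA Statement class name; the rows
    # shown here are illustrative:
    #   binds<TAB>Complex
    #   phosphorylates<TAB>Phosphorylation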
    verb_to_statement_type = {}
    with open(map_path, 'r') as f:
        next(f, None)  # Skip the header line
        for line in f:
            tokens = line.rstrip('\n').split('\t')
            if len(tokens) == 2 and tokens[1]:
                verb, s_type = tokens
                # Resolve the Statement class name to a class; skip
                # entries whose mapped name is not an INDRA Statement type
                statement_class = getattr(ist, s_type, None)
                if statement_class is not None:
                    verb_to_statement_type[verb] = statement_class
    return verb_to_statement_type
verb_to_statement_type = _build_verb_statement_mapping()
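# Example lookup (assuming 'binds' appears among the mapped verbs in the
# resource file):
#   verb_to_statement_type.get('binds') is ist.Complex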