Source code for indra.sources.eidos.processor

import re
import copy
import logging
import objectpath
from indra.statements import *


logger = logging.getLogger(__name__)


class EidosProcessor(object):
    """This processor extracts INDRA Statements from Eidos JSON-LD output.

    Parameters
    ----------
    json_dict : dict
        A JSON dictionary containing the Eidos extractions in JSON-LD format.

    Attributes
    ----------
    doc : EidosDocument
        The pre-processed document wrapping `json_dict`; provides the
        extractions, entities, sentences, documents and coreferences
        that the extraction methods look up.
    statements : list[indra.statements.Statement]
        A list of INDRA Statements that were extracted by the processor.
    """
    def __init__(self, json_dict):
        self.doc = EidosDocument(json_dict)
        self.statements = []

    def extract_causal_relations(self):
        """Extract causal relations as Statements.

        Every extraction labeled both 'DirectedRelation' and 'Causal' is
        converted with `get_causal_relation`; results that are not None
        are appended to `self.statements`.
        """
        # Get the extractions that are labeled as directed and causal
        relations = [e for e in self.doc.extractions if
                     'DirectedRelation' in e['labels'] and
                     'Causal' in e['labels']]
        # For each relation, we try to extract an INDRA Statement and
        # save it if it's valid
        for relation in relations:
            stmt = self.get_causal_relation(relation)
            if stmt is not None:
                self.statements.append(stmt)

    def extract_correlations(self):
        """Extract correlations as Association statements.

        Every extraction labeled both 'UndirectedRelation' and
        'Correlation' that has exactly two 'argument' entries is turned
        into an Association and appended to `self.statements`.
        Extractions with any other number of arguments are skipped with
        a warning.
        """
        events = [e for e in self.doc.extractions if
                  'UndirectedRelation' in e['labels'] and
                  'Correlation' in e['labels']]
        for event in events:
            # An Association is built from exactly two members here;
            # hypergraph-like correlations are not handled.
            arg_ids = find_args(event, 'argument')
            if len(arg_ids) != 2:
                logger.warning('Skipping correlation with not 2 arguments.')
                # Actually skip: without this the malformed correlation
                # would still be turned into an Association below.
                continue

            # Resolve coreferences by ID
            arg_ids = [self.doc.coreferences.get(arg_id, arg_id)
                       for arg_id in arg_ids]
            # Get the actual entities
            args = [self.doc.entities[arg_id] for arg_id in arg_ids]
            # Make Events from the entities
            members = [self.get_event(arg) for arg in args]
            # Get the evidence
            evidence = self.get_evidence(event)

            st = Association(members, evidence=[evidence])
            self.statements.append(st)

    def extract_events(self):
        """Extract Events that are not arguments of other statements."""
        self._extract_event_by_label({'Concept-Expanded'})

    def extract_all_events(self):
        """Extract all events, including ones that are arguments of other
        statements.

        The goal of this method is to extract events as standalone
        statements with their own dedicated evidence. This is different
        from the get_all_events method in that it extracts the
        event-specific evidence for each Event statement instead of
        propagating causal relation evidence into the Event after
        initial extraction.
        """
        self._extract_event_by_label({'Concept', 'Concept-Expanded'})

    def _extract_event_by_label(self, event_labels):
        """Append an Event for every extraction carrying one of the labels.

        Parameters
        ----------
        event_labels : set[str]
            An extraction is selected if its 'labels' list shares at least
            one element with this set.
        """
        events = [e for e in self.doc.extractions if
                  event_labels & set(e['labels'])]
        for event_entry in events:
            event = self.get_event(event_entry)
            evidence = self.get_evidence(event_entry)
            event.evidence = [evidence]
            # Move the context from the evidence onto the event itself
            # when the event does not already have one
            if not event.context and evidence.context:
                event.context = copy.deepcopy(evidence.context)
                evidence.context = None
            self.statements.append(event)

    def get_event_by_id(self, event_id):
        """Return the Event for an entity ID, following coreferences.

        Raises a KeyError if the resolved ID is not a known entity.
        """
        # Resolve coreferences by ID
        event_id = self.doc.coreferences.get(event_id, event_id)
        # Get the actual entity
        event = self.doc.entities[event_id]
        return self.get_event(event)

    def get_event(self, event):
        """Return an Event built from an Eidos entity dict.

        The entity's 'states' (if any) determine the polarity and
        adjectives of the Event's QualitativeDelta.
        """
        concept = self.get_concept(event)
        states = event.get('states', [])
        extracted_states = self.extract_entity_states(states)
        polarity = extracted_states.get('polarity')
        adjectives = extracted_states.get('adjectives')
        delta = QualitativeDelta(polarity=polarity, adjectives=adjectives)
        stmt = Event(concept, delta=delta)
        return stmt

    def get_causal_relation(self, relation):
        """Return an Influence for a causal relation, or None.

        None is returned when the relation lacks a 'source' or a
        'destination' argument.
        """
        # For now, just take the first source and first destination.
        # Later, might deal with hypergraph representation.
        subj_id = find_arg(relation, 'source')
        obj_id = find_arg(relation, 'destination')
        if subj_id is None or obj_id is None:
            return None

        subj = self.get_event_by_id(subj_id)
        obj = self.get_event_by_id(obj_id)
        evidence = self.get_evidence(relation)

        # We also put the adjectives and polarities into annotations since
        # they could otherwise get squashed upon preassembly
        evidence.annotations['subj_polarity'] = subj.delta.polarity
        evidence.annotations['obj_polarity'] = obj.delta.polarity
        evidence.annotations['subj_adjectives'] = subj.delta.adjectives
        evidence.annotations['obj_adjectives'] = obj.delta.adjectives
        evidence.annotations['subj_context'] = subj.context.to_json() if \
            subj.context else {}
        evidence.annotations['obj_context'] = obj.context.to_json() if \
            obj.context else {}

        st = Influence(subj, obj, evidence=[evidence])
        return st

    def get_evidence(self, relation):
        """Return the Evidence object for the INDRA Statement.

        Parameters
        ----------
        relation : dict
            An Eidos extraction. Its 'provenance', 'rule', 'states' and
            'text' entries are read. Note that when a document title is
            found, it is written into the first provenance entry in place.

        Returns
        -------
        indra.statements.Evidence
            Evidence with source_api 'eidos'. The text is the full
            sentence when it can be resolved through the provenance,
            otherwise the text of the extraction itself (None if the
            extraction has no text).
        """
        provenance = relation.get('provenance')

        # First try looking up the full sentence through provenance
        text = None
        context = None
        if provenance:
            sentence_tag = provenance[0].get('sentence')
            if sentence_tag and '@id' in sentence_tag:
                sentence_id = sentence_tag['@id']
                sentence = self.doc.sentences.get(sentence_id)
                if sentence is not None:
                    text = _sanitize(sentence['text'])

            # Here we try to get the title of the document and set it
            # in the provenance
            doc_id = provenance[0].get('document', {}).get('@id')
            if doc_id:
                title = self.doc.documents.get(doc_id, {}).get('title')
                if title:
                    provenance[0]['document']['title'] = title

        annotations = {'found_by': relation.get('rule'),
                       'provenance': provenance}
        if self.doc.dct is not None:
            annotations['document_creation_time'] = self.doc.dct.to_json()

        epistemics = {}
        negations = self.get_negation(relation)
        hedgings = self.get_hedging(relation)
        if hedgings:
            epistemics['hedgings'] = hedgings
        if negations:
            # This is the INDRA standard to show negation
            epistemics['negated'] = True
            # But we can also save the texts associated with the negation
            # under annotations, just in case it's needed
            annotations['negated_texts'] = negations

        # If that fails, we can still get the text of the relation
        if text is None:
            relation_text = relation.get('text')
            # Guard against a missing text: sanitizing None would raise a
            # TypeError in the regex substitution
            if relation_text is not None:
                text = _sanitize(relation_text)

        ev = Evidence(source_api='eidos', text=text, annotations=annotations,
                      context=context, epistemics=epistemics)
        return ev

    @staticmethod
    def get_negation(event):
        """Return negation attached to an event.

        Example: "states": [{"@type": "State", "type": "NEGATION",
                            "text": "n't"}]

        Returns
        -------
        list[str]
            The texts of all states of type 'NEGATION'; empty if the
            event has no states.
        """
        states = event.get('states', [])
        if not states:
            return []
        negs = [state for state in states
                if state.get('type') == 'NEGATION']
        neg_texts = [neg['text'] for neg in negs]
        return neg_texts

    @staticmethod
    def get_hedging(event):
        """Return hedging markers attached to an event.

        Example: "states": [{"@type": "State", "type": "HEDGE",
                            "text": "could"}]

        Returns
        -------
        list[str]
            The texts of all states of type 'HEDGE'; empty if the event
            has no states.
        """
        states = event.get('states', [])
        if not states:
            return []
        hedgings = [state for state in states
                    if state.get('type') == 'HEDGE']
        hedging_texts = [hedging['text'] for hedging in hedgings]
        return hedging_texts

    def extract_entity_states(self, states):
        """Return the polarity and adjectives encoded in a list of states.

        Parameters
        ----------
        states : list[dict] or None
            Eidos state entries; each must have a 'type' key.

        Returns
        -------
        dict
            A dict with keys 'polarity' (-1, 1 or None) and 'adjectives'
            (list of str).
        """
        if states is None:
            return {'polarity': None, 'adjectives': []}
        polarity = None
        adjectives = []
        for state in states:
            # Only the first DEC/INC state sets the polarity; once it is
            # set, all later states (including QUANT ones) are ignored.
            if polarity is None:
                if state['type'] == 'DEC':
                    polarity = -1
                    # Handle None entry here
                    mods = state.get('modifiers') or []
                    adjectives += [mod['text'] for mod in mods]
                elif state['type'] == 'INC':
                    polarity = 1
                    mods = state.get('modifiers') or []
                    adjectives += [mod['text'] for mod in mods]
                elif state['type'] == 'QUANT':
                    adjectives.append(state['text'])
        return {'polarity': polarity, 'adjectives': adjectives}

    def get_groundings(self, entity):
        """Return groundings as db_refs for an entity."""
        return {'TEXT': entity['text']}

    def get_concept(self, entity):
        """Return Concept from an Eidos entity."""
        # Use the canonical name as the name of the Concept
        name = entity['canonicalName']
        db_refs = self.get_groundings(entity)
        concept = Concept(name, db_refs=db_refs)
        return concept

    def get_all_events(self):
        """Return a list of all standalone events from the existing list
        of extracted statements.

        Note that this method only operates on statements already
        extracted into the processor's statements attribute. Note also
        that the evidences for events created from Influences and
        Associations here are propagated from those statements; they are
        not equivalent to the original evidences for the events themselves
        (see extract_all_events method).

        Returns
        -------
        events : list[indra.statements.Event]
            A list of Events from original Events, and unrolled from
            Influences and Associations.
        """
        events = []
        for stmt in self.statements:
            # Work on a copy so the processor's own statements are not
            # modified
            stmt = copy.deepcopy(stmt)
            if isinstance(stmt, Influence):
                for member in [stmt.subj, stmt.obj]:
                    member.evidence = stmt.evidence[:]
                    # Remove the context since it may be for the other
                    # member
                    for ev in member.evidence:
                        ev.context = None
                    events.append(member)
            elif isinstance(stmt, Association):
                for member in stmt.members:
                    member.evidence = stmt.evidence[:]
                    # Remove the context since it may be for the other
                    # member
                    for ev in member.evidence:
                        ev.context = None
                    events.append(member)
            elif isinstance(stmt, Event):
                events.append(stmt)
        return events
class EidosDocument(object): def __init__(self, json_dict): self.tree = objectpath.Tree(json_dict) self.extractions = [] self.sentences = {} self.entities = {} self.documents = {} self.coreferences = {} self.dct = None self._preprocess_extractions() def _preprocess_extractions(self): extractions = \ self.tree.execute("$.extractions[(@.@type is 'Extraction')]") if not extractions: return # Listify for multiple reuse self.extractions = list(extractions) # Build a dictionary of entities entities = [e for e in self.extractions if 'Concept' in e.get('labels', [])] self.entities = {entity['@id']: entity for entity in entities} # Build a dictionary of sentences and document creation times (DCTs) documents = self.tree.execute("$.documents[(@.@type is 'Document')]") self.sentences = {} for document in documents: title = document.get('title') self.documents[document['@id']] = {'title': title} # We stash the DCT here as a TimeContext object sentences = document.get('sentences', []) for sent in sentences: self.sentences[sent['@id']] = sent # Build a dictionary of coreferences for extraction in self.extractions: if 'Coreference' in extraction['labels']: reference = find_arg(extraction, 'reference') anchor = find_arg(extraction, 'anchor') self.coreferences[reference] = anchor def _sanitize(text): """Return sanitized Eidos text field for human readability.""" d = {'-LRB-': '(', '-RRB-': ')'} return re.sub('|'.join(d.keys()), lambda m: d[m.group(0)], text)
def find_arg(event, arg_type):
    """Return ID of the first argument of a given type.

    Parameters
    ----------
    event : dict
        An Eidos extraction.
    arg_type : str
        The argument type to look for.

    Returns
    -------
    The ID of the first matching argument, or None if the event has no
    argument of that type.
    """
    matches = find_args(event, arg_type)
    return matches[0] if matches else None
def find_args(event, arg_type):
    """Return IDs of all arguments of a given type.

    Parameters
    ----------
    event : dict
        An Eidos extraction; its 'arguments' entry (if present) is
        scanned.
    arg_type : str
        Only arguments whose 'type' equals this value are kept.

    Returns
    -------
    list
        The '@id' of the 'value' of every matching argument, in order;
        empty if nothing matches.
    """
    # First collect the matching argument entries...
    matching = []
    for candidate in event.get('arguments', {}):
        if candidate['type'] == arg_type:
            matching.append(candidate)
    # ...then pull the referenced IDs out of them
    return [tag['value']['@id'] for tag in matching]