# Source code for indra.sources.eidos.reader

import os
import json
import datetime
from indra import get_config


# Before the import, we have to deal with the CLASSPATH to avoid clashes
# with REACH.
def _set_classpath():
    """Rewrite the CLASSPATH environment variable for use with Eidos.

    Every existing CLASSPATH entry whose absolute path equals the
    configured REACHPATH is dropped, and the configured EIDOSPATH is
    appended unless one of the existing entries already resolves to it.
    The result is written back to ``os.environ['CLASSPATH']``.

    Entries are split and joined with ``os.pathsep`` so that the
    platform's separator is used (``:`` on POSIX, ``;`` on Windows).
    """
    classpath = os.environ.get('CLASSPATH')
    eidos_path = get_config('EIDOSPATH')
    reach_path = get_config('REACHPATH')
    parts = classpath.split(os.pathsep) if classpath else []
    kept_parts = []
    has_eidos = False
    # Look at all the parts of the CLASSPATH
    for part in parts:
        abs_part = os.path.abspath(part)
        # Keep everything except an entry pointing at REACH
        if not reach_path or abs_part != reach_path:
            kept_parts.append(part)
        # Remember whether Eidos is already on the CLASSPATH
        if eidos_path and abs_part == eidos_path:
            has_eidos = True
    # If Eidos is configured but not on the CLASSPATH, add it
    if eidos_path and not has_eidos:
        kept_parts.append(eidos_path)
    # Set the new CLASSPATH
    os.environ['CLASSPATH'] = os.pathsep.join(kept_parts)


_set_classpath()

from indra.java_vm import autoclass


# Java package prefix prepended to the Eidos class names that this module
# loads through autoclass (e.g. eidos_package + '.EidosSystem').
eidos_package = 'org.clulab.wm.eidos'


class EidosReader(object):
    """Reader object keeping an instance of the Eidos reader as a singleton.

    The Eidos reader is only instantiated when it is first needed; all
    subsequent calls reuse the same instance and are therefore faster.

    Attributes
    ----------
    eidos_reader : org.clulab.wm.eidos.EidosSystem
        A Scala object, an instance of the Eidos reading system. It is
        None until `initialize_reader` is called, which happens
        automatically on first use.
    default_ontology : str or None
        Cached result of `get_default_ontology`; None until that method
        is first called.
    """
    def __init__(self):
        self.eidos_reader = None
        self.default_ontology = None

    def get_default_ontology(self):
        """Return the default ontology string, loading it on first use.

        On the first call, the value is obtained from
        ``indra_world.ontology.world_ontology.dump_yml_str()`` and cached
        on the instance; later calls return the cached value.
        """
        if self.default_ontology is None:
            # Imported lazily so indra_world is only needed when the
            # default ontology is actually requested.
            from indra_world.ontology import world_ontology
            self.default_ontology = world_ontology.dump_yml_str()
        return self.default_ontology

    def initialize_reader(self):
        """Instantiate the Eidos reader attribute of this reader."""
        eidos = autoclass(eidos_package + '.EidosSystem')
        self.eidos_reader = eidos()

    def reground_texts(self, texts, yaml_str=None, topk=10,
                       is_canonicalized=False, filter=True):
        """Return groundings for a list of texts against an ontology.

        The reader is initialized first if that has not happened yet.

        Parameters
        ----------
        texts : list
            The texts to ground; converted into a Scala sequence before
            being handed to Eidos.
        yaml_str : Optional[str]
            The ontology passed to Eidos as its ``ontologyYaml`` argument.
            If None, the value of `get_default_ontology` is used.
        topk : Optional[int]
            Passed to Eidos as its ``topk`` argument. Default: 10
        is_canonicalized : Optional[bool]
            Passed to Eidos as its ``isAlreadyCanonicalized`` argument.
            Default: False
        filter : Optional[bool]
            Passed to Eidos as its ``filter`` argument. Default: True

        Returns
        -------
        groundings : list
            One list per element returned by Eidos, each containing the
            result of `_get_scored_grounding` for every entry in it.
        """
        if self.eidos_reader is None:
            self.initialize_reader()
        if yaml_str is None:
            yaml_str = self.get_default_ontology()
        text_seq = _list_to_seq(texts)
        raw_groundings = \
            self.eidos_reader.components().ontologyHandler().reground(
                'Custom',  # name
                yaml_str,  # ontologyYaml
                text_seq,  # texts
                filter,  # filter
                topk,  # topk
                is_canonicalized  # isAlreadyCanonicalized
            )
        # Process the return values into a proper Python representation
        groundings = [[_get_scored_grounding(entry) for entry
                       in text_grounding]
                      for text_grounding in raw_groundings]
        return groundings

    def process_text(self, text):
        """Return a mentions JSON object given text.

        The reader is initialized first if that has not happened yet.

        Parameters
        ----------
        text : str
            Text to be processed.

        Returns
        -------
        json_dict : dict
            A JSON object of mentions extracted from text.
        """
        if self.eidos_reader is None:
            self.initialize_reader()
        # Optional arguments are handed over wrapped in scala.Some
        scala_some = autoclass('scala.Some')
        today = datetime.date.today().strftime("%Y-%m-%d")
        fname = 'default_file_name'

        annot_doc = self.eidos_reader.extractFromText(
            text,
            False,  # CAG-relevant only
            scala_some(today),  # doc creation time
            scala_some(fname)  # file name
        )

        # We need to get a Scala Seq of annot docs here
        ml = _list_to_seq([annot_doc])
        # We currently do not need to instantiate the adjective grounder
        # if we want to reinstate it, we would need to do the following
        # ag = EidosAdjectiveGrounder.fromConfig(
        #   EidosSystem.defaultConfig.getConfig("adjectiveGrounder"))
        # We now create a JSON-LD corpus
        jc = autoclass(eidos_package + '.serialization.jsonld.JLDCorpus')
        corpus = jc(ml)
        # Finally, serialize the corpus into JSON string
        mentions_json = corpus.toJsonStr()
        json_dict = json.loads(mentions_json)
        return json_dict
def _list_to_seq(lst): """Return a scala.collection.Seq from a Python list.""" ml = autoclass('scala.collection.mutable.MutableList')() for element in lst: ml.appendElem(element) return ml def _get_scored_grounding(tpl): ts = tpl.toString() parts = ts[1:-1].rsplit(',', maxsplit=1) return parts[0], float(parts[1])