# Source code for indra.sources.eidos.reader

import os
import json
import datetime
from indra import get_config


# Before the import, we have to deal with the CLASSPATH to avoid clashes
# with REACH.
def _set_classpath():
    """Put Eidos on the Java CLASSPATH and take REACH off it.

    The current ``CLASSPATH`` environment variable is split into its
    entries. Every entry that resolves to the configured ``REACHPATH`` is
    dropped, and the configured ``EIDOSPATH`` is appended unless one of the
    existing entries already resolves to it. The result is written back to
    ``os.environ['CLASSPATH']``; nothing is returned.

    Entries are split and joined with ``os.pathsep`` so that the function
    also works on platforms where the separator is not ``':'``.
    """
    classpath = os.environ.get('CLASSPATH')
    eidos_path = get_config('EIDOSPATH')
    reach_path = get_config('REACHPATH')
    # Normalize the configured paths the same way as the CLASSPATH entries
    # they are compared against, so relative or non-normalized settings
    # still match.
    eidos_abs = os.path.abspath(eidos_path) if eidos_path else None
    reach_abs = os.path.abspath(reach_path) if reach_path else None
    entries = classpath.split(os.pathsep) if classpath else []
    kept_entries = []
    has_eidos = False
    for entry in entries:
        entry_abs = os.path.abspath(entry)
        # Keep every entry except the one pointing at REACH
        if reach_abs is None or entry_abs != reach_abs:
            kept_entries.append(entry)
        # Remember whether Eidos is already on the CLASSPATH
        if eidos_abs is not None and entry_abs == eidos_abs:
            has_eidos = True
    # If Eidos is not on the CLASSPATH yet, add it at the end
    if eidos_path and not has_eidos:
        kept_entries.append(eidos_path)
    # Set the new CLASSPATH
    os.environ['CLASSPATH'] = os.pathsep.join(kept_entries)
# Adjust the CLASSPATH at import time, before indra.java_vm is imported below.
_set_classpath()

from indra.java_vm import autoclass


# Root package of Eidos; class names are appended to this prefix when Eidos
# classes are looked up through autoclass.
eidos_package = 'org.clulab.wm.eidos'



class EidosReader(object):
    """Reader object keeping an instance of the Eidos reader as a singleton.

    The Eidos reader only needs to be initialized when the first piece of
    text is read; subsequent readings are done with the same instance of
    the reader and are therefore faster.

    Attributes
    ----------
    eidos_reader : org.clulab.wm.eidos.EidosSystem
        A Scala object, an instance of the Eidos reading system. It is
        instantiated only when first processing text.
    default_ontology : object or None
        Cached return value of ``world_ontology.dump_yml_str()``. It is
        None until ``get_default_ontology`` is first called.
    """

    def __init__(self):
        self.eidos_reader = None
        self.default_ontology = None

    def get_default_ontology(self):
        """Return the default ontology, loading and caching it on first use.

        Returns
        -------
        default_ontology
            The value returned by ``world_ontology.dump_yml_str()``
            (presumably a YAML string). It is computed once and then
            reused on subsequent calls.
        """
        if self.default_ontology is None:
            # Imported lazily so that indra_world is only needed when the
            # default ontology is actually requested.
            from indra_world.ontology import world_ontology
            self.default_ontology = world_ontology.dump_yml_str()
        return self.default_ontology

    def initialize_reader(self):
        """Instantiate the Eidos reader attribute of this reader."""
        eidos = autoclass(eidos_package + '.EidosSystem')
        self.eidos_reader = eidos()

    def reground_texts(self, texts, yaml_str=None, topk=10,
                       is_canonicalized=False, filter=True):
        """Return groundings of the given texts against an ontology.

        The Eidos reader is initialized on first use. If no ontology is
        given, the one returned by ``get_default_ontology`` is used.

        Parameters
        ----------
        texts : list
            The texts to ground. They are converted into a Scala sequence
            before being passed to Eidos.
        yaml_str : Optional[str]
            The ontology to ground against, passed to Eidos as its
            ``ontologyYaml`` argument. Default: None, in which case the
            default ontology is used.
        topk : Optional[int]
            Passed to Eidos as its ``topk`` argument; presumably the
            maximum number of groundings per text. Default: 10
        is_canonicalized : Optional[bool]
            Passed to Eidos as its ``isAlreadyCanonicalized`` argument.
            Default: False
        filter : Optional[bool]
            Passed to Eidos as its ``filter`` argument. Default: True

        Returns
        -------
        groundings : list of list of tuple
            One list per element returned by Eidos, each holding
            ``(str, float)`` pairs parsed from the returned entries.
        """
        if self.eidos_reader is None:
            self.initialize_reader()
        if yaml_str is None:
            yaml_str = self.get_default_ontology()
        text_seq = _list_to_seq(texts)
        raw_groundings = \
            self.eidos_reader.components().ontologyHandler().reground(
                'Custom',  # name
                yaml_str,  # ontologyYaml
                text_seq,  # texts
                filter,  # filter
                topk,  # topk
                is_canonicalized  # isAlreadyCanonicalized
            )
        # Process the return values into a proper Python representation
        groundings = [[_get_scored_grounding(entry) for entry in text_grounding]
                      for text_grounding in raw_groundings]
        return groundings

    def process_text(self, text):
        """Return a mentions JSON object given text.

        Parameters
        ----------
        text : str
            Text to be processed.

        Returns
        -------
        json_dict : dict
            A JSON object of mentions extracted from text.
        """
        if self.eidos_reader is None:
            self.initialize_reader()
        # Optional arguments are handed to Eidos wrapped in scala.Some; the
        # class is resolved once and reused for both arguments.
        scala_some = autoclass('scala.Some')
        today = datetime.date.today().strftime("%Y-%m-%d")
        fname = 'default_file_name'

        annot_doc = self.eidos_reader.extractFromText(
            text,
            False,  # CAG-relevant only
            scala_some(today),  # doc creation time
            scala_some(fname)  # file name
            )
        # We need to get a Scala Seq of annot docs here
        ml = _list_to_seq([annot_doc])
        # We currently do not need to instantiate the adjective grounder
        # if we want to reinstate it, we would need to do the following
        # ag = EidosAdjectiveGrounder.fromConfig(
        #   EidosSystem.defaultConfig.getConfig("adjectiveGrounder"))
        # We now create a JSON-LD corpus
        jc = autoclass(eidos_package + '.serialization.jsonld.JLDCorpus')
        corpus = jc(ml)
        # Finally, serialize the corpus into JSON string
        mentions_json = corpus.toJsonStr()
        json_dict = json.loads(mentions_json)
        return json_dict




def _list_to_seq(lst):
    """Build a Scala MutableList holding the elements of a Python list.

    A new ``scala.collection.mutable.MutableList`` is created and each
    element of ``lst`` is appended to it in order; the filled list is
    returned.
    """
    mutable_list_cls = autoclass('scala.collection.mutable.MutableList')
    seq = mutable_list_cls()
    for item in lst:
        seq.appendElem(item)
    return seq


def _get_scored_grounding(tpl):
    """Turn a grounding tuple into a Python ``(name, score)`` pair.

    The object's ``toString()`` form is used: the first and last characters
    are stripped and the remainder is split at its last comma. The part
    before that comma is returned unchanged and the part after it is
    converted to a float.
    """
    inner = tpl.toString()[1:-1]
    # Split on the last comma only, so commas in the name part are kept
    pieces = inner.rsplit(',', 1)
    name = pieces[0]
    score = float(pieces[1])
    return name, score