Source code for indra.sources.geneways.processor

"""
This module provides an input processor for information extracted using the
Geneways software suite, converting extraction data in Geneways format into
INDRA statements.

See publication:
Rzhetsky, Andrey, Ivan Iossifov, Tomohiro Koike, Michael Krauthammer, Pauline
Kra, Mitzi Morris, Hong Yu et al. "GeneWays: a system for extracting,
analyzing, visualizing, and integrating molecular pathway data."
Journal of biomedical informatics 37, no. 1 (2004): 43-53.
"""

import logging

from indra.statements import Evidence, Agent
import indra.databases.hgnc_client as hgc
from indra.literature import *
from indra.statements import Complex, Phosphorylation
from indra.ontology.standardize import \
    standardize_agent_name
from indra.sources.geneways.action_parser import GenewaysActionParser
# Create the module logger before attempting the optional import below, so
# that the ImportError handler reports through this module's own logger
# instead of referencing a name that has not been defined yet.
logger = logging.getLogger(__name__)

# The full-text sentence finder is optional; get_ft_mention records whether
# it could be imported.
try:
    from indra.sources.geneways.find_full_text_sentence import FullTextMention
    get_ft_mention = True
except ImportError:
    logger.error('Install the nltk and stemming packages to extract full '
                 'text evidence for Geneways mentions.')
    get_ft_mention = False


# Converts each non-negated Geneways action mention (together with its parent
# action) into a single INDRA statement
class GenewaysProcessor(object):
    """The GenewaysProcessors converts extracted Geneways action mentions into
    INDRA statements.

    Parameters
    ----------
    search_path : list[str]
        A list of directories in which to search for Geneways data
    get_evidence : Optional[bool]
        If True, and the module-level `get_ft_mention` flag is also set,
        the processor tries to attach a matching full-text sentence to the
        evidence of each statement. Default: True

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA statements converted from Geneways action mentions,
        populated by calling the constructor
    get_ft_mention : bool
        Whether full-text sentences are looked up when making statements.
    """
    def __init__(self, search_path, get_evidence=True):
        # Full-text lookup requires both the caller's opt-in and the
        # module-level flag to be truthy.
        self.get_ft_mention = bool(get_evidence and get_ft_mention)

        # Parse Geneways data. Will give an error if it can't find
        # the Geneways data
        logger.info('Loading Geneways extractions')
        parser = GenewaysActionParser(search_path)
        logger.info('Geneways extractions loaded')
        actions = parser.actions

        # Make a list of statements from the actions
        self.statements = []
        for action in actions:
            for mention in action.action_mentions:
                # Mentions flagged as negative ('1') are skipped
                if mention.negative == '1':
                    continue
                new_statement = self.make_statement(action, mention)
                if new_statement is not None:
                    self.statements.append(new_statement)

    def make_statement(self, action, mention):
        """Makes an INDRA statement from a Geneways action and action mention.

        Parameters
        ----------
        action : GenewaysAction
            The mechanism that the Geneways mention maps to. Note that
            several text mentions can correspond to the same action if they
            are referring to the same relationship - there may be multiple
            Geneways action mentions corresponding to each action.
        mention : GenewaysActionMention
            The Geneways action mention object corresponding to a single
            mention of a mechanism in a specific text. We make a new INDRA
            statement corresponding to each action mention.

        Returns
        -------
        statement : indra.statements.Statement
            An INDRA statement corresponding to the provided Geneways action
            mention, or None if the action mention's type does not map onto
            any INDRA statement type in
            geneways_action_to_indra_statement_type.
        """
        (statement_generator, is_direct) = \
            geneways_action_to_indra_statement_type(mention.actiontype,
                                                    action.plo)

        if statement_generator is None:
            # Geneways statement does not map onto an indra statement
            return None

        # Try to find the full-text sentence (None if unavailable)
        text = self._find_evidence_text(mention)

        # Make an evidence object
        epistemics = {'direct': is_direct}
        annotations = mention.make_annotation()
        annotations['plo'] = action.plo  # plo only in action table
        evidence = Evidence(source_api='geneways',
                            source_id=mention.actionmentionid,
                            pmid=mention.pmid, text=text,
                            epistemics=epistemics,
                            annotations=annotations)

        # Construct the grounded and name standardized agents
        # Note that this involves grounding the agent by
        # converting the Entrez ID listed in the Geneways data with
        # HGNC and UniProt
        upstream_agent = get_agent(mention.upstream, action.up)
        downstream_agent = get_agent(mention.downstream, action.dn)

        # Make the statement
        return statement_generator(upstream_agent, downstream_agent, evidence)

    def _find_evidence_text(self, mention):
        """Return the single full-text sentence matching a mention, or None.

        Unfortunately, the sentence numbers in the Geneways dataset
        don't correspond to an obvious sentence segmentation.
        This code looks for sentences with the subject, object, and verb
        listed by the Geneways action mention table and only returns
        a sentence if there is exactly one such sentence.

        Parameters
        ----------
        mention : GenewaysActionMention
            The action mention whose full-text sentence is looked up.

        Returns
        -------
        text : str or None
            The unique matching sentence, or None if full-text lookup is
            disabled, no content was obtained, the number of matching
            sentences is not exactly one, or the lookup raised an exception.
        """
        if not self.get_ft_mention:
            return None
        try:
            content, _ = get_full_text(mention.pmid, 'pmid')
            if content is None:
                return None
            ftm = FullTextMention(mention, content)
            sentences = ftm.find_matching_sentences()
            if len(sentences) == 1:
                return sentences[0]
        except Exception:
            # Evidence text is best-effort: log and carry on without it.
            # Lazy %-args avoid a TypeError here if pmid is not a string.
            logger.warning('Could not fetch full text for PMID %s',
                           mention.pmid)
        return None
def get_agent(raw_name, entrez_id):
    """Return a grounded and name standardized Agent for a Geneways entity.

    Parameters
    ----------
    raw_name : str
        The name used to construct the Agent; it is also stored under the
        'TEXT' key of the Agent's db_refs. (Presumably the entity text from
        the Geneways mention - confirm against the caller.)
    entrez_id : str
        The Entrez ID stored under the 'EGID' key of db_refs and used to
        look up an HGNC ID.

    Returns
    -------
    agent : indra.statements.Agent
        An Agent whose db_refs contain 'TEXT', 'EGID' and, if the HGNC
        lookup returned a truthy value, 'HGNC', after being passed through
        standardize_agent_name with standardize_refs=True.
    """
    db_refs = {'TEXT': raw_name, 'EGID': entrez_id}
    # Lazy %-args: the message is only formatted if debug logging is enabled
    logger.debug('Looking up grounding data for Entrez #%s', entrez_id)
    hgnc_id = hgc.get_hgnc_from_entrez(entrez_id)
    if hgnc_id:
        db_refs['HGNC'] = hgnc_id
    agent = Agent(raw_name, db_refs=db_refs)
    standardize_agent_name(agent, standardize_refs=True)
    return agent
def geneways_action_to_indra_statement_type(actiontype, plo):
    """Return INDRA Statement corresponding to Geneways action type.

    Parameters
    ----------
    actiontype : str
        The verb extracted by the Geneways processor. It is matched
        case-insensitively.
    plo : str
        A one character string designating whether Geneways classifies
        this verb as a physical, logical, or other interaction

    Returns
    -------
    statement_generator : callable or None
        If there is no mapping to INDRA statements from this action type
        the value is None. If there is such a mapping, statement_generator
        is a function that takes in the subject agent, object agent, and
        evidence, in that order, and returns an INDRA statement object.
    is_direct : bool
        True if the action type maps onto an INDRA statement ('bind' or
        'phosphorylate'); otherwise True only if plo is 'P'.
    """
    def make_complex(substance1, substance2, evidence):
        return Complex([substance1, substance2], evidence=evidence)

    def make_phosphorylation(substance1, substance2, evidence):
        return Phosphorylation(substance1, substance2, evidence=evidence)

    generators = {
        'bind': make_complex,
        'phosphorylate': make_phosphorylation,
    }
    statement_generator = generators.get(actiontype.lower())

    # Every mapped verb is treated as direct regardless of plo; for
    # unmapped verbs directness falls back to the physical ('P') class.
    is_direct = statement_generator is not None or plo == 'P'
    return (statement_generator, is_direct)