Source code for indra.sources.bel.processor

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import re
import rdflib
import logging
import collections
from requests.utils import unquote
from indra.statements import *
from indra.databases import hgnc_client
from indra.util import read_unicode_csv

# Module-level logger; messages from this module are emitted on the 'bel'
# logging channel.
logger = logging.getLogger('bel')

# SPARQL PREFIX declarations. The queries below are built as
# ``prefixes + <query body>`` so that they can use the short
# belvoc: / belsc: / belns: / rdfs: forms instead of full URIs.
prefixes = """
    PREFIX belvoc: <http://www.openbel.org/vocabulary/>
    PREFIX belsc: <http://www.openbel.org/bel/>
    PREFIX belns: <http://www.openbel.org/bel/namespace/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>"""

def namespace_from_uri(uri):
    """Return the entity namespace encoded in a BEL entity URI.

    The URI is tried against a fixed, ordered list of regular expressions;
    the first one that matches determines the namespace.

    Examples:
        http://www.openbel.org/bel/p_HGNC_RAF1 -> HGNC
        http://www.openbel.org/bel/p_RGD_Raf1 -> RGD
        http://www.openbel.org/bel/p_PFH_MEK1/2_Family -> PFH

    Parameters
    ----------
    uri : str
        The URI of a BEL entity term.

    Returns
    -------
    str or None
        The namespace captured from the URI, or None if no pattern matches.
    """
    uri_patterns = (
        'http://www.openbel.org/bel/[pragm]_([A-Za-z]+)_.*',
        'http://www.openbel.org/bel/[a-z]+_[pr]_([A-Za-z]+)_.*',
        'http://www.openbel.org/bel/[a-z]+_complex_([A-Za-z]+)_.*',
        'http://www.openbel.org/bel/complex_([A-Za-z]+)_.*',
    )
    # Lazily try each pattern in order and stop at the first hit.
    attempts = (re.match(pattern, uri) for pattern in uri_patterns)
    first_hit = next((hit for hit in attempts if hit is not None), None)
    if first_hit is None:
        return None
    return first_hit.group(1)
def term_from_uri(uri):
    """Strip the prepended BEL URI information from a term.

    Parameters
    ----------
    uri : str, rdflib.Literal or None
        The URI (or literal value) to reduce to a bare term.

    Returns
    -------
    str or None
        None if `uri` is None. Otherwise the URL-unquoted part of the URI
        following the first matching BEL prefix, or the input itself
        (stringified if it was a Literal) when no prefix matches.
    """
    if uri is None:
        return None
    # A Literal holding an integer (as returned for modification positions)
    # has to come back as a string, not as an integer.
    if isinstance(uri, rdflib.Literal):
        uri = str(uri.toPython())
    # Both of these forms occur:
    #   http://www.openbel.org/bel/namespace//MAPK%20Erk1/3%20Family
    #   http://www.openbel.org/bel/namespace/MAPK%20Erk1/3%20Family
    # The double-slash variants must be tried first, and the more specific
    # prefixes before the generic one, so the order here matters.
    prefix_patterns = (
        'http://www.openbel.org/bel/namespace//(.*)',
        'http://www.openbel.org/vocabulary//(.*)',
        'http://www.openbel.org/bel//(.*)',
        'http://www.openbel.org/bel/namespace/(.*)',
        'http://www.openbel.org/vocabulary/(.*)',
        'http://www.openbel.org/bel/(.*)',
    )
    for pattern in prefix_patterns:
        found = re.match(pattern, uri)
        if found is None:
            continue
        return unquote(found.group(1))
    # Nothing matched: the input is already a simple term, for instance a
    # site such as "341" or a substitution such as "sub(V,600,E)".
    return uri
def strip_statement(uri):
    """Remove the BEL base and vocabulary URI prefixes from a string.

    Every occurrence of ``http://www.openbel.org/bel/`` and of
    ``http://www.openbel.org/vocabulary/`` is deleted, in that order.

    Parameters
    ----------
    uri : str
        A statement (or term) URI.

    Returns
    -------
    str
        The input with both URI prefixes removed.
    """
    for uri_prefix in (r'http://www.openbel.org/bel/',
                       r'http://www.openbel.org/vocabulary/'):
        uri = uri.replace(uri_prefix, '')
    return uri
class BelProcessor(object):
    """The BelProcessor extracts INDRA Statements from a BEL RDF model.

    Parameters
    ----------
    g : rdflib.Graph
        An RDF graph object containing the BEL model.

    Attributes
    ----------
    g : rdflib.Graph
        An RDF graph object containing the BEL model.
    statements : list[indra.statements.Statement]
        A list of extracted INDRA Statements representing direct mechanisms.
        This list should be used for assembly in INDRA.
    indirect_stmts : list[indra.statements.Statement]
        A list of extracted INDRA Statements representing indirect
        mechanisms. This list should be used for assembly or model checking
        in INDRA.
    converted_direct_stmts : list[str]
        A list of all direct BEL statements, as strings, that were converted
        into INDRA Statements.
    converted_indirect_stmts : list[str]
        A list of all indirect BEL statements, as strings, that were
        converted into INDRA Statements.
    degenerate_stmts : list[str]
        A list of degenerate BEL statements, as strings, in the BEL model.
    all_direct_stmts : list[str]
        A list of all BEL statements representing direct interactions, as
        strings, in the BEL model.
    all_indirect_stmts : list[str]
        A list of all BEL statements that represent indirect interactions,
        as strings, in the BEL model.
    """
    def __init__(self, g):
        # The BEL RDF graph that all queries are run against.
        self.g = g
        # Extracted INDRA Statements, split by directness.
        self.statements, self.indirect_stmts = [], []
        # String forms of the BEL statements that were converted.
        self.converted_direct_stmts, self.converted_indirect_stmts = [], []
        # Bookkeeping lists used for the coverage report.
        self.degenerate_stmts = []
        self.all_direct_stmts, self.all_indirect_stmts = [], []
[docs] def get_modifications(self): """Extract INDRA Modification Statements from BEL. Two SPARQL patterns are used for extracting Modifications from BEL: - q_phospho1 assumes that the subject is an AbundanceActivity, which increases/decreases a ModifiedProteinAbundance. Examples: kinaseActivity(proteinAbundance(HGNC:IKBKE)) directlyIncreases proteinAbundance(HGNC:IRF3,proteinModification(P,S,385)) phosphataseActivity(proteinAbundance(HGNC:DUSP4)) directlyDecreases proteinAbundance(HGNC:MAPK1,proteinModification(P,T,185)) - q_phospho2 assumes that the subject is a ProteinAbundance which increases/decreases a ModifiedProteinAbundance. Examples: proteinAbundance(HGNC:NGF) increases proteinAbundance(HGNC:NFKBIA,proteinModification(P,Y,42)) proteinAbundance(HGNC:FGF1) decreases proteinAbundance(HGNC:RB1,proteinModification(P)) """ # Get statements where the subject is an activity q_phospho1 = prefixes + """ SELECT ?enzName ?substrateName ?mod ?pos ?stmt ?enzyme ?substrate ?rel WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?object . ?subject a belvoc:AbundanceActivity . ?subject belvoc:hasChild ?enzyme . ?enzyme a belvoc:ProteinAbundance . ?enzyme belvoc:hasConcept ?enzName . ?object a belvoc:ModifiedProteinAbundance . ?object belvoc:hasModificationType ?mod . ?object belvoc:hasChild ?substrate . ?substrate belvoc:hasConcept ?substrateName . OPTIONAL { ?object belvoc:hasModificationPosition ?pos . } } """ # Get statements where the subject is a protein abundance q_phospho2 = prefixes + """ SELECT ?enzName ?substrateName ?mod ?pos ?stmt ?enzyme ?substrate ?rel WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?enzyme . ?stmt belvoc:hasObject ?object . ?enzyme a belvoc:ProteinAbundance . ?enzyme belvoc:hasConcept ?enzName . ?object a belvoc:ModifiedProteinAbundance . ?object belvoc:hasModificationType ?mod . ?object belvoc:hasChild ?substrate . 
?substrate belvoc:hasConcept ?substrateName . OPTIONAL { ?object belvoc:hasModificationPosition ?pos . } } """ for q_phospho in (q_phospho1, q_phospho2): # Run the query res_phospho = self.g.query(q_phospho) for stmt in res_phospho: # Parse out the elements of the query evidence = self._get_evidence(stmt[4]) enz = self._get_agent(stmt[0], stmt[5]) #act_type = name_from_uri(stmt[1]) sub = self._get_agent(stmt[1], stmt[6]) mod = term_from_uri(stmt[2]) residue = self._get_residue(mod) mod_pos = term_from_uri(stmt[3]) stmt_str = strip_statement(stmt[4]) # Get the relationship (increases/decreases, etc.) rel = term_from_uri(stmt[7]) if rel == 'DirectlyIncreases' or rel == 'DirectlyDecreases': is_direct = True else: is_direct = False # Build the INDRA statement # Handle PhosphorylationSerine, etc. if mod.startswith('Phosphorylation'): modtype = 'phosphorylation' else: modtype = mod.lower() # Get the class and invert if needed modclass = modtype_to_modclass[modtype] if rel == 'DirectlyDecreases' or rel == 'Decreases': modclass = modclass_to_inverse[modclass] stmt = modclass(enz, sub, residue, mod_pos, evidence) if is_direct: self.statements.append(stmt) self.converted_direct_stmts.append(stmt_str) else: self.converted_indirect_stmts.append(stmt_str) self.indirect_stmts.append(stmt) return
[docs] def get_composite_activating_mods(self): """Extract INDRA ActiveForm Statements with multiple mods from BEL. The SPARQL pattern used for extraction from BEL looks for a CompositeAbundance as subject where two constituents of the composite are both ModifiedProteinAbundances. The object has to be a Activity of a ProteinAbundance. Examples: compositeAbundance( proteinAbundance(PFH:"AKT Family",proteinModification(P,S,473)), proteinAbundance(PFH:"AKT Family",proteinModification(P,T,308))) directlyIncreases kinaseActivity(proteinAbundance(PFH:"AKT Family")) """ # To eliminate multiple matches, we use pos1 < pos2 but this will # only work if the pos is given, otherwise multiple matches of # the same mod combination may appear in the result q_mods = prefixes + """ SELECT ?speciesName ?actType ?mod1 ?pos1 ?mod2 ?pos2 ?rel ?stmt ?species WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?object . ?object belvoc:hasActivityType ?actType . ?object belvoc:hasChild ?species . ?species a belvoc:ProteinAbundance . ?species belvoc:hasConcept ?speciesName . ?subject a belvoc:CompositeAbundance . ?subject belvoc:hasChild ?subject1 . ?subject1 a belvoc:ModifiedProteinAbundance . ?subject1 belvoc:hasModificationType ?mod1 . ?subject1 belvoc:hasChild ?species . ?subject belvoc:hasChild ?subject2 . ?subject2 a belvoc:ModifiedProteinAbundance . ?subject2 belvoc:hasModificationType ?mod2 . ?subject2 belvoc:hasChild ?species . OPTIONAL { ?subject1 belvoc:hasModificationPosition ?pos1 . } OPTIONAL { ?subject2 belvoc:hasModificationPosition ?pos2 . 
} FILTER ((?rel = belvoc:DirectlyIncreases || ?rel = belvoc:DirectlyDecreases) && ?pos1 < ?pos2) } """ # Now make the PySB for the phosphorylation res_mods = self.g.query(q_mods) for stmt in res_mods: evidence = self._get_evidence(stmt[7]) # Parse out the elements of the query species = self._get_agent(stmt[0], stmt[8]) act_type = term_from_uri(stmt[1]).lower() mod1 = term_from_uri(stmt[2]) mod_pos1 = term_from_uri(stmt[3]) mc1 = self._get_mod_condition(mod1, mod_pos1) mod2 = term_from_uri(stmt[4]) mod_pos2 = term_from_uri(stmt[5]) mc2 = self._get_mod_condition(mod2, mod_pos2) species.mods = [mc1, mc2] rel = term_from_uri(stmt[6]) if rel == 'DirectlyDecreases': is_active = False else: is_active = True stmt_str = strip_statement(stmt[7]) # Mark this as a converted statement self.converted_direct_stmts.append(stmt_str) st = ActiveForm(species, act_type, is_active, evidence) self.statements.append(st)
[docs] def get_activating_mods(self): """Extract INDRA ActiveForm Statements with a single mod from BEL. The SPARQL pattern used for extraction from BEL looks for a ModifiedProteinAbundance as subject and an Activiy of a ProteinAbundance as object. Examples: proteinAbundance(HGNC:INSR,proteinModification(P,Y)) directlyIncreases kinaseActivity(proteinAbundance(HGNC:INSR)) """ q_mods = prefixes + """ SELECT ?speciesName ?actType ?mod ?pos ?rel ?stmt ?species WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?object . ?object belvoc:hasActivityType ?actType . ?object belvoc:hasChild ?species . ?species a belvoc:ProteinAbundance . ?species belvoc:hasConcept ?speciesName . ?subject a belvoc:ModifiedProteinAbundance . ?subject belvoc:hasModificationType ?mod . ?subject belvoc:hasChild ?species . OPTIONAL { ?subject belvoc:hasModificationPosition ?pos . } FILTER (?rel = belvoc:DirectlyIncreases || ?rel = belvoc:DirectlyDecreases) } """ # Now make the PySB for the phosphorylation res_mods = self.g.query(q_mods) for stmt in res_mods: evidence = self._get_evidence(stmt[5]) # Parse out the elements of the query species = self._get_agent(stmt[0], stmt[6]) act_type = term_from_uri(stmt[1]).lower() mod = term_from_uri(stmt[2]) mod_pos = term_from_uri(stmt[3]) mc = self._get_mod_condition(mod, mod_pos) species.mods = [mc] rel = term_from_uri(stmt[4]) if rel == 'DirectlyDecreases': is_active = False else: is_active = True stmt_str = strip_statement(stmt[5]) # Mark this as a converted statement self.converted_direct_stmts.append(stmt_str) st = ActiveForm(species, act_type, is_active, evidence) self.statements.append(st)
[docs] def get_complexes(self): """Extract INDRA Complex Statements from BEL. The SPARQL query used to extract Complexes looks for ComplexAbundance terms and their constituents. This pattern is distinct from other patterns in this processor in that it queries for terms, not full statements. Examples: complexAbundance(proteinAbundance(HGNC:PPARG), proteinAbundance(HGNC:RXRA)) decreases biologicalProcess(MESHPP:"Insulin Resistance") """ q_cmplx = prefixes + """ SELECT ?complexTerm ?childName ?child ?stmt WHERE { { {?stmt belvoc:hasSubject ?complexTerm} UNION {?stmt belvoc:hasObject ?complexTerm .} UNION {?stmt belvoc:hasSubject ?term . ?term belvoc:hasChild ?complexTerm .} UNION {?stmt belvoc:hasObject ?term . ?term belvoc:hasChild ?complexTerm .} } ?complexTerm a belvoc:Term . ?complexTerm a belvoc:ComplexAbundance . ?complexTerm belvoc:hasChild ?child . ?child belvoc:hasConcept ?childName . } """ # Run the query res_cmplx = self.g.query(q_cmplx) # Store the members of each complex in a dict of lists, keyed by the # term for the complex cmplx_dict = collections.defaultdict(list) cmplx_ev = {} for stmt in res_cmplx: stmt_uri = stmt[3] ev = self._get_evidence(stmt_uri) for e in ev: e.epistemics['direct'] = True cmplx_name = term_from_uri(stmt[0]) cmplx_id = stmt_uri + '#' + cmplx_name child = self._get_agent(stmt[1], stmt[2]) cmplx_dict[cmplx_id].append(child) # This might be written multiple times but with the same # evidence cmplx_ev[cmplx_id] = ev # Now iterate over the stored complex information and create binding # statements for cmplx_id, cmplx_list in cmplx_dict.items(): if len(cmplx_list) < 2: msg = 'Complex %s has less than 2 members! Skipping.' % \ cmplx_name logger.warning(msg) else: self.statements.append(Complex(cmplx_list, evidence=cmplx_ev[cmplx_id]))
[docs] def get_activating_subs(self): """Extract INDRA ActiveForm Statements based on a mutation from BEL. The SPARQL pattern used to extract ActiveForms due to mutations look for a ProteinAbundance as a subject which has a child encoding the amino acid substitution. The object of the statement is an ActivityType of the same ProteinAbundance, which is either increased or decreased. Examples: proteinAbundance(HGNC:NRAS,substitution(Q,61,K)) directlyIncreases gtpBoundActivity(proteinAbundance(HGNC:NRAS)) proteinAbundance(HGNC:TP53,substitution(F,134,I)) directlyDecreases transcriptionalActivity(proteinAbundance(HGNC:TP53)) """ q_mods = prefixes + """ SELECT ?enzyme_name ?sub_label ?act_type ?rel ?stmt ?subject WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?object . ?subject a belvoc:ProteinAbundance . ?subject belvoc:hasConcept ?enzyme_name . ?subject belvoc:hasChild ?sub_expr . ?sub_expr rdfs:label ?sub_label . ?object a belvoc:AbundanceActivity . ?object belvoc:hasActivityType ?act_type . ?object belvoc:hasChild ?enzyme . ?enzyme a belvoc:ProteinAbundance . ?enzyme belvoc:hasConcept ?enzyme_name . } """ # Now make the PySB for the phosphorylation res_mods = self.g.query(q_mods) for stmt in res_mods: evidence = self._get_evidence(stmt[4]) # Parse out the elements of the query enz = self._get_agent(stmt[0], stmt[5]) sub_expr = term_from_uri(stmt[1]) act_type = term_from_uri(stmt[2]).lower() # Parse the WT and substituted residues from the node label. # Strangely, the RDF for substituted residue doesn't break the # terms of the BEL expression down into their meaning, as happens # for modified protein abundances. Instead, the substitution # just comes back as a string, e.g., "sub(V,600,E)". This code # parses the arguments back out using a regular expression. 
match = re.match('sub\(([A-Z]),([0-9]*),([A-Z])\)', sub_expr) if match: matches = match.groups() wt_residue = matches[0] position = matches[1] sub_residue = matches[2] else: logger.warning("Could not parse substitution expression %s" % sub_expr) continue mc = MutCondition(position, wt_residue, sub_residue) enz.mutations = [mc] rel = strip_statement(stmt[3]) if rel == 'DirectlyDecreases': is_active = False else: is_active = True stmt_str = strip_statement(stmt[4]) # Mark this as a converted statement self.converted_direct_stmts.append(stmt_str) st = ActiveForm(enz, act_type, is_active, evidence) self.statements.append(st)
[docs] def get_activation(self): """Extract INDRA Inhibition/Activation Statements from BEL. The SPARQL query used to extract Activation Statements looks for patterns in which the subject is is an ActivityType (of a ProtainAbundance) or an Abundance (of a small molecule). The object has to be the ActivityType (typically of a ProteinAbundance) which is either increased or decreased. Examples: abundance(CHEBI:gefitinib) directlyDecreases kinaseActivity(proteinAbundance(HGNC:EGFR)) kinaseActivity(proteinAbundance(HGNC:MAP3K5)) directlyIncreases kinaseActivity(proteinAbundance(HGNC:MAP2K7)) This pattern covers the extraction of Gap/Gef and GtpActivation Statements, which are recognized by the object activty or the subject activity, respectively, being `gtpbound`. Examples: catalyticActivity(proteinAbundance(HGNC:RASA1)) directlyDecreases gtpBoundActivity(proteinAbundance(PFH:"RAS Family")) catalyticActivity(proteinAbundance(HGNC:SOS1)) directlyIncreases gtpBoundActivity(proteinAbundance(HGNC:HRAS)) gtpBoundActivity(proteinAbundance(HGNC:HRAS)) directlyIncreases catalyticActivity(proteinAbundance(HGNC:TIAM1)) """ q_stmts = prefixes + """ SELECT ?subjName ?subjActType ?rel ?objName ?objActType ?stmt ?subj ?obj WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subj . {?subj belvoc:hasActivityType ?subjActType . ?subj belvoc:hasChild ?subjProt . ?subjProt belvoc:hasConcept ?subjName .} UNION {?subj a belvoc:Abundance . ?subj belvoc:hasConcept ?subjName .} ?stmt belvoc:hasObject ?obj . ?obj belvoc:hasActivityType ?objActType . ?obj belvoc:hasChild ?objProt . ?objProt belvoc:hasConcept ?objName . 
FILTER (?rel = belvoc:DirectlyIncreases || ?rel = belvoc:DirectlyDecreases) } """ res_stmts = self.g.query(q_stmts) for stmt in res_stmts: evidence = self._get_evidence(stmt[5]) subj = self._get_agent(stmt[0], stmt[6]) subj_activity = stmt[1] if subj_activity: subj_activity = term_from_uri(stmt[1]).lower() subj.activity = ActivityCondition(subj_activity, True) rel = term_from_uri(stmt[2]) if rel == 'DirectlyDecreases': is_activation = False else: is_activation = True obj = self._get_agent(stmt[3], stmt[7]) obj_activity = term_from_uri(stmt[4]).lower() stmt_str = strip_statement(stmt[5]) # Mark this as a converted statement self.converted_direct_stmts.append(stmt_str) # Distinguish the case when the activator is a GTPase # (since this may involve unique and stereotyped mechanisms) if subj_activity == 'gtpbound': if not is_activation: logger.warning('GtpActivation only handles positive ' 'activation.') continue self.statements.append( GtpActivation(subj, obj, obj_activity, evidence)) # If the object is a GTPase, and the subject *increases* # its GtpBound activity, then the subject is a GEF elif obj_activity == 'gtpbound' and rel == 'DirectlyIncreases': self.statements.append( Gef(subj, obj, evidence)) # If the object is a GTPase, and the subject *decreases* # its GtpBound activity, then the subject is a GAP elif obj_activity == 'gtpbound' and rel == 'DirectlyDecreases': self.statements.append( Gap(subj, obj, evidence)) # Otherwise, create a generic Activity->Activity statement else: if rel == 'DirectlyDecreases': st = Inhibition(subj, obj, obj_activity, evidence) else: st = Activation(subj, obj, obj_activity, evidence) self.statements.append(st)
[docs] def get_transcription(self): """Extract Increase/DecreaseAmount INDRA Statements from BEL. Three distinct SPARQL patterns are used to extract amount regulations from BEL. - q_tscript1 searches for a subject which is a Transcription ActivityType of a ProteinAbundance and an object which is an RNAAbundance that is either increased or decreased. Examples: transcriptionalActivity(proteinAbundance(HGNC:FOXP2)) directlyIncreases rnaAbundance(HGNC:SYK) transcriptionalActivity(proteinAbundance(HGNC:FOXP2)) directlyDecreases rnaAbundance(HGNC:CALCRL) - q_tscript2 searches for a subject which is a ProteinAbundance and an object which is an RNAAbundance. Note that this pattern typically exists in an indirect form (i.e. increases/decreases). Example: proteinAbundance(HGNC:MTF1) directlyIncreases rnaAbundance(HGNC:LCN1) - q_tscript3 searches for a subject which is a ModifiedProteinAbundance, with an object which is an RNAAbundance. In the BEL large corpus, this pattern is found for subjects which are protein families or mouse/rat proteins, and the predicate in an indirect increase. Example: proteinAbundance(PFR:"Akt Family",proteinModification(P)) increases rnaAbundance(RGD:Cald1) """ q_tscript1 = prefixes + """ SELECT ?tfName ?targetName ?stmt ?tf ?target ?rel WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?target . ?subject a belvoc:AbundanceActivity . ?subject belvoc:hasActivityType belvoc:Transcription . ?subject belvoc:hasChild ?tf . ?tf a belvoc:ProteinAbundance . ?tf belvoc:hasConcept ?tfName . ?target a belvoc:RNAAbundance . ?target belvoc:hasConcept ?targetName . } """ q_tscript2 = prefixes + """ SELECT ?tfName ?targetName ?stmt ?tf ?target ?rel WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?tf . ?stmt belvoc:hasObject ?target . ?tf a belvoc:ProteinAbundance . ?tf belvoc:hasConcept ?tfName . ?target a belvoc:RNAAbundance . 
?target belvoc:hasConcept ?targetName . } """ q_tscript3 = prefixes + """ SELECT ?tfName ?targetName ?stmt ?tf ?target ?rel ?mod ?pos WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?target . ?subject a belvoc:ModifiedProteinAbundance . ?subject belvoc:hasModificationType ?mod . ?subject belvoc:hasChild ?tf . ?tf belvoc:hasConcept ?tfName . ?target a belvoc:RNAAbundance . ?target belvoc:hasConcept ?targetName . OPTIONAL { ?subject belvoc:hasModificationPosition ?pos . } } """ for q_tscript in (q_tscript1, q_tscript2, q_tscript3): res_tscript = self.g.query(q_tscript) for stmt in res_tscript: # Get modifications on the subject, if any if q_tscript == q_tscript1: tf = self._get_agent(stmt[0], stmt[3]) tf.activity = ActivityCondition('transcription', True) elif q_tscript == q_tscript3: mod = term_from_uri(stmt[6]) mod_pos = term_from_uri(stmt[7]) mc = self._get_mod_condition(mod, mod_pos) if mc is None: continue tf = self._get_agent(stmt[0], stmt[3]) tf.mods = mods=[mc] else: tf = self._get_agent(stmt[0], stmt[3]) # Parse out the elements of the query evidence = self._get_evidence(stmt[2]) target = self._get_agent(stmt[1], stmt[4]) stmt_str = strip_statement(stmt[2]) # Get the relationship (increases/decreases, etc.) rel = term_from_uri(stmt[5]) if rel == 'DirectlyIncreases' or rel == 'DirectlyDecreases': is_direct = True else: is_direct = False # Build the INDRA statement stmt = None if rel == 'DirectlyIncreases' or rel == 'Increases': stmt = IncreaseAmount(tf, target, evidence) elif rel == 'DirectlyDecreases' or rel == 'Decreases': stmt = DecreaseAmount(tf, target, evidence) # If we've matched a pattern, mark this as a converted statement if stmt is not None: if is_direct: self.statements.append(stmt) self.converted_direct_stmts.append(stmt_str) else: self.indirect_stmts.append(stmt) self.converted_indirect_stmts.append(stmt_str)
[docs] def get_conversions(self): """Extract Conversion INDRA Statements from BEL. The SPARQL query used to extract Conversions searches for a subject (controller) which is an AbundanceActivity which directlyIncreases a Reaction with a given list of Reactants and Products. Examples: catalyticActivity(proteinAbundance(HGNC:HMOX1)) directlyIncreases reaction(reactants(abundance(CHEBI:heme)), products(abundance(SCHEM:Biliverdine), abundance(CHEBI:"carbon monoxide"))) """ query = prefixes + """ SELECT DISTINCT ?controller ?controllerName ?controllerActivity ?product ?productName ?reactant ?reactantName ?stmt WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasRelationship ?rel . ?stmt belvoc:hasSubject ?subject . ?stmt belvoc:hasObject ?rxn . ?subject a belvoc:AbundanceActivity . ?subject belvoc:hasActivityType ?controllerActivity . ?subject belvoc:hasChild ?controller . ?controller belvoc:hasConcept ?controllerName . ?rxn a belvoc:Reaction . ?rxn belvoc:hasChild ?reactants . ?reactants rdfs:label ?reactLabel . FILTER (regex(?reactLabel, "^reactants.*")) ?rxn belvoc:hasChild ?products . ?products rdfs:label ?prodLabel . FILTER (regex(?prodLabel, "^products.*")) ?reactants belvoc:hasChild ?reactant . ?products belvoc:hasChild ?product . ?reactant belvoc:hasConcept ?reactantName . ?product belvoc:hasConcept ?productName . 
} """ res = self.g.query(query) # We need to collect all pieces of the same statement so that we can # collect multiple reactants and products stmt_map = collections.defaultdict(list) for stmt in res: stmt_map[stmt[-1]].append(stmt) for stmts in stmt_map.values(): # First we get the shared part of the Statement stmt = stmts[0] subj = self._get_agent(stmt[1], stmt[0]) evidence = self._get_evidence(stmt[-1]) stmt_str = strip_statement(stmt[-1]) # Now we collect the participants obj_from_map = {} obj_to_map = {} for stmt in stmts: reactant_name = stmt[6] product_name = stmt[4] if reactant_name not in obj_from_map: obj_from_map[reactant_name] = \ self._get_agent(stmt[6], stmt[5]) if product_name not in obj_to_map: obj_to_map[product_name] = \ self._get_agent(stmt[4], stmt[3]) obj_from = list(obj_from_map.values()) obj_to = list(obj_to_map.values()) st = Conversion(subj, obj_from, obj_to, evidence=evidence) # If we've matched a pattern, mark this as a converted statement self.statements.append(st) self.converted_direct_stmts.append(stmt_str)
[docs] def get_all_direct_statements(self): """Get all directlyIncreases/Decreases BEL statements. This method stores the results of the query in self.all_direct_stmts as a list of strings. The SPARQL query used to find direct BEL statements searches for all statements whose predicate is either DirectyIncreases or DirectlyDecreases. """ logger.info("Getting all direct statements...\n") q_stmts = prefixes + """ SELECT ?stmt WHERE { ?stmt a belvoc:Statement . { { ?stmt belvoc:hasRelationship belvoc:DirectlyIncreases . } UNION { ?stmt belvoc:hasRelationship belvoc:DirectlyDecreases . } } } """ res_stmts = self.g.query(q_stmts) self.all_direct_stmts = [strip_statement(stmt[0]) for stmt in res_stmts]
[docs] def get_all_indirect_statements(self): """Get all indirect increases/decreases BEL statements. This method stores the results of the query in self.all_indirect_stmts as a list of strings. The SPARQL query used to find indirect BEL statements searches for all statements whose predicate is either Increases or Decreases. """ q_stmts = prefixes + """ SELECT ?stmt WHERE { ?stmt a belvoc:Statement . { { ?stmt belvoc:hasRelationship belvoc:Increases . } UNION { ?stmt belvoc:hasRelationship belvoc:Decreases . } } } """ res_stmts = self.g.query(q_stmts) self.all_indirect_stmts = [strip_statement(stmt[0]) for stmt in res_stmts]
[docs] def get_degenerate_statements(self): """Get all degenerate BEL statements. Stores the results of the query in self.degenerate_stmts. """ logger.info("Checking for 'degenerate' statements...\n") # Get rules of type protein X -> activity Y q_stmts = prefixes + """ SELECT ?stmt WHERE { ?stmt a belvoc:Statement . ?stmt belvoc:hasSubject ?subj . ?stmt belvoc:hasObject ?obj . { { ?stmt belvoc:hasRelationship belvoc:DirectlyIncreases . } UNION { ?stmt belvoc:hasRelationship belvoc:DirectlyDecreases . } } { { ?subj a belvoc:ProteinAbundance . } UNION { ?subj a belvoc:ModifiedProteinAbundance . } } ?subj belvoc:hasConcept ?xName . { { ?obj a belvoc:ProteinAbundance . ?obj belvoc:hasConcept ?yName . } UNION { ?obj a belvoc:ModifiedProteinAbundance . ?obj belvoc:hasChild ?proteinY . ?proteinY belvoc:hasConcept ?yName . } UNION { ?obj a belvoc:AbundanceActivity . ?obj belvoc:hasChild ?objChild . ?objChild a belvoc:ProteinAbundance . ?objChild belvoc:hasConcept ?yName . } } FILTER (?xName != ?yName) } """ res_stmts = self.g.query(q_stmts) logger.info("Protein -> Protein/Activity statements:") logger.info("---------------------------------------") for stmt in res_stmts: stmt_str = strip_statement(stmt[0]) logger.info(stmt_str) self.degenerate_stmts.append(stmt_str)
[docs] def print_statement_coverage(self): """Display how many of the direct statements have been converted. Also prints how many are considered 'degenerate' and not converted.""" if not self.all_direct_stmts: self.get_all_direct_statements() if not self.degenerate_stmts: self.get_degenerate_statements() if not self.all_indirect_stmts: self.get_all_indirect_statements() logger.info('') logger.info("Total indirect statements: %d" % len(self.all_indirect_stmts)) logger.info("Converted indirect statements: %d" % len(self.converted_indirect_stmts)) logger.info(">> Unhandled indirect statements: %d" % (len(self.all_indirect_stmts) - len(self.converted_indirect_stmts))) logger.info('') logger.info("Total direct statements: %d" % len(self.all_direct_stmts)) logger.info("Converted direct statements: %d" % len(self.converted_direct_stmts)) logger.info("Degenerate direct statements: %d" % len(self.degenerate_stmts)) logger.info(">> Unhandled direct statements: %d" % (len(self.all_direct_stmts) - len(self.converted_direct_stmts) - len(self.degenerate_stmts))) logger.info('') logger.info("--- Unhandled direct statements ---------") for stmt in self.all_direct_stmts: if not (stmt in self.converted_direct_stmts or stmt in self.degenerate_stmts): logger.info(stmt) logger.info('') logger.info("--- Unhandled indirect statements ---------") for stmt in self.all_indirect_stmts: if not (stmt in self.converted_indirect_stmts or stmt in self.degenerate_stmts): logger.info(stmt)
[docs] def print_statements(self): """Print all extracted INDRA Statements.""" logger.info('--- Direct INDRA statements ----------') for i, stmt in enumerate(self.statements): logger.info("%s: %s" % (i, stmt)) logger.info('--- Indirect INDRA statements ----------') for i, stmt in enumerate(self.indirect_stmts): logger.info("%s: %s" % (i, stmt))
@staticmethod def _get_agent(concept, entity): name = term_from_uri(concept) namespace = namespace_from_uri(entity) db_refs = {} if namespace == 'HGNC': agent_name = name hgnc_id = hgnc_client.get_hgnc_id(name) if hgnc_id is not None: db_refs['HGNC'] = str(hgnc_id) up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: db_refs['UP'] = up_id else: logger.warning('HGNC entity %s with HGNC ID %s has no ' 'corresponding Uniprot ID.' % (name, hgnc_id)) else: logger.warning("Couldn't get HGNC ID for HGNC symbol %s" % name) elif namespace in ('MGI', 'RGD'): agent_name = name db_refs[namespace] = name elif namespace in ('PFH', 'SFAM'): indra_name = bel_to_indra.get(name) db_refs[namespace] = name if indra_name is None: agent_name = name msg = 'Could not find mapping for BEL family: %s' % name logger.warning(msg) else: db_refs['BE'] = indra_name db_refs['TEXT'] = name agent_name = indra_name elif namespace in ('NCH', 'SCOMP'): indra_name = bel_to_indra.get(name) db_refs[namespace] = name if indra_name is None: agent_name = name msg = 'Could not find mapping for BEL complex: %s' % name logger.warning(msg) else: db_refs['BE'] = indra_name db_refs['TEXT'] = name agent_name = indra_name elif namespace == 'CHEBI': chebi_id = chebi_name_id.get(name) if chebi_id: db_refs['CHEBI'] = chebi_id else: logger.warning('CHEBI name %s not found in map.' % name) agent_name = name elif namespace == 'EGID': hgnc_id = hgnc_client.get_hgnc_from_entrez(name) db_refs['EGID'] = name if hgnc_id is not None: db_refs['HGNC'] = str(hgnc_id) agent_name = hgnc_client.get_hgnc_name(hgnc_id) up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: db_refs['UP'] = up_id else: logger.warning('HGNC entity %s with HGNC ID %s has no ' 'corresponding Uniprot ID.' % (name, hgnc_id)) else: logger.warning('Could not map EGID%s to HGNC.' 
% name) agent_name = 'E%s' % name else: logger.warning('Unhandled entity namespace: %s' % namespace) print('%s, %s' % (concept, entity)) agent_name = name agent = Agent(agent_name, db_refs=db_refs) return agent def _get_evidence(self, statement): evidence = None citation = None annotations = [] # Query for all annotations of the statement q_annotations = prefixes + """ SELECT ?annotation WHERE { <%s> belvoc:hasEvidence ?evidence . ?evidence belvoc:hasAnnotation ?annotation . } """ % statement.format() res_annotations = self.g.query(q_annotations) for stmt in res_annotations: annotations.append(stmt[0].format()) # Query for evidence text and citation q_evidence = prefixes + """ SELECT ?evidenceText ?citation WHERE { <%s> belvoc:hasEvidence ?evidence . ?evidence belvoc:hasEvidenceText ?evidenceText . ?evidence belvoc:hasCitation ?citation . } """ % statement.format() res_evidence = self.g.query(q_evidence) # Query for directness q_direct = prefixes + """ SELECT ?predicate WHERE { <%s> belvoc:hasRelationship ?predicate . 
} """ % statement.format() res_direct = self.g.query(q_direct) epistemics = {} if res_direct: rel = term_from_uri(list(res_direct)[0][0]) if rel in ('DirectlyDecreases', 'DirectlyIncreases'): epistemics['direct'] = True if rel in ('Decreases', 'Increases'): epistemics['direct'] = False evs = [] for stmt in res_evidence: text = stmt[0].toPython() citation = stmt[1].toPython() if citation is not None: m = re.match('.*pubmed:([0-9]+)', citation) if m is not None: citation = m.groups()[0] ev = Evidence(source_api='bel', source_id=statement, pmid=citation, text=text, annotations=annotations, epistemics=epistemics) evs.append(ev) else: logger.warning('Could not parse citation: %s' % citation) if not evs: evs = [Evidence(source_api='bel', source_id=statement, annotations=annotations, epistemics=epistemics)] return evs @staticmethod def _get_residue(mod): if mod.startswith('Phosphorylation'): if mod == 'Phosphorylation': residue = None else: residue = mod[15:].lower() residue = get_valid_residue(residue) else: residue = None return residue @staticmethod def _get_mod_condition(mod, mod_pos): if not mod: return None if mod.startswith('Phosphorylation'): mc = ModCondition('phosphorylation') else: mc = ModCondition(mod.lower()) mc.residue = BelProcessor._get_residue(mod) mc.position = mod_pos return mc
def _build_bioentities_map():
    """Load the mapping from BEL family/complex names to INDRA names.

    Reads the tab-separated ``bioentities_map.tsv`` resource file and keeps
    the rows whose first column is 'BEL'.

    Returns
    -------
    dict
        Maps the second column (the BEL entry) to the third column (the
        INDRA name).
    """
    here = os.path.dirname(os.path.abspath(__file__))
    map_path = os.path.join(here, '../../resources/bioentities_map.tsv')
    mapping = {}
    for row in read_unicode_csv(map_path, delimiter='\t'):
        namespace, entry, indra_name = row[0], row[1], row[2]
        if namespace != 'BEL':
            continue
        mapping[entry] = indra_name
    return mapping


def _build_chebi_map():
    """Load the mapping from BEL ChEBI names to ChEBI IDs.

    Reads the tab-separated ``bel_chebi_map.tsv`` resource file.

    Returns
    -------
    dict
        Maps the first column (the ChEBI name) to the second column (the
        ChEBI ID).
    """
    here = os.path.dirname(os.path.abspath(__file__))
    map_path = os.path.join(here, '../../resources/bel_chebi_map.tsv')
    return {row[0]: row[1]
            for row in read_unicode_csv(map_path, delimiter='\t')}


# Both lookup tables are loaded once, when the module is imported.
bel_to_indra = _build_bioentities_map()
chebi_name_id = _build_chebi_map()