from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import re
import rdflib
import logging
import collections
from requests.utils import unquote
from indra.statements import *
from indra.databases import hgnc_client
from indra.util import read_unicode_csv
# Module-level logger; messages from this module are emitted under the
# logger name 'bel'.
logger = logging.getLogger('bel')
# SPARQL PREFIX declarations. The query strings below are built by
# concatenating a SELECT clause onto this header, so the belvoc/belsc/belns/
# rdfs prefixes can be used in them.
prefixes = """
PREFIX belvoc: <http://www.openbel.org/vocabulary/>
PREFIX belsc: <http://www.openbel.org/bel/>
PREFIX belns: <http://www.openbel.org/bel/namespace/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>"""
def namespace_from_uri(uri):
    """Return the entity namespace from the URI.

    Examples:
    http://www.openbel.org/bel/p_HGNC_RAF1 -> HGNC
    http://www.openbel.org/bel/p_RGD_Raf1 -> RGD
    http://www.openbel.org/bel/p_PFH_MEK1/2_Family -> PFH

    Parameters
    ----------
    uri : str
        The URI of a BEL term.

    Returns
    -------
    str or None
        The namespace captured from the URI, or None if `uri` is None or
        matches none of the known URI patterns.
    """
    if uri is None:
        return None
    # The patterns are tried in order and the first one that matches wins.
    patterns = ['http://www.openbel.org/bel/[pragm]_([A-Za-z]+)_.*',
                'http://www.openbel.org/bel/[a-z]+_[pr]_([A-Za-z]+)_.*',
                'http://www.openbel.org/bel/[a-z]+_complex_([A-Za-z]+)_.*',
                'http://www.openbel.org/bel/complex_([A-Za-z]+)_.*']
    for pattern in patterns:
        match = re.match(pattern, uri)
        if match is not None:
            return match.group(1)
    return None
def term_from_uri(uri):
    """Remove prepended URI information from terms.

    Parameters
    ----------
    uri : str, rdflib.Literal or None
        A URI under one of the openbel.org prefixes, an RDF literal, or a
        simple term.

    Returns
    -------
    str or None
        None if `uri` is None. If `uri` starts with one of the known
        openbel.org prefixes, the URL-unquoted remainder after the prefix.
        Otherwise the input itself (converted to a string if it was an
        rdflib.Literal).
    """
    if uri is None:
        return None
    # This ensures that if we get a Literal with an integer value (as we
    # do for modification positions), it will get converted to a string,
    # not an integer.
    if isinstance(uri, rdflib.Literal):
        uri = str(uri.toPython())
    # This is to handle URIs like
    # http://www.openbel.org/bel/namespace//MAPK%20Erk1/3%20Family
    # or
    # http://www.openbel.org/bel/namespace/MAPK%20Erk1/3%20Family
    # In the current implementation, the order of the patterns
    # matters: the double-slash variants have to be tried first, and the
    # more specific prefixes before the generic /bel/ one.
    patterns = ['http://www.openbel.org/bel/namespace//(.*)',
                'http://www.openbel.org/vocabulary//(.*)',
                'http://www.openbel.org/bel//(.*)',
                'http://www.openbel.org/bel/namespace/(.*)',
                'http://www.openbel.org/vocabulary/(.*)',
                'http://www.openbel.org/bel/(.*)']
    for pattern in patterns:
        match = re.match(pattern, uri)
        if match is not None:
            return unquote(match.group(1))
    # If none of the patterns match then the URI is actually a simple term
    # for instance a site: "341" or a substitution: "sub(V,600,E)"
    return uri
def strip_statement(uri):
    """Return the URI with the BEL and BEL vocabulary URL prefixes removed.

    Parameters
    ----------
    uri : str
        A URI string, typically that of a BEL statement or relationship.

    Returns
    -------
    str
        The input with every occurrence of 'http://www.openbel.org/bel/'
        and 'http://www.openbel.org/vocabulary/' deleted.
    """
    uri = uri.replace(r'http://www.openbel.org/bel/', '')
    uri = uri.replace(r'http://www.openbel.org/vocabulary/', '')
    return uri
class BelProcessor(object):
    """The BelProcessor extracts INDRA Statements from a BEL RDF model.

    Parameters
    ----------
    g : rdflib.Graph
        An RDF graph object containing the BEL model.

    Attributes
    ----------
    g : rdflib.Graph
        An RDF graph object containing the BEL model.
    statements : list[indra.statements.Statement]
        A list of extracted INDRA Statements representing direct mechanisms.
        This list should be used for assembly in INDRA.
    indirect_stmts : list[indra.statements.Statement]
        A list of extracted INDRA Statements representing indirect mechanisms.
        This list should be used for assembly or model checking in INDRA.
    converted_direct_stmts : list[str]
        A list of all direct BEL statements, as strings, that were converted
        into INDRA Statements.
    converted_indirect_stmts : list[str]
        A list of all indirect BEL statements, as strings, that were converted
        into INDRA Statements.
    degenerate_stmts : list[str]
        A list of degenerate BEL statements, as strings, in the BEL model.
    all_direct_stmts : list[str]
        A list of all BEL statements representing direct interactions,
        as strings, in the BEL model.
    all_indirect_stmts : list[str]
        A list of all BEL statements that represent indirect interactions,
        as strings, in the BEL model.
    """
    def __init__(self, g):
        self.g = g
        self.statements = []
        self.indirect_stmts = []
        self.converted_direct_stmts = []
        self.converted_indirect_stmts = []
        self.degenerate_stmts = []
        self.all_direct_stmts = []
        self.all_indirect_stmts = []

    def get_modifications(self):
        """Extract INDRA Modification Statements from BEL.

        Two SPARQL patterns are used for extracting Modifications from BEL:

        - q_phospho1 assumes that the subject is an AbundanceActivity, which
          increases/decreases a ModifiedProteinAbundance.

          Examples:

              kinaseActivity(proteinAbundance(HGNC:IKBKE))
              directlyIncreases
              proteinAbundance(HGNC:IRF3,proteinModification(P,S,385))

              phosphataseActivity(proteinAbundance(HGNC:DUSP4))
              directlyDecreases
              proteinAbundance(HGNC:MAPK1,proteinModification(P,T,185))

        - q_phospho2 assumes that the subject is a ProteinAbundance which
          increases/decreases a ModifiedProteinAbundance.

          Examples:

              proteinAbundance(HGNC:NGF) increases
              proteinAbundance(HGNC:NFKBIA,proteinModification(P,Y,42))

              proteinAbundance(HGNC:FGF1) decreases
              proteinAbundance(HGNC:RB1,proteinModification(P))

        Statements with a DirectlyIncreases/DirectlyDecreases relationship
        are added to self.statements, all others to self.indirect_stmts.
        Results whose modification type has no entry in modtype_to_modclass
        are skipped with a warning.
        """
        # Get statements where the subject is an activity
        q_phospho1 = prefixes + """
            SELECT ?enzName ?substrateName ?mod ?pos
                   ?stmt ?enzyme ?substrate ?rel
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?object .
                ?subject a belvoc:AbundanceActivity .
                ?subject belvoc:hasChild ?enzyme .
                ?enzyme a belvoc:ProteinAbundance .
                ?enzyme belvoc:hasConcept ?enzName .
                ?object a belvoc:ModifiedProteinAbundance .
                ?object belvoc:hasModificationType ?mod .
                ?object belvoc:hasChild ?substrate .
                ?substrate belvoc:hasConcept ?substrateName .
                OPTIONAL { ?object belvoc:hasModificationPosition ?pos . }
            }
            """
        # Get statements where the subject is a protein abundance
        q_phospho2 = prefixes + """
            SELECT ?enzName ?substrateName ?mod ?pos
                   ?stmt ?enzyme ?substrate ?rel
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?enzyme .
                ?stmt belvoc:hasObject ?object .
                ?enzyme a belvoc:ProteinAbundance .
                ?enzyme belvoc:hasConcept ?enzName .
                ?object a belvoc:ModifiedProteinAbundance .
                ?object belvoc:hasModificationType ?mod .
                ?object belvoc:hasChild ?substrate .
                ?substrate belvoc:hasConcept ?substrateName .
                OPTIONAL { ?object belvoc:hasModificationPosition ?pos . }
            }
            """
        for q_phospho in (q_phospho1, q_phospho2):
            # Run the query
            for row in self.g.query(q_phospho):
                # Parse out the elements of the query
                evidence = self._get_evidence(row[4])
                enz = self._get_agent(row[0], row[5])
                sub = self._get_agent(row[1], row[6])
                mod = term_from_uri(row[2])
                residue = self._get_residue(mod)
                mod_pos = term_from_uri(row[3])
                stmt_str = strip_statement(row[4])
                # Get the relationship (increases/decreases, etc.)
                # NOTE(review): the queries do not filter on ?rel, so any
                # relationship other than the two "directly" ones is treated
                # as indirect.
                rel = term_from_uri(row[7])
                is_direct = rel in ('DirectlyIncreases', 'DirectlyDecreases')
                # Handle PhosphorylationSerine, etc.
                if mod.startswith('Phosphorylation'):
                    modtype = 'phosphorylation'
                else:
                    modtype = mod.lower()
                # Get the class and invert if needed
                modclass = modtype_to_modclass.get(modtype)
                if modclass is None:
                    logger.warning('Unhandled modification type: %s', mod)
                    continue
                if rel in ('DirectlyDecreases', 'Decreases'):
                    modclass = modclass_to_inverse[modclass]
                # Build the INDRA statement
                st = modclass(enz, sub, residue, mod_pos, evidence)
                if is_direct:
                    self.statements.append(st)
                    self.converted_direct_stmts.append(stmt_str)
                else:
                    self.converted_indirect_stmts.append(stmt_str)
                    self.indirect_stmts.append(st)

    def get_composite_activating_mods(self):
        """Extract INDRA ActiveForm Statements with multiple mods from BEL.

        The SPARQL pattern used for extraction from BEL looks for a
        CompositeAbundance as subject where two constituents of the composite
        are both ModifiedProteinAbundances. The object has to be an
        Activity of a ProteinAbundance.

        Examples:

            compositeAbundance(
            proteinAbundance(PFH:"AKT Family",proteinModification(P,S,473)),
            proteinAbundance(PFH:"AKT Family",proteinModification(P,T,308)))
            directlyIncreases
            kinaseActivity(proteinAbundance(PFH:"AKT Family"))
        """
        # To eliminate multiple matches, we use pos1 < pos2 but this will
        # only work if the pos is given, otherwise multiple matches of
        # the same mod combination may appear in the result
        q_mods = prefixes + """
            SELECT ?speciesName ?actType ?mod1 ?pos1 ?mod2 ?pos2 ?rel ?stmt
                   ?species
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?object .
                ?object belvoc:hasActivityType ?actType .
                ?object belvoc:hasChild ?species .
                ?species a belvoc:ProteinAbundance .
                ?species belvoc:hasConcept ?speciesName .
                ?subject a belvoc:CompositeAbundance .
                ?subject belvoc:hasChild ?subject1 .
                ?subject1 a belvoc:ModifiedProteinAbundance .
                ?subject1 belvoc:hasModificationType ?mod1 .
                ?subject1 belvoc:hasChild ?species .
                ?subject belvoc:hasChild ?subject2 .
                ?subject2 a belvoc:ModifiedProteinAbundance .
                ?subject2 belvoc:hasModificationType ?mod2 .
                ?subject2 belvoc:hasChild ?species .
                OPTIONAL { ?subject1 belvoc:hasModificationPosition ?pos1 . }
                OPTIONAL { ?subject2 belvoc:hasModificationPosition ?pos2 . }
                FILTER ((?rel = belvoc:DirectlyIncreases ||
                         ?rel = belvoc:DirectlyDecreases) &&
                        ?pos1 < ?pos2)
            }
            """
        # Build an ActiveForm Statement for each query result
        for row in self.g.query(q_mods):
            evidence = self._get_evidence(row[7])
            # Parse out the elements of the query
            species = self._get_agent(row[0], row[8])
            act_type = term_from_uri(row[1]).lower()
            mc1 = self._get_mod_condition(term_from_uri(row[2]),
                                          term_from_uri(row[3]))
            mc2 = self._get_mod_condition(term_from_uri(row[4]),
                                          term_from_uri(row[5]))
            species.mods = [mc1, mc2]
            rel = term_from_uri(row[6])
            is_active = (rel != 'DirectlyDecreases')
            stmt_str = strip_statement(row[7])
            # Mark this as a converted statement
            self.converted_direct_stmts.append(stmt_str)
            st = ActiveForm(species, act_type, is_active, evidence)
            self.statements.append(st)

    def get_activating_mods(self):
        """Extract INDRA ActiveForm Statements with a single mod from BEL.

        The SPARQL pattern used for extraction from BEL looks for a
        ModifiedProteinAbundance as subject and an Activity of a
        ProteinAbundance as object.

        Examples:

            proteinAbundance(HGNC:INSR,proteinModification(P,Y))
            directlyIncreases
            kinaseActivity(proteinAbundance(HGNC:INSR))
        """
        q_mods = prefixes + """
            SELECT ?speciesName ?actType ?mod ?pos ?rel ?stmt ?species
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?object .
                ?object belvoc:hasActivityType ?actType .
                ?object belvoc:hasChild ?species .
                ?species a belvoc:ProteinAbundance .
                ?species belvoc:hasConcept ?speciesName .
                ?subject a belvoc:ModifiedProteinAbundance .
                ?subject belvoc:hasModificationType ?mod .
                ?subject belvoc:hasChild ?species .
                OPTIONAL { ?subject belvoc:hasModificationPosition ?pos . }
                FILTER (?rel = belvoc:DirectlyIncreases ||
                        ?rel = belvoc:DirectlyDecreases)
            }
            """
        # Build an ActiveForm Statement for each query result
        for row in self.g.query(q_mods):
            evidence = self._get_evidence(row[5])
            # Parse out the elements of the query
            species = self._get_agent(row[0], row[6])
            act_type = term_from_uri(row[1]).lower()
            mod = term_from_uri(row[2])
            mod_pos = term_from_uri(row[3])
            species.mods = [self._get_mod_condition(mod, mod_pos)]
            rel = term_from_uri(row[4])
            is_active = (rel != 'DirectlyDecreases')
            stmt_str = strip_statement(row[5])
            # Mark this as a converted statement
            self.converted_direct_stmts.append(stmt_str)
            st = ActiveForm(species, act_type, is_active, evidence)
            self.statements.append(st)

    def get_complexes(self):
        """Extract INDRA Complex Statements from BEL.

        The SPARQL query used to extract Complexes looks for ComplexAbundance
        terms and their constituents. This pattern is distinct from other
        patterns in this processor in that it queries for terms, not
        full statements.

        Examples:

            complexAbundance(proteinAbundance(HGNC:PPARG),
            proteinAbundance(HGNC:RXRA))
            decreases
            biologicalProcess(MESHPP:"Insulin Resistance")
        """
        q_cmplx = prefixes + """
            SELECT ?complexTerm ?childName ?child ?stmt
            WHERE {
                {
                    {?stmt belvoc:hasSubject ?complexTerm}
                    UNION
                    {?stmt belvoc:hasObject ?complexTerm .}
                    UNION
                    {?stmt belvoc:hasSubject ?term .
                     ?term belvoc:hasChild ?complexTerm .}
                    UNION
                    {?stmt belvoc:hasObject ?term .
                     ?term belvoc:hasChild ?complexTerm .}
                }
                ?complexTerm a belvoc:Term .
                ?complexTerm a belvoc:ComplexAbundance .
                ?complexTerm belvoc:hasChild ?child .
                ?child belvoc:hasConcept ?childName .
            }
            """
        # Run the query
        res_cmplx = self.g.query(q_cmplx)

        # Store the members of each complex in a dict of lists, keyed by the
        # statement URI combined with the term for the complex
        cmplx_dict = collections.defaultdict(list)
        cmplx_ev = {}
        for row in res_cmplx:
            stmt_uri = row[3]
            ev = self._get_evidence(stmt_uri)
            # The complex term itself is always recorded as direct evidence,
            # whatever the relationship of the enclosing statement is.
            for e in ev:
                e.epistemics['direct'] = True
            cmplx_name = term_from_uri(row[0])
            cmplx_id = stmt_uri + '#' + cmplx_name
            child = self._get_agent(row[1], row[2])
            cmplx_dict[cmplx_id].append(child)
            # This might be written multiple times but with the same
            # evidence
            cmplx_ev[cmplx_id] = ev
        # Now iterate over the stored complex information and create binding
        # statements
        for cmplx_id, cmplx_list in cmplx_dict.items():
            if len(cmplx_list) < 2:
                # Report the complex actually being skipped (not whichever
                # complex name happened to be parsed last in the loop above).
                logger.warning('Complex %s has less than 2 members! '
                               'Skipping.', cmplx_id)
            else:
                self.statements.append(Complex(cmplx_list,
                                               evidence=cmplx_ev[cmplx_id]))

    def get_activating_subs(self):
        """Extract INDRA ActiveForm Statements based on a mutation from BEL.

        The SPARQL pattern used to extract ActiveForms due to mutations looks
        for a ProteinAbundance as a subject which has a child encoding the
        amino acid substitution. The object of the statement is an
        ActivityType of the same ProteinAbundance, which is either increased
        or decreased.

        Examples:

            proteinAbundance(HGNC:NRAS,substitution(Q,61,K))
            directlyIncreases
            gtpBoundActivity(proteinAbundance(HGNC:NRAS))

            proteinAbundance(HGNC:TP53,substitution(F,134,I))
            directlyDecreases
            transcriptionalActivity(proteinAbundance(HGNC:TP53))
        """
        q_mods = prefixes + """
            SELECT ?enzyme_name ?sub_label ?act_type ?rel ?stmt ?subject
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?object .
                ?subject a belvoc:ProteinAbundance .
                ?subject belvoc:hasConcept ?enzyme_name .
                ?subject belvoc:hasChild ?sub_expr .
                ?sub_expr rdfs:label ?sub_label .
                ?object a belvoc:AbundanceActivity .
                ?object belvoc:hasActivityType ?act_type .
                ?object belvoc:hasChild ?enzyme .
                ?enzyme a belvoc:ProteinAbundance .
                ?enzyme belvoc:hasConcept ?enzyme_name .
            }
            """
        # Build an ActiveForm Statement for each query result
        for row in self.g.query(q_mods):
            evidence = self._get_evidence(row[4])
            # Parse out the elements of the query
            enz = self._get_agent(row[0], row[5])
            sub_expr = term_from_uri(row[1])
            act_type = term_from_uri(row[2]).lower()
            # Parse the WT and substituted residues from the node label.
            # Strangely, the RDF for substituted residue doesn't break the
            # terms of the BEL expression down into their meaning, as happens
            # for modified protein abundances. Instead, the substitution
            # just comes back as a string, e.g., "sub(V,600,E)". This code
            # parses the arguments back out using a regular expression.
            match = re.match(r'sub\(([A-Z]),([0-9]*),([A-Z])\)', sub_expr)
            if match is None:
                logger.warning('Could not parse substitution expression %s',
                               sub_expr)
                continue
            wt_residue, position, sub_residue = match.groups()
            enz.mutations = [MutCondition(position, wt_residue, sub_residue)]
            # NOTE(review): the query does not filter on ?rel, so every
            # relationship other than DirectlyDecreases yields an active
            # form that is recorded as a converted direct statement.
            rel = term_from_uri(row[3])
            is_active = (rel != 'DirectlyDecreases')
            stmt_str = strip_statement(row[4])
            # Mark this as a converted statement
            self.converted_direct_stmts.append(stmt_str)
            st = ActiveForm(enz, act_type, is_active, evidence)
            self.statements.append(st)

    def get_activation(self):
        """Extract INDRA Inhibition/Activation Statements from BEL.

        The SPARQL query used to extract Activation Statements looks for
        patterns in which the subject is an ActivityType
        (of a ProteinAbundance) or an Abundance (of a small molecule).
        The object has to be the ActivityType (typically of a
        ProteinAbundance) which is either increased or decreased.

        Examples:

            abundance(CHEBI:gefitinib) directlyDecreases
            kinaseActivity(proteinAbundance(HGNC:EGFR))

            kinaseActivity(proteinAbundance(HGNC:MAP3K5))
            directlyIncreases kinaseActivity(proteinAbundance(HGNC:MAP2K7))

        This pattern covers the extraction of Gap/Gef and GtpActivation
        Statements, which are recognized by the object activity or the
        subject activity, respectively, being `gtpbound`.

        Examples:

            catalyticActivity(proteinAbundance(HGNC:RASA1))
            directlyDecreases
            gtpBoundActivity(proteinAbundance(PFH:"RAS Family"))

            catalyticActivity(proteinAbundance(HGNC:SOS1))
            directlyIncreases
            gtpBoundActivity(proteinAbundance(HGNC:HRAS))

            gtpBoundActivity(proteinAbundance(HGNC:HRAS))
            directlyIncreases
            catalyticActivity(proteinAbundance(HGNC:TIAM1))

        A BEL statement is recorded in self.converted_direct_stmts only if
        an INDRA Statement was actually produced for it.
        """
        q_stmts = prefixes + """
            SELECT ?subjName ?subjActType ?rel ?objName ?objActType
                   ?stmt ?subj ?obj
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subj .
                {?subj belvoc:hasActivityType ?subjActType .
                 ?subj belvoc:hasChild ?subjProt .
                 ?subjProt belvoc:hasConcept ?subjName .}
                UNION
                {?subj a belvoc:Abundance .
                 ?subj belvoc:hasConcept ?subjName .}
                ?stmt belvoc:hasObject ?obj .
                ?obj belvoc:hasActivityType ?objActType .
                ?obj belvoc:hasChild ?objProt .
                ?objProt belvoc:hasConcept ?objName .
                FILTER (?rel = belvoc:DirectlyIncreases ||
                        ?rel = belvoc:DirectlyDecreases)
            }
            """
        for row in self.g.query(q_stmts):
            evidence = self._get_evidence(row[5])
            subj = self._get_agent(row[0], row[6])
            # The subject activity is unbound when the subject is a plain
            # Abundance (second branch of the UNION above).
            subj_activity = row[1]
            if subj_activity:
                subj_activity = term_from_uri(row[1]).lower()
                subj.activity = ActivityCondition(subj_activity, True)
            rel = term_from_uri(row[2])
            is_activation = (rel != 'DirectlyDecreases')
            obj = self._get_agent(row[3], row[7])
            obj_activity = term_from_uri(row[4]).lower()
            stmt_str = strip_statement(row[5])
            # Distinguish the case when the activator is a GTPase
            # (since this may involve unique and stereotyped mechanisms)
            if subj_activity == 'gtpbound':
                if not is_activation:
                    logger.warning('GtpActivation only handles positive '
                                   'activation.')
                    continue
                st = GtpActivation(subj, obj, obj_activity, evidence)
            # If the object is a GTPase, and the subject *increases*
            # its GtpBound activity, then the subject is a GEF
            elif obj_activity == 'gtpbound' and rel == 'DirectlyIncreases':
                st = Gef(subj, obj, evidence)
            # If the object is a GTPase, and the subject *decreases*
            # its GtpBound activity, then the subject is a GAP
            elif obj_activity == 'gtpbound' and rel == 'DirectlyDecreases':
                st = Gap(subj, obj, evidence)
            # Otherwise, create a generic Activity->Activity statement
            elif is_activation:
                st = Activation(subj, obj, obj_activity, evidence)
            else:
                st = Inhibition(subj, obj, obj_activity, evidence)
            self.statements.append(st)
            # Mark this as a converted statement
            self.converted_direct_stmts.append(stmt_str)

    def get_transcription(self):
        """Extract Increase/DecreaseAmount INDRA Statements from BEL.

        Three distinct SPARQL patterns are used to extract amount
        regulations from BEL.

        - q_tscript1 searches for a subject which is a Transcription
          ActivityType of a ProteinAbundance and an object which is
          an RNAAbundance that is either increased or decreased.

          Examples:

              transcriptionalActivity(proteinAbundance(HGNC:FOXP2))
              directlyIncreases
              rnaAbundance(HGNC:SYK)

              transcriptionalActivity(proteinAbundance(HGNC:FOXP2))
              directlyDecreases
              rnaAbundance(HGNC:CALCRL)

        - q_tscript2 searches for a subject which is a ProteinAbundance
          and an object which is an RNAAbundance. Note that this pattern
          typically exists in an indirect form (i.e. increases/decreases).

          Example:

              proteinAbundance(HGNC:MTF1) directlyIncreases
              rnaAbundance(HGNC:LCN1)

        - q_tscript3 searches for a subject which is a
          ModifiedProteinAbundance, with an object which is an RNAAbundance.
          In the BEL large corpus, this pattern is found for
          subjects which are protein families or mouse/rat proteins, and
          the predicate is an indirect increase.

          Example:

              proteinAbundance(PFR:"Akt Family",proteinModification(P))
              increases
              rnaAbundance(RGD:Cald1)
        """
        q_tscript1 = prefixes + """
            SELECT ?tfName ?targetName ?stmt ?tf ?target ?rel
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?target .
                ?subject a belvoc:AbundanceActivity .
                ?subject belvoc:hasActivityType belvoc:Transcription .
                ?subject belvoc:hasChild ?tf .
                ?tf a belvoc:ProteinAbundance .
                ?tf belvoc:hasConcept ?tfName .
                ?target a belvoc:RNAAbundance .
                ?target belvoc:hasConcept ?targetName .
            }
            """
        q_tscript2 = prefixes + """
            SELECT ?tfName ?targetName ?stmt ?tf ?target ?rel
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?tf .
                ?stmt belvoc:hasObject ?target .
                ?tf a belvoc:ProteinAbundance .
                ?tf belvoc:hasConcept ?tfName .
                ?target a belvoc:RNAAbundance .
                ?target belvoc:hasConcept ?targetName .
            }
            """
        q_tscript3 = prefixes + """
            SELECT ?tfName ?targetName ?stmt ?tf ?target ?rel ?mod ?pos
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?target .
                ?subject a belvoc:ModifiedProteinAbundance .
                ?subject belvoc:hasModificationType ?mod .
                ?subject belvoc:hasChild ?tf .
                ?tf belvoc:hasConcept ?tfName .
                ?target a belvoc:RNAAbundance .
                ?target belvoc:hasConcept ?targetName .
                OPTIONAL { ?subject belvoc:hasModificationPosition ?pos . }
            }
            """
        # Each query is tagged with the kind of subject it matches so that
        # the loop below knows which condition to attach to the regulator.
        queries = (('activity', q_tscript1),
                   ('abundance', q_tscript2),
                   ('modified', q_tscript3))
        for subj_kind, q_tscript in queries:
            for row in self.g.query(q_tscript):
                # Get modifications on the subject, if any
                mc = None
                if subj_kind == 'modified':
                    mod = term_from_uri(row[6])
                    mod_pos = term_from_uri(row[7])
                    mc = self._get_mod_condition(mod, mod_pos)
                    if mc is None:
                        continue
                tf = self._get_agent(row[0], row[3])
                if subj_kind == 'activity':
                    tf.activity = ActivityCondition('transcription', True)
                elif subj_kind == 'modified':
                    tf.mods = [mc]
                # Parse out the elements of the query
                evidence = self._get_evidence(row[2])
                target = self._get_agent(row[1], row[4])
                stmt_str = strip_statement(row[2])
                # Get the relationship (increases/decreases, etc.)
                rel = term_from_uri(row[5])
                is_direct = rel in ('DirectlyIncreases', 'DirectlyDecreases')
                # Build the INDRA statement
                if rel in ('DirectlyIncreases', 'Increases'):
                    st = IncreaseAmount(tf, target, evidence)
                elif rel in ('DirectlyDecreases', 'Decreases'):
                    st = DecreaseAmount(tf, target, evidence)
                else:
                    # Any other relationship does not match the pattern
                    continue
                # We've matched a pattern, mark this as a converted statement
                if is_direct:
                    self.statements.append(st)
                    self.converted_direct_stmts.append(stmt_str)
                else:
                    self.indirect_stmts.append(st)
                    self.converted_indirect_stmts.append(stmt_str)

    def get_conversions(self):
        """Extract Conversion INDRA Statements from BEL.

        The SPARQL query used to extract Conversions searches for
        a subject (controller) which is an AbundanceActivity
        which directlyIncreases a Reaction with a given list of
        Reactants and Products.

        Examples:

            catalyticActivity(proteinAbundance(HGNC:HMOX1))
            directlyIncreases
            reaction(reactants(abundance(CHEBI:heme)),
            products(abundance(SCHEM:Biliverdine),
            abundance(CHEBI:"carbon monoxide")))
        """
        # NOTE(review): ?rel is bound but not filtered here, so the query
        # itself does not restrict matches to directlyIncreases.
        query = prefixes + """
            SELECT DISTINCT ?controller ?controllerName ?controllerActivity
                ?product ?productName ?reactant ?reactantName ?stmt
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasRelationship ?rel .
                ?stmt belvoc:hasSubject ?subject .
                ?stmt belvoc:hasObject ?rxn .
                ?subject a belvoc:AbundanceActivity .
                ?subject belvoc:hasActivityType ?controllerActivity .
                ?subject belvoc:hasChild ?controller .
                ?controller belvoc:hasConcept ?controllerName .
                ?rxn a belvoc:Reaction .
                ?rxn belvoc:hasChild ?reactants .
                ?reactants rdfs:label ?reactLabel .
                FILTER (regex(?reactLabel, "^reactants.*"))
                ?rxn belvoc:hasChild ?products .
                ?products rdfs:label ?prodLabel .
                FILTER (regex(?prodLabel, "^products.*"))
                ?reactants belvoc:hasChild ?reactant .
                ?products belvoc:hasChild ?product .
                ?reactant belvoc:hasConcept ?reactantName .
                ?product belvoc:hasConcept ?productName .
            }
            """
        res = self.g.query(query)
        # We need to collect all pieces of the same statement so that we can
        # collect multiple reactants and products
        stmt_map = collections.defaultdict(list)
        for row in res:
            stmt_map[row[-1]].append(row)
        for rows in stmt_map.values():
            # First we get the shared part of the Statement
            first = rows[0]
            subj = self._get_agent(first[1], first[0])
            evidence = self._get_evidence(first[-1])
            stmt_str = strip_statement(first[-1])
            # Now we collect the participants, creating only one Agent per
            # distinct reactant/product name
            obj_from_map = {}
            obj_to_map = {}
            for row in rows:
                reactant_name = row[6]
                product_name = row[4]
                if reactant_name not in obj_from_map:
                    obj_from_map[reactant_name] = \
                        self._get_agent(row[6], row[5])
                if product_name not in obj_to_map:
                    obj_to_map[product_name] = \
                        self._get_agent(row[4], row[3])
            obj_from = list(obj_from_map.values())
            obj_to = list(obj_to_map.values())
            st = Conversion(subj, obj_from, obj_to, evidence=evidence)
            # Mark this as a converted statement
            self.statements.append(st)
            self.converted_direct_stmts.append(stmt_str)

    def get_all_direct_statements(self):
        """Get all directlyIncreases/Decreases BEL statements.

        This method stores the results of the query in self.all_direct_stmts
        as a list of strings. The SPARQL query used to find direct BEL
        statements searches for all statements whose predicate is either
        DirectlyIncreases or DirectlyDecreases.
        """
        logger.info("Getting all direct statements...\n")
        q_stmts = prefixes + """
            SELECT ?stmt
            WHERE {
                ?stmt a belvoc:Statement .
                {
                  { ?stmt belvoc:hasRelationship belvoc:DirectlyIncreases . }
                  UNION
                  { ?stmt belvoc:hasRelationship belvoc:DirectlyDecreases . }
                }
            }
            """
        res_stmts = self.g.query(q_stmts)
        self.all_direct_stmts = [strip_statement(row[0])
                                 for row in res_stmts]

    def get_all_indirect_statements(self):
        """Get all indirect increases/decreases BEL statements.

        This method stores the results of the query in self.all_indirect_stmts
        as a list of strings. The SPARQL query used to find indirect BEL
        statements searches for all statements whose predicate is either
        Increases or Decreases.
        """
        q_stmts = prefixes + """
            SELECT ?stmt
            WHERE {
                ?stmt a belvoc:Statement .
                {
                  { ?stmt belvoc:hasRelationship belvoc:Increases . }
                  UNION
                  { ?stmt belvoc:hasRelationship belvoc:Decreases . }
                }
            }
            """
        res_stmts = self.g.query(q_stmts)
        self.all_indirect_stmts = [strip_statement(row[0])
                                   for row in res_stmts]

    def get_degenerate_statements(self):
        """Get all degenerate BEL statements.

        Stores the results of the query in self.degenerate_stmts. The query
        matches direct statements whose subject is a (modified) protein
        abundance and whose object is a protein abundance, a modified
        protein abundance or an activity of a protein with a different
        concept than the subject.
        """
        logger.info("Checking for 'degenerate' statements...\n")
        # Get rules of type protein X -> activity Y
        q_stmts = prefixes + """
            SELECT ?stmt
            WHERE {
                ?stmt a belvoc:Statement .
                ?stmt belvoc:hasSubject ?subj .
                ?stmt belvoc:hasObject ?obj .
                {
                  { ?stmt belvoc:hasRelationship belvoc:DirectlyIncreases . }
                  UNION
                  { ?stmt belvoc:hasRelationship belvoc:DirectlyDecreases . }
                }
                {
                  { ?subj a belvoc:ProteinAbundance . }
                  UNION
                  { ?subj a belvoc:ModifiedProteinAbundance . }
                }
                ?subj belvoc:hasConcept ?xName .
                {
                  {
                    ?obj a belvoc:ProteinAbundance .
                    ?obj belvoc:hasConcept ?yName .
                  }
                  UNION
                  {
                    ?obj a belvoc:ModifiedProteinAbundance .
                    ?obj belvoc:hasChild ?proteinY .
                    ?proteinY belvoc:hasConcept ?yName .
                  }
                  UNION
                  {
                    ?obj a belvoc:AbundanceActivity .
                    ?obj belvoc:hasChild ?objChild .
                    ?objChild a belvoc:ProteinAbundance .
                    ?objChild belvoc:hasConcept ?yName .
                  }
                }
                FILTER (?xName != ?yName)
            }
            """
        res_stmts = self.g.query(q_stmts)

        logger.info("Protein -> Protein/Activity statements:")
        logger.info("---------------------------------------")
        for row in res_stmts:
            stmt_str = strip_statement(row[0])
            logger.info(stmt_str)
            self.degenerate_stmts.append(stmt_str)

    def print_statement_coverage(self):
        """Display how many of the direct statements have been converted.

        Also prints how many are considered 'degenerate' and not converted.
        The lists of all direct, degenerate and all indirect statements are
        queried first if they are still empty.
        """
        if not self.all_direct_stmts:
            self.get_all_direct_statements()
        if not self.degenerate_stmts:
            self.get_degenerate_statements()
        if not self.all_indirect_stmts:
            self.get_all_indirect_statements()

        logger.info('')
        logger.info("Total indirect statements: %d",
                    len(self.all_indirect_stmts))
        logger.info("Converted indirect statements: %d",
                    len(self.converted_indirect_stmts))
        logger.info(">> Unhandled indirect statements: %d",
                    len(self.all_indirect_stmts) -
                    len(self.converted_indirect_stmts))
        logger.info('')
        logger.info("Total direct statements: %d", len(self.all_direct_stmts))
        logger.info("Converted direct statements: %d",
                    len(self.converted_direct_stmts))
        logger.info("Degenerate direct statements: %d",
                    len(self.degenerate_stmts))
        logger.info(">> Unhandled direct statements: %d",
                    len(self.all_direct_stmts) -
                    len(self.converted_direct_stmts) -
                    len(self.degenerate_stmts))

        # Build the lookup sets once so that each membership test below is
        # constant time instead of a scan over the full lists.
        degenerate = set(self.degenerate_stmts)
        handled_direct = degenerate.union(self.converted_direct_stmts)
        handled_indirect = degenerate.union(self.converted_indirect_stmts)

        logger.info('')
        logger.info("--- Unhandled direct statements ---------")
        for stmt in self.all_direct_stmts:
            if stmt not in handled_direct:
                logger.info(stmt)
        logger.info('')
        logger.info("--- Unhandled indirect statements ---------")
        for stmt in self.all_indirect_stmts:
            if stmt not in handled_indirect:
                logger.info(stmt)

    def print_statements(self):
        """Print all extracted INDRA Statements."""
        logger.info('--- Direct INDRA statements ----------')
        for i, stmt in enumerate(self.statements):
            logger.info("%s: %s", i, stmt)
        logger.info('--- Indirect INDRA statements ----------')
        for i, stmt in enumerate(self.indirect_stmts):
            logger.info("%s: %s", i, stmt)

    @staticmethod
    def _get_agent(concept, entity):
        """Return an INDRA Agent for a BEL concept/entity pair.

        Parameters
        ----------
        concept : str
            URI of the concept; its term (see term_from_uri) is the
            entity's name within its namespace.
        entity : str
            URI of the BEL term; its namespace (see namespace_from_uri)
            determines how the name is grounded.

        Returns
        -------
        indra.statements.Agent
            An Agent whose db_refs are filled in according to the namespace.
            For unhandled namespaces the Agent has empty db_refs and is
            named by the concept's term.
        """
        name = term_from_uri(concept)
        namespace = namespace_from_uri(entity)
        db_refs = {}
        if namespace == 'HGNC':
            agent_name = name
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id is not None:
                db_refs['HGNC'] = str(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.warning('HGNC entity %s with HGNC ID %s has no '
                                   'corresponding Uniprot ID.',
                                   name, hgnc_id)
            else:
                logger.warning("Couldn't get HGNC ID for HGNC symbol %s",
                               name)
        elif namespace in ('MGI', 'RGD'):
            agent_name = name
            db_refs[namespace] = name
        elif namespace in ('PFH', 'SFAM', 'NCH', 'SCOMP'):
            # Families (PFH, SFAM) and named complexes (NCH, SCOMP) are
            # both looked up in the same BEL-to-INDRA name map; only the
            # wording of the warning differs.
            kind = 'family' if namespace in ('PFH', 'SFAM') else 'complex'
            indra_name = bel_to_indra.get(name)
            db_refs[namespace] = name
            if indra_name is None:
                agent_name = name
                logger.warning('Could not find mapping for BEL %s: %s',
                               kind, name)
            else:
                db_refs['BE'] = indra_name
                db_refs['TEXT'] = name
                agent_name = indra_name
        elif namespace == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs['CHEBI'] = chebi_id
            else:
                logger.warning('CHEBI name %s not found in map.', name)
            agent_name = name
        elif namespace == 'EGID':
            # Here the "name" is an Entrez gene ID
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs['EGID'] = name
            if hgnc_id is not None:
                db_refs['HGNC'] = str(hgnc_id)
                agent_name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.warning('HGNC entity %s with HGNC ID %s has no '
                                   'corresponding Uniprot ID.',
                                   agent_name, hgnc_id)
            else:
                logger.warning('Could not map EGID%s to HGNC.', name)
                agent_name = 'E%s' % name
        else:
            logger.warning('Unhandled entity namespace: %s (%s, %s)',
                           namespace, concept, entity)
            agent_name = name
        agent = Agent(agent_name, db_refs=db_refs)
        return agent

    def _get_evidence(self, statement):
        """Return a list of Evidence objects for a BEL statement.

        Parameters
        ----------
        statement : rdflib term
            The URI of the BEL statement in the graph.

        Returns
        -------
        list[indra.statements.Evidence]
            One Evidence per evidence text whose citation contains a
            parseable 'pubmed:<id>' reference. If there is none, a single
            Evidence without text and PMID is returned, so the list is
            never empty. All returned Evidences carry the statement's
            annotations and an epistemics dict whose 'direct' entry
            reflects the statement's relationship.
        """
        # NOTE(review): format() without arguments appears to be used to
        # get the plain string form of the rdflib terms -- confirm.
        stmt_uri = statement.format()

        # Query for all annotations of the statement
        q_annotations = prefixes + """
            SELECT ?annotation
            WHERE {
                <%s> belvoc:hasEvidence ?evidence .
                ?evidence belvoc:hasAnnotation ?annotation .
            }
            """ % stmt_uri
        annotations = [row[0].format()
                       for row in self.g.query(q_annotations)]

        # Query for evidence text and citation
        q_evidence = prefixes + """
            SELECT ?evidenceText ?citation
            WHERE {
                <%s> belvoc:hasEvidence ?evidence .
                ?evidence belvoc:hasEvidenceText ?evidenceText .
                ?evidence belvoc:hasCitation ?citation .
            }
            """ % stmt_uri
        res_evidence = self.g.query(q_evidence)

        # Query for directness
        q_direct = prefixes + """
            SELECT ?predicate
            WHERE {
                <%s> belvoc:hasRelationship ?predicate .
            }
            """ % stmt_uri
        res_direct = self.g.query(q_direct)
        epistemics = {}
        if res_direct:
            rel = term_from_uri(list(res_direct)[0][0])
            if rel in ('DirectlyDecreases', 'DirectlyIncreases'):
                epistemics['direct'] = True
            if rel in ('Decreases', 'Increases'):
                epistemics['direct'] = False

        evs = []
        for row in res_evidence:
            text = row[0].toPython()
            citation = row[1].toPython()
            m = None
            if citation is not None:
                m = re.match('.*pubmed:([0-9]+)', citation)
            if m is None:
                logger.warning('Could not parse citation: %s', citation)
                continue
            # Each Evidence gets its own copies so that changing one
            # Evidence's annotations/epistemics cannot affect another.
            ev = Evidence(source_api='bel', source_id=statement,
                          pmid=m.group(1), text=text,
                          annotations=list(annotations),
                          epistemics=dict(epistemics))
            evs.append(ev)
        if not evs:
            evs = [Evidence(source_api='bel', source_id=statement,
                            annotations=annotations, epistemics=epistemics)]
        return evs

    @staticmethod
    def _get_residue(mod):
        """Return the residue encoded in a BEL modification type.

        Parameters
        ----------
        mod : str
            A BEL modification type, e.g. 'PhosphorylationSerine'.

        Returns
        -------
        The result of get_valid_residue for the lower-cased suffix that
        follows 'Phosphorylation', or None if `mod` is exactly
        'Phosphorylation' or is not a phosphorylation at all.
        """
        prefix = 'Phosphorylation'
        if not mod.startswith(prefix) or mod == prefix:
            return None
        return get_valid_residue(mod[len(prefix):].lower())

    @staticmethod
    def _get_mod_condition(mod, mod_pos):
        """Return a ModCondition for a BEL modification type and position.

        Parameters
        ----------
        mod : str or None
            A BEL modification type, e.g. 'PhosphorylationSerine'.
        mod_pos : str or None
            The position of the modification, if known.

        Returns
        -------
        indra.statements.ModCondition or None
            None if `mod` is empty or None. Otherwise a ModCondition whose
            type is 'phosphorylation' for any type starting with
            'Phosphorylation' and the lower-cased `mod` for all others.
        """
        if not mod:
            return None
        if mod.startswith('Phosphorylation'):
            mc = ModCondition('phosphorylation')
        else:
            mc = ModCondition(mod.lower())
        mc.residue = BelProcessor._get_residue(mod)
        mc.position = mod_pos
        return mc
def _build_bioentities_map():
    """Load the mapping from BEL names to INDRA names.

    The mapping is read from the tab-separated file
    ../../resources/bioentities_map.tsv, located relative to this module.

    Returns
    -------
    dict
        Maps the entry in the second column to the name in the third
        column for every row whose first column (the namespace) is 'BEL'.
    """
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../../resources/bioentities_map.tsv')
    bel_to_indra = {}
    for row in read_unicode_csv(fname, delimiter='\t'):
        namespace, entry, indra_name = row[0], row[1], row[2]
        # Rows belonging to other namespaces are ignored
        if namespace == 'BEL':
            bel_to_indra[entry] = indra_name
    return bel_to_indra
def _build_chebi_map():
    """Load the mapping from CHEBI names to CHEBI IDs.

    The mapping is read from the tab-separated file
    ../../resources/bel_chebi_map.tsv, located relative to this module.

    Returns
    -------
    dict
        Maps the name in the first column of each row to the ID in the
        second column. If a name occurs in several rows, the last row wins.
    """
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../../resources/bel_chebi_map.tsv')
    chebi_name_id = {}
    for row in read_unicode_csv(fname, delimiter='\t'):
        chebi_name, chebi_id = row[0], row[1]
        chebi_name_id[chebi_name] = chebi_id
    return chebi_name_id
# Both lookup tables are built once, when the module is imported.
# bel_to_indra is consulted for BEL family and complex names, chebi_name_id
# for CHEBI names.
bel_to_indra = _build_bioentities_map()
chebi_name_id = _build_chebi_map()