# -*- coding: utf-8 -*-
"""High level API functions for the PyBEL processor."""
import zlib
import json
import pybel
import logging
import requests
from functools import lru_cache
import pybel.constants as pc
from pybel.io.sbel import add_sbel_row
from .processor import PybelProcessor
logger = logging.getLogger(__name__)

# Release tag of the selventa-knowledge repository the corpus files are
# downloaded from.
version = 'v1.1.2'
# URL template; the placeholders are filled with the release tag and the
# corpus file name, in that order.
branch = (
    'https://github.com/cthoyt/selventa-knowledge/raw/'
    '{}/selventa_knowledge/{}'
)
# Gzipped node-link JSON exports of the Selventa Large and Small Corpus.
large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')
small_corpus_url = branch.format(version, 'small_corpus.bel.nodelink.json.gz')
def process_small_corpus():
    """Process the Selventa Small Corpus into a PybelProcessor.

    The corpus is fetched from ``small_corpus_url`` as a gzipped node-link
    JSON graph and handed to :func:`process_pybel_network`.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        its statements attribute.
    """
    return process_pybel_network('graph_jsongz_url', small_corpus_url)
def process_large_corpus():
    """Process the Selventa Large Corpus into a PybelProcessor.

    The corpus is fetched from ``large_corpus_url`` as a gzipped node-link
    JSON graph and handed to :func:`process_pybel_network`.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        its statements attribute.
    """
    return process_pybel_network('graph_jsongz_url', large_corpus_url)
def process_pybel_network(network_type, network_file, **kwargs):
    """Return PybelProcessor by processing a given network file.

    Parameters
    ----------
    network_type : str
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
    network_file : str
        Path to the network file/URL to process. For the graph_jsongz_url
        type, a missing value (None or an empty string) falls back to the
        Selventa Large Corpus URL.
    **kwargs
        Keyword arguments forwarded to :func:`process_belscript` when
        network_type is belscript. They are not used for any other
        network type.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.

    Raises
    ------
    ValueError
        If network_type is not one of the supported options.
    requests.HTTPError
        If network_type is graph_jsongz_url and the download returns an
        error status code.
    """
    if network_type == 'belscript':
        return process_belscript(network_file, **kwargs)
    elif network_type == 'json':
        return process_json_file(network_file)
    elif network_type == 'cbn_jgif':
        return process_cbn_jgif_file(network_file)
    elif network_type == 'graph_jsongz_url':
        if not network_file:
            network_file = large_corpus_url
        # Pass the argument to the logger instead of pre-formatting so the
        # message is only built when the record is actually emitted.
        logger.info('Loading %s', network_file)
        res = requests.get(network_file)
        res.raise_for_status()
        # MAX_WBITS | 32 makes zlib auto-detect a gzip or zlib header.
        contentb = zlib.decompress(res.content, zlib.MAX_WBITS | 32)
        content = contentb.decode('utf-8')
        graph = pybel.from_nodelink_jsons(content)
        return process_pybel_graph(graph)
    elif network_type == 'graph_pickle':
        # NOTE: unpickling can execute arbitrary code; only pass pickle
        # files that come from a trusted source.
        graph = pybel.from_pickle(network_file)
        return process_pybel_graph(graph)
    else:
        raise ValueError('Unknown network type: %s' % network_type)
def process_pybel_neighborhood(entity_names, network_type='graph_jsongz_url',
                               network_file=None, **kwargs):
    """Return PybelProcessor restricted to Statements about given entities.

    The network is first processed in full with
    :func:`process_pybel_network`; the resulting Statements are then
    filtered down to those with at least one Agent whose name is in
    entity_names.

    Parameters
    ----------
    entity_names : list[str]
        A list of entity names (e.g., gene names) used as the basis of
        filtering. A Statement is kept if any of its (non-None) Agents has
        a name appearing in this list.
    network_type : Optional[str]
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
        Default: graph_jsongz_url
    network_file : Optional[str]
        Path to the network file/URL to process. If not given, by default,
        the Selventa Large Corpus is used via a URL pointing to a gzipped
        PyBEL Graph JSON file.
    **kwargs
        Additional keyword arguments passed on to
        :func:`process_pybel_network`.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object whose bp.statements holds only the
        Statements that passed the filter.
    """
    processor = process_pybel_network(network_type, network_file, **kwargs)
    # A set gives constant-time membership tests for every Agent name.
    wanted = set(entity_names)
    processor.statements = [
        stmt for stmt in processor.statements
        if any(agent is not None and agent.name in wanted
               for agent in stmt.agent_list())
    ]
    return processor
def process_bel_stmt(bel: str, squeeze: bool = False):
    """Process a single BEL statement string.

    The statement is parsed with PyBEL, added to a fresh BELGraph and run
    through :func:`process_pybel_graph`.

    Parameters
    ----------
    bel : str
        A BEL statement. See example below.
    squeeze : Optional[bool]
        If True and the processor extracted exactly one statement, that
        statement is returned instead of the processor.

    Returns
    -------
    result : Union[Statement, PybelProcessor]
        The PybelProcessor built from the BEL statement, or, if squeeze is
        True and exactly one INDRA Statement was extracted, that single
        INDRA Statement.

    Examples
    --------
    >>> from indra.sources.bel import process_bel_stmt
    >>> bel_s = 'kin(p(FPLX:MEK)) -> kin(p(FPLX:ERK))'
    >>> process_bel_stmt(bel_s, squeeze=True)
    Activation(MEK(kinase), ERK(), kinase)
    """
    parsed = pybel.parse(bel)
    participant_keys = (
        (pc.SOURCE, pc.SOURCE_MODIFIER),
        (pc.TARGET, pc.TARGET_MODIFIER),
    )
    movable_keys = (pc.MODIFIER, pc.EFFECT, pc.FROM_LOC, pc.TO_LOC,
                    pc.LOCATION)
    # Move modifier-type entries out of the source/target entry and into
    # the matching source/target modifier entry of the parsed row.
    for node_key, modifier_key in participant_keys:
        node = parsed[node_key]
        for key in movable_keys:
            if key not in node:
                continue
            # The modifier dict is only created once there is something
            # to put into it.
            parsed.setdefault(modifier_key, {})[key] = node.pop(key)

    bel_graph = pybel.BELGraph()
    add_sbel_row(bel_graph, parsed)
    processor = process_pybel_graph(bel_graph)
    if squeeze and len(processor.statements) == 1:
        return processor.statements[0]
    return processor
@lru_cache(maxsize=100)
def process_pybel_graph(graph):
    """Return a PybelProcessor by processing a PyBEL graph.

    Results are memoized with ``lru_cache`` (up to 100 entries), keyed on
    the graph argument, so calling this again with the same graph returns
    the same PybelProcessor object rather than a new one.

    Parameters
    ----------
    graph : pybel.struct.BELGraph
        A PyBEL graph to process

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    processor = PybelProcessor(graph)
    processor.get_statements()
    if processor.annot_manager.failures:
        # Total number of values recorded across all failure entries.
        num_missing = sum(
            len(values)
            for values in processor.annot_manager.failures.values()
        )
        logger.warning('missing %d annotation pairs', num_missing)
    return processor
def process_belscript(file_name, **kwargs):
    """Return a PybelProcessor by processing a BEL script file.

    Keyword arguments are passed directly to pybel.from_bel_script; for
    further information, see the PyBEL I/O documentation at
    pybel.readthedocs.io/en/latest/io.html

    Two keyword arguments are given values here unless the caller
    supplies them explicitly: `citation_clearing` is set to False
    and `no_identifier_validation` is set to True.

    Parameters
    ----------
    file_name : str
        The path to a BEL script file.
    **kwargs
        Keyword arguments forwarded to pybel.from_bel_script.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    # Only fill in these options when the caller did not set them.
    kwargs.setdefault('citation_clearing', False)
    kwargs.setdefault('no_identifier_validation', True)
    return process_pybel_graph(pybel.from_bel_script(file_name, **kwargs))
def process_json_file(file_name):
    """Return a PybelProcessor by processing a Node-Link JSON file.

    The file is loaded with pybel.from_nodelink_file with version checking
    turned off. For more information on this format, see:
    http://pybel.readthedocs.io/en/latest/io.html#node-link-json

    Parameters
    ----------
    file_name : str
        The path to a Node-Link JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    return process_pybel_graph(
        pybel.from_nodelink_file(file_name, check_version=False)
    )
def process_cbn_jgif_file(file_name):
    """Return a PybelProcessor by processing a CBN JGIF JSON file.

    Parameters
    ----------
    file_name : str
        The path to a CBN JGIF JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    # Read as UTF-8 explicitly: without it open() falls back to the
    # locale's preferred encoding, which is not UTF-8 on every platform.
    with open(file_name, 'r', encoding='utf-8') as jgf:
        jgif_content = json.load(jgf)
    # The file is closed before the (potentially long) graph conversion
    # and statement extraction start.
    return process_pybel_graph(pybel.from_cbn_jgif(jgif_content))