Source code for indra.sources.bel.api

# -*- coding: utf-8 -*-

"""High level API functions for the PyBEL processor."""

import zlib
import json
import pybel
import logging
import requests
from functools import lru_cache

import pybel.constants as pc
from pybel.io.sbel import add_sbel_row

from .processor import PybelProcessor


logger = logging.getLogger(__name__)

version = 'v1.0.0'
branch = 'https://github.com/cthoyt/selventa-knowledge/raw/' \
         '{}/selventa_knowledge/{}'
large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')
small_corpus_url = branch.format(version, 'small_corpus.bel.nodelink.json.gz')


[docs]def process_small_corpus():
    """Return PybelProcessor with statements from Selventa Small Corpus.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        its statements attribute.
    """
    return process_pybel_network(network_type='graph_jsongz_url',
                                 network_file=small_corpus_url)


[docs]def process_large_corpus():
    """Return PybelProcessor with statements from Selventa Large Corpus.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        its statements attribute.
    """
    return process_pybel_network(network_type='graph_jsongz_url',
                                 network_file=large_corpus_url)


[docs]def process_pybel_network(network_type, network_file, **kwargs):
    """Return PybelProcessor by processing a given network file.

    Parameters
    ----------
    network_type : str
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
        Default: graph_jsongz_url
    network_file : str
        Path to the network file/URL to process.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    if network_type == 'belscript':
        return process_belscript(network_file, **kwargs)
    elif network_type == 'json':
        return process_json_file(network_file)
    elif network_type == 'cbn_jgif':
        return process_cbn_jgif_file(network_file)
    elif network_type == 'graph_jsongz_url':
        if not network_file:
            network_file = large_corpus_url
        logger.info('Loading %s' % network_file)
        res = requests.get(network_file)
        res.raise_for_status()
        contentb = zlib.decompress(res.content, zlib.MAX_WBITS | 32)
        content = contentb.decode('utf-8')
        graph = pybel.from_nodelink_jsons(content)
        return process_pybel_graph(graph)
    elif network_type == 'graph_pickle':
        graph = pybel.from_pickle(network_file)
        return process_pybel_graph(graph)
    else:
        raise ValueError('Unknown network type: %s' % network_type)


[docs]def process_pybel_neighborhood(entity_names, network_type='graph_jsongz_url',
                               network_file=None, **kwargs):
    """Return PybelProcessor around neighborhood of given genes in a network.

    This function processes the given network file and filters the returned
    Statements to ones that contain genes in the given list.

    Parameters
    ----------
    entity_names : list[str]
        A list of entity names (e.g., gene names) which will be used as the
        basis of filtering the result. If any of the Agents of an extracted
        INDRA Statement has a name appearing in this list, the Statement is
        retained in the result.
    network_type : Optional[str]
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
        Default: graph_jsongz_url
    network_file : Optional[str]
        Path to the network file/URL to process. If not given, by default, the
        Selventa Large Corpus is used via a URL pointing to a gzipped PyBEL
        Graph JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    bp = process_pybel_network(network_type, network_file, **kwargs)
    filtered_stmts = []
    filter_names = set(entity_names)
    for stmt in bp.statements:
        found = False
        for agent in stmt.agent_list():
            if agent is not None:
                if agent.name in filter_names:
                    found = True
        if found:
            filtered_stmts.append(stmt)

    bp.statements = filtered_stmts

    return bp


[docs]def process_bel_stmt(bel: str, squeeze: bool = False):
    """Process a single BEL statement and return the PybelProcessor
    or a single statement if ``squeeze`` is True.

    Parameters
    ----------
    bel : str
        A BEL statement. See example below.
    squeeze : Optional[bool]
        If squeeze and there's only one statement in the processor,
        it will be unpacked.

    Returns
    -------
    statements : Union[Statement, PybelProcessor]
        A list of INDRA statments derived from the BEL statement.
        If squeeze is true and there was only one statement, the
        unpacked INDRA statement will be returned.

    Examples
    --------
    >>> from indra.sources.bel import process_bel_stmt
    >>> bel_s = 'kin(p(FPLX:MEK)) -> kin(p(FPLX:ERK))'
    >>> process_bel_stmt(bel_s, squeeze=True)
    Activation(MEK(kinase), ERK(), kinase)
    """
    r = pybel.parse(bel)
    # make sure activations in the right place
    for a, b in [(pc.SOURCE, pc.SOURCE_MODIFIER), (pc.TARGET, pc.TARGET_MODIFIER)]:
        side = r[a]
        for c in [pc.MODIFIER, pc.EFFECT, pc.FROM_LOC, pc.TO_LOC, pc.LOCATION]:
            if c in side:
                r.setdefault(b, {})[c] = side.pop(c)
    graph = pybel.BELGraph()
    add_sbel_row(graph, r)
    bp = process_pybel_graph(graph)
    if squeeze and len(bp.statements) == 1:
        return bp.statements[0]
    return bp


[docs]@lru_cache(maxsize=100)
def process_pybel_graph(graph):
    """Return a PybelProcessor by processing a PyBEL graph.

    Parameters
    ----------
    graph : pybel.struct.BELGraph
        A PyBEL graph to process

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    bp = PybelProcessor(graph)
    bp.get_statements()
    if bp.annot_manager.failures:
        logger.warning('missing %d annotation pairs',
                       sum(len(v)
                           for v in bp.annot_manager.failures.values()))
    return bp


[docs]def process_belscript(file_name, **kwargs):
    """Return a PybelProcessor by processing a BEL script file.

    Key word arguments are passed directly to pybel.from_path,
    for further information, see
    pybel.readthedocs.io/en/latest/io.html#pybel.from_path
    Some keyword arguments we use here differ from the defaults
    of PyBEL, namely we set `citation_clearing` to False
    and `no_identifier_validation` to True.

    Parameters
    ----------
    file_name : str
        The path to a BEL script file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    if 'citation_clearing' not in kwargs:
        kwargs['citation_clearing'] = False
    if 'no_identifier_validation' not in kwargs:
        kwargs['no_identifier_validation'] = True
    pybel_graph = pybel.from_bel_script(file_name, **kwargs)
    return process_pybel_graph(pybel_graph)


[docs]def process_json_file(file_name):
    """Return a PybelProcessor by processing a Node-Link JSON file.

    For more information on this format, see:
    http://pybel.readthedocs.io/en/latest/io.html#node-link-json

    Parameters
    ----------
    file_name : str
        The path to a Node-Link JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    pybel_graph = pybel.from_nodelink_file(file_name, check_version=False)
    return process_pybel_graph(pybel_graph)


[docs]def process_cbn_jgif_file(file_name):
    """Return a PybelProcessor by processing a CBN JGIF JSON file.

    Parameters
    ----------
    file_name : str
        The path to a CBN JGIF JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    with open(file_name, 'r') as jgf:
        return process_pybel_graph(pybel.from_cbn_jgif(json.load(jgf)))