Source code for indra.sources.gnbr.api

__all__ = ['process_gene_gene', 'process_gene_gene_from_web',
           'process_gene_disease', 'process_gene_disease_from_web',
           'process_chemical_disease', 'process_chemical_disease_from_web',
           'process_chemical_gene', 'process_chemical_gene_from_web',
           'process_from_files', 'process_from_web']

import pandas as pd
import logging
from .processor import GnbrProcessor

base_url = 'https://zenodo.org/record/3459420/files'
logger = logging.getLogger(__name__)


[docs]def process_gene_gene(part1_path: str, part2_path: str,
                      indicator_only: bool = True) -> GnbrProcessor:
    """Process gene–gene interactions.

    Parameters
    ----------
    part1_path :
        Path to the first dataset which contains dependency paths and themes.
    part2_path :
        Path to the second dataset which contains dependency paths and entity
        pairs.
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_files(part1_path, part2_path, 'gene', 'gene',
                              indicator_only=indicator_only)


[docs]def process_chemical_gene(part1_path: str, part2_path: str,
                          indicator_only: bool = True) -> GnbrProcessor:
    """Process chemical–gene interactions.

    Parameters
    ----------
    part1_path :
        Path to the first dataset of dependency paths and themes.
    part2_path :
        Path to the second dataset of dependency paths and entity pairs.
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted
        INDRA Statements in its statements attribute.
    """
    return process_from_files(part1_path, part2_path, 'chemical', 'gene',
                              indicator_only=indicator_only)


[docs]def process_gene_disease(part1_path: str, part2_path: str,
                         indicator_only: bool = True) -> GnbrProcessor:
    """Process gene–disease interactions.

    Parameters
    ----------
    part1_path :
        Path to the first dataset which contains dependency paths and themes.
    part2_path :
        Path to the second dataset which contains dependency paths and entity
        pairs.
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_files(part1_path, part2_path, 'gene', 'disease',
                              indicator_only=indicator_only)


[docs]def process_chemical_disease(part1_path: str, part2_path: str,
                             indicator_only: bool = True) \
        -> GnbrProcessor:
    """Process chemical–disease interactions.

    Parameters
    ----------
    part1_path :
        Path to the first dataset which contains dependency paths and themes.
    part2_path :
        Path to the second dataset which contains dependency paths and entity
        pairs.
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_files(part1_path, part2_path, 'chemical', 'disease',
                              indicator_only=indicator_only)


[docs]def process_from_files(part1_path: str, part2_path: str, first_type: str,
                       second_type: str, indicator_only: bool = True) \
        -> GnbrProcessor:
    """Loading the databases from the given files.

    Parameters
    ----------
    part1_path :
        Path to the first dataset which contains dependency paths and themes.
    part2_path :
        Path to the second dataset which contains dependency paths and themes.
    first_type :
        Type of the first agent.
    second_type :
        Type of the second agent.
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    logger.info(f'Loading part 1 table from {part1_path}')
    df1: pd.DataFrame = pd.read_csv(part1_path, sep='\t')
    logger.info(f'Loading part 2 table from {part2_path}')
    df2: pd.DataFrame = pd.read_csv(part2_path, sep='\t', header=None)
    gp: GnbrProcessor = GnbrProcessor(df1, df2, first_type, second_type,
                                      indicator_only=indicator_only)
    gp.extract_stmts()
    return gp


[docs]def process_gene_gene_from_web(indicator_only: bool = True) -> GnbrProcessor:
    """Call process_gene_gene function on the GNBR datasets.

    Parameters
    ----------
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_web('gene', 'gene', indicator_only=indicator_only)


[docs]def process_chemical_gene_from_web(indicator_only: bool = True) \
        -> GnbrProcessor:
    """Call process_chemical_gene function on the GNBR datasets.

    Parameters
    ----------
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_web('chemical', 'gene', indicator_only=indicator_only)


[docs]def process_gene_disease_from_web(indicator_only: bool = True) -> GnbrProcessor:
    """Call process_gene_disease function on the GNBR datasets.

    Parameters
    ----------
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_web('gene', 'disease', indicator_only=indicator_only)


[docs]def process_chemical_disease_from_web(indicator_only: bool = True)\
        -> GnbrProcessor:
    """Call process_chemical_disease function on the GNBR datasets.

    Parameters
    ----------
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    return process_from_web('chemical', 'disease',
                             indicator_only=indicator_only)


[docs]def process_from_web(first_type, second_type, indicator_only: bool = True)\
        -> GnbrProcessor:
    """Loading the databases from the given urls.

    Parameters
    ----------
    first_type :
        Type of the first agent.
    second_type :
        Type of the second agent.
    indicator_only :
        A switch to filter the data which is part of the flagship path set
        for each theme.

    Returns
    -------
    :
        A GnbrProcessor object which contains a list of extracted INDRA
        Statements in its statements attribute.
    """
    fname1 = (f'{base_url}/part-i-{first_type}-{second_type}-path-theme-'
              f'distributions.txt.gz')
    fname2 = (f'{base_url}/part-ii-dependency-paths-{first_type}-{second_type}'
              f'-sorted-with-themes.txt.gz')
    return process_from_files(fname1, fname2, first_type, second_type,
                               indicator_only=indicator_only)