# Source code for indra.sources.wormbase.api

# Public API of this module: the two entry points defined below.
__all__ = ['process_from_files', 'process_from_web']


from .processor import WormBaseProcessor
from collections import namedtuple
import pandas as pd

# Download location of the C. elegans molecular interactions data file.
wormbase_mol_file_url = (
    'https://fms.alliancegenome.org/download/INTERACTION-MOL_WB.tsv.gz'
)
# Download location of the C. elegans genetic interactions data file.
wormbase_gen_file_url = (
    'https://fms.alliancegenome.org/download/INTERACTION-GEN_WB.tsv.gz'
)
# Download location of the WormBase-to-Entrez ID mapping file.
wormbase_entrez_mappings_file_url = (
    'https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Invertebrates/'
    'Caenorhabditis_elegans.gene_info.gz'
)

# Field names for the columns of the interaction files. An explanation of
# each column is available here:
# https://github.com/HUPO-PSI/miTab/blob/master/PSI-MITAB27Format.md
# The order matters: the names become the positional fields of _WormBaseRow.
columns = [
    'ids_interactor_a',
    'ids_interactor_b',
    'alt_ids_interactor_a',
    'alt_ids_interactor_b',
    'aliases_interactor_a',
    'aliases_interactor_b',
    'interaction_detection_methods',
    'publication_first_authors',
    'publication_identifiers',
    'taxid_interactor_a',
    'taxid_interactor_b',
    'interaction_types',
    'source_databases',
    'interaction_identifiers',
    'confidence_values',
    'expansion_methods',
    'biological_roles_interactor_a',
    'biological_roles_interactor_b',
    'experimental_roles_interactor_a',
    'experimental_roles_interactor_b',
    'types_interactor_a',
    'types_interactor_b',
    'xrefs_interactor_a',
    'xrefs_interactor_b',
    'interaction_xrefs',
    'annotations_interactor_a',
    'annotations_interactor_b',
    'interaction_annotations',
    'host_organisms',
    'interaction_parameters',
    'creation_date',
    'update_date',
    'checksums_interactor_a',
    'checksums_interactor_b',
    'interaction_checksums',
    'negative',
    'features_interactor_a',
    'features_interactor_b',
    'stoichiometries_interactor_a',
    'stoichiometries_interactor_b',
    'identification_method_participant_a',
    'identification_method_participant_b',
]

# Column names applied to the WormBase-to-Entrez mapping file when it is
# read (passed to pandas as ``names=``).
mapping_columns = [
    'tax_id',
    'GeneID',
    'Symbol',
    'LocusTag',
    'Synonyms',
    'dbXrefs',
    'chromosome',
    'map_location',
    'description',
    'type_of_gene',
    'Symbol_from_nomenclature_authority',
    'Full_name_from_nomenclature_authority',
    'Nomenclature_status',
    'Other_designations',
    'Modification_date',
    'Feature_type',
]

# Record type holding one interaction row, one field per entry of `columns`.
_WormBaseRow = namedtuple('WormBaseRow', columns)



# [docs]
def process_from_files(wormbase_gen_data_file, wormbase_mol_data_file,
                       wb_to_entrez_mappings_file):
    """Build a WormBaseProcessor from WormBase TSV files.

    Parameters
    ----------
    wormbase_gen_data_file : str
        Location of the tab-separated WormBase genetic interactions file.
    wormbase_mol_data_file : str
        Location of the tab-separated WormBase molecular interactions file.
    wb_to_entrez_mappings_file : str
        Location of the tab-separated WormBase-to-Entrez ID mapping file.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        Processor created from the rows of both interaction files and the
        ID mapping table.
    """
    # Parsing options shared by all three files: tab-delimited, '#' starts
    # a comment, and every value is kept as a string.
    tsv_options = {'sep': '\t', 'comment': '#', 'dtype': str}

    # NOTE(review): no header/names argument is given for the interaction
    # files, so pandas takes the first uncommented line as the header and
    # that line is not part of the returned rows -- confirm the files carry
    # an uncommented header line.
    genetic_rows = pd.read_csv(wormbase_gen_data_file,
                               **tsv_options).values.tolist()
    molecular_rows = pd.read_csv(wormbase_mol_data_file,
                                 **tsv_options).values.tolist()

    # The mapping file is read with explicit column names instead.
    id_mappings = pd.read_csv(wb_to_entrez_mappings_file,
                              names=mapping_columns, **tsv_options)

    return _processor_from_data(genetic_rows, molecular_rows, id_mappings)




# [docs]
def process_from_web():
    """Process WormBase interaction data from the web.

    The data is read from the module-level URLs ``wormbase_gen_file_url``,
    ``wormbase_mol_file_url`` and ``wormbase_entrez_mappings_file_url``.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor created from the downloaded interaction data and
        ID mappings.
    """
    # Delegate to process_from_files so that local and remote inputs are
    # always parsed with exactly the same options; the URLs are handed to
    # pandas.read_csv just like file paths.
    return process_from_files(wormbase_gen_file_url, wormbase_mol_file_url,
                              wormbase_entrez_mappings_file_url)



def _to_wormbase_row(row):
    """Convert one raw interaction row into a WormBaseRow namedtuple."""
    # A field that is exactly '-' is replaced with None.
    fields = [None if item == '-' else item for item in row]
    # Any fields beyond the known columns are dropped.
    return _WormBaseRow(*fields[:len(columns)])


def _processor_from_data(gen_iter, mol_iter, mappings_df):
    """Create a WormBaseProcessor from the interaction data and ID mappings.

    Parameters
    ----------
    gen_iter : iterable
        Iterable of rows in the genetic interactions data file.
    mol_iter : iterable
        Iterable of rows in the molecular interactions data file.
    mappings_df : pd.DataFrame
        DataFrame containing associated WormBase and Entrez IDs.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor constructed from the converted rows (genetic rows
        first, then molecular rows) and the mappings DataFrame.

    Raises
    ------
    TypeError
        If a row has fewer fields than there are entries in ``columns``.
    """
    # Walk both inputs in sequence rather than concatenating them with `+`,
    # so any pair of iterables is accepted and no combined copy is built.
    data = [_to_wormbase_row(row)
            for rows in (gen_iter, mol_iter)
            for row in rows]
    return WormBaseProcessor(data, mappings_df)