# Source code for indra.sources.wormbase.api

__all__ = ['process_from_files', 'process_from_web']


from .processor import WormBaseProcessor
from collections import namedtuple
import pandas as pd

# Download location of the C. elegans molecular interactions data file
wormbase_mol_file_url = (
    'https://fms.alliancegenome.org/download/INTERACTION-MOL_WB.tsv.gz'
)
# Download location of the C. elegans genetic interactions data file
wormbase_gen_file_url = (
    'https://fms.alliancegenome.org/download/INTERACTION-GEN_WB.tsv.gz'
)
# Download location of the WormBase-to-Entrez ID mapping file
wormbase_entrez_mappings_file_url = (
    'https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Invertebrates/'
    'Caenorhabditis_elegans.gene_info.gz'
)

# Field names, in file order, for one row of an interaction file.
# An explanation of each column is available here:
# https://github.com/HUPO-PSI/miTab/blob/master/PSI-MITAB27Format.md
columns = [
    'ids_interactor_a',
    'ids_interactor_b',
    'alt_ids_interactor_a',
    'alt_ids_interactor_b',
    'aliases_interactor_a',
    'aliases_interactor_b',
    'interaction_detection_methods',
    'publication_first_authors',
    'publication_identifiers',
    'taxid_interactor_a',
    'taxid_interactor_b',
    'interaction_types',
    'source_databases',
    'interaction_identifiers',
    'confidence_values',
    'expansion_methods',
    'biological_roles_interactor_a',
    'biological_roles_interactor_b',
    'experimental_roles_interactor_a',
    'experimental_roles_interactor_b',
    'types_interactor_a',
    'types_interactor_b',
    'xrefs_interactor_a',
    'xrefs_interactor_b',
    'interaction_xrefs',
    'annotations_interactor_a',
    'annotations_interactor_b',
    'interaction_annotations',
    'host_organisms',
    'interaction_parameters',
    'creation_date',
    'update_date',
    'checksums_interactor_a',
    'checksums_interactor_b',
    'interaction_checksums',
    'negative',
    'features_interactor_a',
    'features_interactor_b',
    'stoichiometries_interactor_a',
    'stoichiometries_interactor_b',
    'identification_method_participant_a',
    'identification_method_participant_b',
]

# Column names applied to the WormBase-to-Entrez mapping file when it is
# read (passed to pandas as ``names=``).
mapping_columns = [
    'tax_id',
    'GeneID',
    'Symbol',
    'LocusTag',
    'Synonyms',
    'dbXrefs',
    'chromosome',
    'map_location',
    'description',
    'type_of_gene',
    'Symbol_from_nomenclature_authority',
    'Full_name_from_nomenclature_authority',
    'Nomenclature_status',
    'Other_designations',
    'Modification_date',
    'Feature_type',
]

# Record type holding one interaction row, with one field per entry of
# ``columns``.
_WormBaseRow = namedtuple('WormBaseRow', columns)


def process_from_files(wormbase_gen_data_file, wormbase_mol_data_file,
                       wb_to_entrez_mappings_file):
    """Build a WormBaseProcessor from WormBase data files.

    The genetic interactions file is read first, then the molecular
    interactions file, then the ID mapping file; the parsed contents are
    handed to ``_processor_from_data``.

    Parameters
    ----------
    wormbase_gen_data_file : str
        Path to the WormBase genetic interactions data file in TSV format.
    wormbase_mol_data_file : str
        Path to the WormBase molecular interactions data file in TSV format.
    wb_to_entrez_mappings_file : str
        Path to the WormBase-to-Entrez ID mapping file in TSV format.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor containing Statements extracted from the
        interactions data.
    """
    # Parser settings common to all three inputs: tab-separated, '#' starts
    # a comment, and every value is kept as a string.
    tsv_options = {'sep': '\t', 'comment': '#', 'dtype': str}

    # NOTE(review): no ``names``/``header`` is given for the interaction
    # files, so pandas uses the first non-comment line as column names and
    # that line is not part of the returned rows. Confirm the files carry
    # an uncommented header line; otherwise the first data row is dropped.
    interaction_rows = []
    for path in (wormbase_gen_data_file, wormbase_mol_data_file):
        frame = pd.read_csv(path, **tsv_options)
        interaction_rows.append(frame.values.tolist())
    genetic_rows, molecular_rows = interaction_rows

    # The mapping file gets explicit column names, so no line of it is
    # interpreted as a header.
    id_mappings = pd.read_csv(wb_to_entrez_mappings_file,
                              names=mapping_columns, **tsv_options)

    return _processor_from_data(genetic_rows, molecular_rows, id_mappings)
def process_from_web():
    """Build a WormBaseProcessor from data downloaded from the web.

    The genetic interactions, molecular interactions and ID mapping files
    are fetched from ``wormbase_gen_file_url``, ``wormbase_mol_file_url``
    and ``wormbase_entrez_mappings_file_url`` respectively.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor containing Statements extracted from the
        interactions data.
    """
    # The file-based entry point parses its three inputs with pandas, which
    # accepts URLs as well as local paths, so the download locations can be
    # passed straight through.
    return process_from_files(
        wormbase_gen_data_file=wormbase_gen_file_url,
        wormbase_mol_data_file=wormbase_mol_file_url,
        wb_to_entrez_mappings_file=wormbase_entrez_mappings_file_url,
    )
def _processor_from_data(gen_iter, mol_iter, mappings_df):
    """Wrap interaction rows and ID mappings in a WormBaseProcessor.

    Parameters
    ----------
    gen_iter : list
        Rows of the genetic interactions data file.
    mol_iter : list
        Rows of the molecular interactions data file.
    mappings_df : pd.DataFrame
        DataFrame containing associated WormBase and Entrez IDs.

    Returns
    -------
    indra.sources.wormbase.WormBaseProcessor
        WormBaseProcessor containing Statements extracted from the
        interactions data.

    Notes
    -----
    Genetic rows come first in the combined data, followed by molecular
    rows. Values beyond the number of entries in ``columns`` are discarded;
    a row with fewer values cannot fill every ``WormBaseRow`` field and
    makes the namedtuple constructor raise ``TypeError``.
    """
    n_fields = len(columns)
    records = []
    for raw_row in gen_iter + mol_iter:
        # A bare '-' value is replaced by None before the row is truncated
        # to the known fields (presumably the file's empty-field marker).
        cleaned = [None if value == '-' else value for value in raw_row]
        records.append(_WormBaseRow(*cleaned[:n_fields]))
    return WormBaseProcessor(records, mappings_df)