# Source code for indra.sources.biogrid

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import re
import csv
import logging
import itertools
import requests
from io import BytesIO, StringIO
from zipfile import ZipFile
from collections import namedtuple
from indra.util import read_unicode_csv
from indra.statements import *
import indra.databases.hgnc_client as hgnc_client
import indra.databases.uniprot_client as up_client

logger = logging.getLogger(__name__)


# Default download location: the zipped tab2 export of BioGRID release 3.4.158.
biogrid_file_url = (
    'https://downloads.thebiogrid.org/Download/BioGRID/'
    'Release-Archive/BIOGRID-3.4.158/BIOGRID-ALL-3.4.158.tab2.zip'
)


# One record per line of a BioGRID .tab2 file. The field order below must
# match the column order of the file, since rows are unpacked positionally.
# Column descriptions are documented at:
# https://wiki.thebiogrid.org/doku.php/biogrid_tab_version_2.0
_BiogridRow = namedtuple(
    'BiogridRow',
    (
        'biogrid_int_id',
        'entrez_a',
        'entrez_b',
        'biogrid_a',
        'biogrid_b',
        'syst_name_a',
        'syst_name_b',
        'hgnc_a',
        'hgnc_b',
        'syn_a',
        'syn_b',
        'exp_system',
        'exp_system_type',
        'author',
        'pmid',
        'organism_a',
        'organism_b',
        'throughput',
        'score',
        'modification',
        'phenotypes',
        'qualifications',
        'tags',
        'source_db',
    ),
)


class BiogridProcessor(object):
    """Extracts INDRA Complex statements from Biogrid interaction data.

    All processing happens at construction time: the rows are read (from the
    given file, or downloaded if no file is given) and converted into
    statements which are collected in the `statements` attribute.

    Parameters
    ----------
    biogrid_file : str
        The file containing the Biogrid data in .tab2 format. If not provided,
        the BioGrid data is downloaded from the BioGrid website.
    physical_only : boolean
        If True, only physical interactions are included (e.g., genetic
        interactions are excluded). If False, all interactions are included.

    Attributes
    ----------
    statements : list[indra.statements.Complex]
        Extracted INDRA Complex statements.
    physical_only : boolean
        Indicates whether only physical interactions were included during
        statement processing.
    """
    def __init__(self, biogrid_file=None, physical_only=True):
        self.statements = []
        self.physical_only = physical_only

        # If a path to the file is included, process it, skipping the header
        if biogrid_file:
            rows = read_unicode_csv(biogrid_file, '\t', skiprows=1)
        # If no file is provided, download from web
        else:
            logger.info('No data file specified, downloading from BioGrid '
                        'at %s', biogrid_file_url)
            rows = _download_biogrid_data(biogrid_file_url)

        # Process the rows into Statements
        for row in rows:
            # A '-' entry marks a missing value in the data; normalize to None
            filt_row = [None if item == '-' else item for item in row]
            bg_row = _BiogridRow(*filt_row)
            # Filter out non-physical interactions if desired
            if self.physical_only and bg_row.exp_system_type != 'physical':
                continue
            # Ground agents
            agent_a = self._make_agent(bg_row.entrez_a, bg_row.syst_name_a)
            agent_b = self._make_agent(bg_row.entrez_b, bg_row.syst_name_b)
            # Skip interactions where either agent has neither an HGNC name
            # nor a systematic name to use as its name
            if agent_a is None or agent_b is None:
                continue
            # Get evidence; the full row is kept as annotations
            ev = Evidence(source_api='biogrid',
                          source_id=bg_row.biogrid_int_id,
                          pmid=bg_row.pmid,
                          text=None,
                          annotations=dict(bg_row._asdict()))
            # Make statement
            s = Complex([agent_a, agent_b], evidence=ev)
            self.statements.append(s)

    def _make_agent(self, entrez_id, text_id):
        """Make an Agent object, appropriately grounded.

        Parameters
        ----------
        entrez_id : str
            Entrez id number
        text_id : str
            A plain text systematic name, or None if not listed.

        Returns
        -------
        agent : indra.statements.Agent or None
            A grounded agent object, named by its HGNC symbol if one was
            found and by `text_id` otherwise. None if neither is available.
        """
        hgnc_name, db_refs = self._make_db_refs(entrez_id, text_id)
        if hgnc_name is not None:
            name = hgnc_name
        elif text_id is not None:
            name = text_id
        # Handle case where the name is None
        else:
            return None

        return Agent(name, db_refs=db_refs)

    def _make_db_refs(self, entrez_id, text_id):
        """Looks up the HGNC ID and name, as well as the Uniprot ID.

        Parameters
        ----------
        entrez_id : str
            Entrez gene ID.
        text_id : str or None
            A plain text systematic name, or None if not listed in the
            Biogrid data.

        Returns
        -------
        hgnc_name : str or None
            Official HGNC symbol for the gene, or None if no HGNC ID was
            found for the Entrez ID.
        db_refs : dict
            db_refs grounding dictionary, used when constructing the Agent
            object. May contain the keys 'TEXT', 'HGNC' and 'UP'.
        """
        db_refs = {}
        if text_id != '-' and text_id is not None:
            db_refs['TEXT'] = text_id

        hgnc_name = None
        hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
        # Only query the name and Uniprot ID when there is an HGNC ID to
        # query with; a None ID cannot resolve to anything
        if hgnc_id is not None:
            hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
            db_refs['HGNC'] = hgnc_id
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id is not None:
                db_refs['UP'] = up_id
        return (hgnc_name, db_refs)
def _download_biogrid_data(url):
    """Downloads zipped, tab-separated Biogrid data in .tab2 format.

    Parameters
    ----------
    url : str
        URL of the BioGrid zip file.

    Returns
    -------
    csv.reader
        A csv.reader object for iterating over the rows (header has already
        been skipped).

    Raises
    ------
    Exception
        If the HTTP status code of the response is not 200, or if the
        downloaded zip archive does not contain exactly one file.
    """
    # Fetch the URL the caller asked for, not the module-level default
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception('Unable to download Biogrid data: status code %s'
                        % res.status_code)
    # The archive is unpacked entirely in memory; the context manager makes
    # sure the ZipFile handle is released even if validation fails
    with ZipFile(BytesIO(res.content)) as zip_file:
        zip_info_list = zip_file.infolist()
        # There should be only one file in this zip archive
        if len(zip_info_list) != 1:
            raise Exception('There should be exactly one file in BioGrid zip '
                            'archive: %s' % str(zip_info_list))
        unzipped_bytes = zip_file.read(zip_info_list[0])  # Unzip the file
    # Make file-like obj
    biogrid_str = StringIO(unzipped_bytes.decode('utf8'))
    csv_reader = csv.reader(biogrid_str, delimiter='\t')  # Get csv reader
    # Skip the header; the default keeps an empty file from raising
    # StopIteration here, so the caller simply sees no rows
    next(csv_reader, None)
    return csv_reader