Source code for indra.sources.biogrid

import re
import csv
import tqdm
import logging
import requests
from io import BytesIO, StringIO
from zipfile import ZipFile
from collections import namedtuple
from indra.util import read_unicode_csv
from indra.statements import Agent, Complex, Evidence
from indra.ontology.standardize import standardize_name_db_refs

logger = logging.getLogger(__name__)


biogrid_file_url = ('https://downloads.thebiogrid.org/Download/BioGRID/'
                    'Latest-Release/BIOGRID-ALL-LATEST.tab3.zip')


# The explanation for each column of the tsv file is here:
# https://wiki.thebiogrid.org/doku.php/biogrid_tab_version_3.0
columns = ['biogrid_int_id',
           'entrez_a', 'entrez_b',
           'biogrid_a', 'biogrid_b',
           'syst_name_a', 'syst_name_b',
           'symbol_a', 'symbol_b',
           'syn_a', 'syn_b',
           'exp_system', 'exp_system_type',
           'author', 'publication',
           'organism_a', 'organism_b',
           'throughput', 'score', 'modification',
           'qualifications', 'tags', 'source_db',
           'swissprot_a', 'trembl_a', 'refseq_a',
           'swissprot_b', 'trembl_b', 'refseq_b']
_BiogridRow = namedtuple('BiogridRow', columns)


[docs]class BiogridProcessor(object): """Extracts INDRA Complex statements from Biogrid interaction data. Parameters ---------- biogrid_file : str The file containing the Biogrid data in .tab2 format. If not provided, the BioGrid data is downloaded from the BioGrid website. physical_only : boolean If True, only physical interactions are included (e.g., genetic interactions are excluded). If False, all interactions are included). Attributes ---------- statements : list[indra.statements.Statements] Extracted INDRA Complex statements. physical_only : boolean Indicates whether only physical interactions were included during statement processing. """ def __init__(self, biogrid_file=None, physical_only=True): self.statements = [] self.physical_only = physical_only # If a path to the file is included, process it, skipping the header if biogrid_file: rows = read_unicode_csv(biogrid_file, '\t', skiprows=1) # If no file is provided, download from web else: logger.info('No data file specified, downloading from BioGrid ' 'at %s' % biogrid_file_url) rows = _download_biogrid_data(biogrid_file_url) # Process the rows into Statements for row in tqdm.tqdm(rows, desc='Processing BioGRID rows'): # There are some extra columns that we don't need to take and # thereby save space in annotations filt_row = [None if item == '-' else item for item in row][:len(columns)] bg_row = _BiogridRow(*filt_row) # Filter out non-physical interactions if desired if self.physical_only and bg_row.exp_system_type != 'physical': continue # Ground agents agent_a = self._make_agent(bg_row.symbol_a, bg_row.entrez_a, bg_row.swissprot_a, bg_row.trembl_a) agent_b = self._make_agent(bg_row.symbol_b, bg_row.entrez_b, bg_row.swissprot_b, bg_row.trembl_b) # Skip any agents with neither HGNC grounding or string name if agent_a is None or agent_b is None: continue # Get evidence pmid_match = re.match(r'PUBMED:(\d+)', bg_row.publication) doi_match = re.match(r'DOI:(.*)', bg_row.publication) text_refs = {} if pmid_match: text_refs['PMID'] = pmid_match.groups()[0] elif doi_match: text_refs['DOI'] = doi_match.groups()[0] ev = Evidence(source_api='biogrid', source_id=bg_row.biogrid_int_id, pmid=text_refs.get('PMID'), text_refs=text_refs, annotations=dict(bg_row._asdict())) # Make statement s = Complex([agent_a, agent_b], evidence=ev) self.statements.append(s) def _make_agent(self, symbol, entrez_id, swissprot_id, trembl_id): """Make an Agent object, appropriately grounded. Parameters ---------- entrez_id : str Entrez id number swissprot_id : str Swissprot (reviewed UniProt) ID. trembl_id : str Trembl (unreviewed UniProt) ID. symbol : str A plain text symbol, or None if not listed. Returns ------- agent : indra.statements.Agent A grounded agent object. """ db_refs = {} name = symbol if swissprot_id: if '|' not in swissprot_id: db_refs['UP'] = swissprot_id elif trembl_id: if '|' not in trembl_id: db_refs['UP'] = trembl_id if entrez_id: db_refs['EGID'] = entrez_id standard_name, db_refs = standardize_name_db_refs(db_refs) if standard_name: name = standard_name # At the time of writing this, the name was never None but # just in case if name is None: return None return Agent(name, db_refs=db_refs)
def _download_biogrid_data(url): """Downloads zipped, tab-separated Biogrid data in .tab2 format. Parameters: ----------- url : str URL of the BioGrid zip file. Returns ------- csv.reader A csv.reader object for iterating over the rows (header has already been skipped). """ res = requests.get(biogrid_file_url) if res.status_code != 200: raise Exception('Unable to download Biogrid data: status code %s' % res.status_code) zip_bytes = BytesIO(res.content) zip_file = ZipFile(zip_bytes) zip_info_list = zip_file.infolist() # There should be only one file in this zip archive if len(zip_info_list) != 1: raise Exception('There should be exactly zipfile in BioGrid zip ' 'archive: %s' % str(zip_info_list)) unzipped_bytes = zip_file.read(zip_info_list[0]) # Unzip the file biogrid_str = StringIO(unzipped_bytes.decode('utf8')) # Make file-like obj csv_reader = csv.reader(biogrid_str, delimiter='\t') # Get csv reader next(csv_reader) # Skip the header return csv_reader