import re
import csv
import tqdm
import logging
import requests
from io import BytesIO, StringIO
from zipfile import ZipFile
from collections import namedtuple
from indra.util import read_unicode_csv
from indra.statements import Agent, Complex, Evidence
from indra.ontology.standardize import standardize_name_db_refs
logger = logging.getLogger(__name__)

# Location of the zip archive holding the latest full BioGRID release
# (tab3 flavor).
biogrid_file_url = (
    'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/'
    'BIOGRID-ALL-LATEST.tab3.zip'
)

# Names assigned, in file order, to the leading fields of every data row.
# The meaning of each column of the tsv file is described at
# https://wiki.thebiogrid.org/doku.php/biogrid_tab_version_3.0
columns = [
    'biogrid_int_id',
    'entrez_a',
    'entrez_b',
    'biogrid_a',
    'biogrid_b',
    'syst_name_a',
    'syst_name_b',
    'symbol_a',
    'symbol_b',
    'syn_a',
    'syn_b',
    'exp_system',
    'exp_system_type',
    'author',
    'publication',
    'organism_a',
    'organism_b',
    'throughput',
    'score',
    'modification',
    'qualifications',
    'tags',
    'source_db',
    'swissprot_a',
    'trembl_a',
    'refseq_a',
    'swissprot_b',
    'trembl_b',
    'refseq_b',
]

# Lightweight record type giving attribute access to one row's fields.
_BiogridRow = namedtuple('BiogridRow', columns)
class BiogridProcessor(object):
    """Extracts INDRA Complex statements from Biogrid interaction data.

    Parameters
    ----------
    biogrid_file : str
        The file containing the Biogrid data in .tab3 format. If not provided,
        the BioGrid data is downloaded from the BioGrid website.
    physical_only : boolean
        If True, only physical interactions are included (e.g., genetic
        interactions are excluded). If False, all interactions are included.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        Extracted INDRA Complex statements.
    physical_only : boolean
        Indicates whether only physical interactions were included during
        statement processing.
    """
    def __init__(self, biogrid_file=None, physical_only=True):
        self.statements = []
        self.physical_only = physical_only

        # If a path to the file is included, process it, skipping the header
        if biogrid_file:
            rows = read_unicode_csv(biogrid_file, '\t', skiprows=1)
        # If no file is provided, download from web
        else:
            logger.info('No data file specified, downloading from BioGrid '
                        'at %s', biogrid_file_url)
            rows = _download_biogrid_data(biogrid_file_url)

        # Process the rows into Statements
        for row in tqdm.tqdm(rows, desc='Processing BioGRID rows'):
            # A blank line is parsed as an empty row; there is nothing to
            # extract from it and it cannot populate the row tuple.
            if not row:
                continue
            # There are some extra columns that we don't need to take and
            # thereby save space in annotations. A '-' marks a missing value
            # and is normalized to None.
            filt_row = [None if item == '-' else item
                        for item in row[:len(columns)]]
            bg_row = _BiogridRow(*filt_row)
            # Filter out non-physical interactions if desired
            if self.physical_only and bg_row.exp_system_type != 'physical':
                continue
            # Ground agents
            agent_a = self._make_agent(bg_row.symbol_a, bg_row.entrez_a,
                                       bg_row.swissprot_a, bg_row.trembl_a)
            agent_b = self._make_agent(bg_row.symbol_b, bg_row.entrez_b,
                                       bg_row.swissprot_b, bg_row.trembl_b)
            # Skip the row if either agent could not be given a name
            if agent_a is None or agent_b is None:
                continue
            # Get evidence
            text_refs = self._get_text_refs(bg_row.publication)
            ev = Evidence(source_api='biogrid',
                          source_id=bg_row.biogrid_int_id,
                          pmid=text_refs.get('PMID'),
                          text_refs=text_refs,
                          annotations=dict(bg_row._asdict()))
            # Make statement
            s = Complex([agent_a, agent_b], evidence=ev)
            self.statements.append(s)

    @staticmethod
    def _get_text_refs(publication):
        """Parse the publication field of a row into a text refs dict.

        Parameters
        ----------
        publication : str or None
            The publication field, e.g. 'PUBMED:12345' or 'DOI:10.1/xyz',
            or None if the field was empty in the source row.

        Returns
        -------
        dict
            A dict with a 'PMID' key if the field starts with a PubMed
            reference, otherwise a 'DOI' key if it starts with a DOI
            reference, otherwise an empty dict.
        """
        text_refs = {}
        # Missing publications are represented as None, which re.match
        # cannot handle.
        if not publication:
            return text_refs
        pmid_match = re.match(r'PUBMED:(\d+)', publication)
        if pmid_match:
            text_refs['PMID'] = pmid_match.group(1)
            return text_refs
        doi_match = re.match(r'DOI:(.*)', publication)
        if doi_match:
            text_refs['DOI'] = doi_match.group(1)
        return text_refs

    def _make_agent(self, symbol, entrez_id, swissprot_id, trembl_id):
        """Make an Agent object, appropriately grounded.

        Parameters
        ----------
        symbol : str
            A plain text symbol, or None if not listed.
        entrez_id : str
            Entrez id number
        swissprot_id : str
            Swissprot (reviewed UniProt) ID.
        trembl_id : str
            Trembl (unreviewed UniProt) ID.

        Returns
        -------
        agent : indra.statements.Agent or None
            A grounded agent object, or None if neither a standard name
            nor a symbol is available to name the agent.
        """
        db_refs = {}
        name = symbol
        # Entries containing '|' list multiple IDs and are not used for
        # grounding. Note that the Trembl ID is only considered when no
        # Swissprot ID is given at all.
        if swissprot_id:
            if '|' not in swissprot_id:
                db_refs['UP'] = swissprot_id
        elif trembl_id:
            if '|' not in trembl_id:
                db_refs['UP'] = trembl_id
        if entrez_id:
            db_refs['EGID'] = entrez_id
        standard_name, db_refs = standardize_name_db_refs(db_refs)
        if standard_name:
            name = standard_name

        # At the time of writing this, the name was never None but
        # just in case
        if name is None:
            return None

        return Agent(name, db_refs=db_refs)
def _download_biogrid_data(url):
    """Downloads zipped, tab-separated Biogrid data in .tab3 format.

    Parameters
    ----------
    url : str
        URL of the BioGrid zip file.

    Returns
    -------
    csv.reader
        A csv.reader object for iterating over the rows (header has already
        been skipped).

    Raises
    ------
    Exception
        If the server does not respond with status code 200, or if the
        downloaded archive does not contain exactly one file.
    """
    # Fetch from the URL that was passed in rather than the module default
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception('Unable to download Biogrid data: status code %s'
                        % res.status_code)
    # The context manager makes sure the archive is closed once its single
    # member has been read into memory
    with ZipFile(BytesIO(res.content)) as zip_file:
        zip_info_list = zip_file.infolist()
        # There should be only one file in this zip archive
        if len(zip_info_list) != 1:
            raise Exception('There should be exactly one file in BioGrid zip '
                            'archive: %s' % str(zip_info_list))
        unzipped_bytes = zip_file.read(zip_info_list[0])  # Unzip the file
    biogrid_str = StringIO(unzipped_bytes.decode('utf8'))  # File-like obj
    csv_reader = csv.reader(biogrid_str, delimiter='\t')  # Get csv reader
    # Skip the header; the default avoids StopIteration on an empty file
    next(csv_reader, None)
    return csv_reader