import re
import logging
from indra.databases import uniprot_client
from indra.statements import Agent, Complex, Evidence
from indra.ontology.standardize import standardize_agent_name
logger = logging.getLogger(__name__)


class VirhostnetProcessor:
    """A processor that takes a pandas DataFrame and extracts INDRA Statements.

    Parameters
    ----------
    df : pandas.DataFrame
        A pandas DataFrame representing VirHostNet interactions.
    up_web_fallback : Optional[bool]
        If True, the UniProt web service is used as a fallback to look up
        gene names for UniProt entries that could not be standardized
        locally. Default: False

    Attributes
    ----------
    df : pandas.DataFrame
        A pandas DataFrame representing VirHostNet interactions.
    statements : list[indra.statements.Statement]
        A list of INDRA Statements extracted from the DataFrame.
    """
    def __init__(self, df, up_web_fallback=False):
        self.df = df
        self.up_web_fallback = up_web_fallback
        self.statements = []

    def extract_statements(self):
        """Process each row of the DataFrame into INDRA Statements."""
        for _, row in self.df.iterrows():
            stmt = process_row(row, up_web_fallback=self.up_web_fallback)
            if stmt:
                self.statements.append(stmt)
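
# Example usage (a minimal sketch; assumes `df` holds VirHostNet interaction
# rows, e.g. as loaded by the accompanying api module):
#
# >>> vp = VirhostnetProcessor(df)
# >>> vp.extract_statements()
# >>> vp.statements  # a list of Complex Statements, one per processed row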


def process_row(row, up_web_fallback=False):
    """Process one row of the DataFrame into an INDRA Statement."""
    host_agent = get_agent_from_grounding(row['host_grounding'],
                                          up_web_fallback=up_web_fallback)
    vir_agent = get_agent_from_grounding(row['vir_grounding'],
                                         up_web_fallback=up_web_fallback)
    # There's a column that is always a - character
    assert row['dash'] == '-', row['dash']
    exp_method_id, exp_method_name = parse_psi_mi(row['exp_method'])
    int_type_id, int_type_name = parse_psi_mi(row['int_type'])
    assert row['host_tax'].startswith('taxid:'), row['host_tax']
    _, host_tax = row['host_tax'].split(':')
    assert row['vir_tax'].startswith('taxid:'), row['vir_tax']
    _, vir_tax = row['vir_tax'].split(':')
    assert row['score'].startswith('virhostnet-miscore:'), row['score']
    _, score = row['score'].split(':')
    score = float(score)
    source_ids = parse_source_ids(row['source_id'])
    annotations = {
        'exp_method': {'id': exp_method_id, 'name': exp_method_name},
        'int_type': {'id': int_type_id, 'name': int_type_name},
        'host_tax': host_tax,
        'vir_tax': vir_tax,
        'score': score,
        **source_ids,
    }
    text_refs = parse_text_refs(row['publication'])
    ev = Evidence(source_api='virhostnet', annotations=annotations,
                  text_refs=text_refs, pmid=text_refs.get('PMID'),
                  source_id=source_ids.get('virhostnet-rid'))
    stmt = Complex([host_agent, vir_agent], evidence=[ev])
    return stmt
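
# For reference, these are the columns accessed above and the formats the
# code expects, as enforced by the asserts (values are illustrative):
#
#   host_grounding / vir_grounding : e.g. 'uniprotkb:<ID>' (see
#       get_agent_from_grounding below)
#   dash                           : always '-'
#   exp_method / int_type          : PSI-MI terms, e.g.
#       'psi-mi:"MI:0018"(two hybrid)'
#   host_tax / vir_tax             : 'taxid:<NCBI Taxonomy ID>'
#   score                          : 'virhostnet-miscore:<float>'
#   publication                    : 'pubmed:<PMID>' (see parse_text_refs)
#   source_id                      : two '|'-separated IDs (see
#       parse_source_ids)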


def get_agent_from_grounding(grounding, up_web_fallback=False):
    """Return an INDRA Agent based on a grounding annotation."""
    db_ns, db_id = grounding.split(':')
    # Assume UniProt, RefSeq, or DDBJ/EMBL/GenBank IDs
    assert db_ns in {'uniprotkb', 'refseq', 'ddbj/embl/genbank'}, db_ns
    if db_ns == 'uniprotkb':
        if '-' in db_id:
            up_id, feat_id = db_id.split('-')
            # Assume it's a feature ID
            assert feat_id.startswith('PRO'), feat_id
            db_refs = {'UP': up_id, 'UPPRO': feat_id}
        else:
            db_refs = {'UP': db_id}
    elif db_ns == 'refseq':
        db_refs = {'REFSEQ_PROT': db_id}
    else:
        db_refs = {'NCBIPROTEIN': db_id}
    agent = Agent(db_id, db_refs=db_refs)
    standardized = standardize_agent_name(agent)
    if up_web_fallback:
        # Handle special case of unreviewed UP entries: if local
        # standardization failed, look up the gene name via the
        # UniProt web service.
        if not standardized and 'UP' in db_refs:
            name = uniprot_client.get_gene_name(db_refs['UP'],
                                                web_fallback=True)
            if name:
                agent.name = name
    return agent
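
# For illustration, the grounding-to-db_refs mapping implemented above
# (accession numbers are examples only):
#
#   'uniprotkb:P0DTC2'                -> {'UP': 'P0DTC2'}
#   'uniprotkb:<UP ID>-PRO_<number>'  -> {'UP': <UP ID>, 'UPPRO': 'PRO_<number>'}
#   'refseq:<ID>'                     -> {'REFSEQ_PROT': <ID>}
#   'ddbj/embl/genbank:<ID>'          -> {'NCBIPROTEIN': <ID>}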


def parse_psi_mi(psi_mi_str):
    """Parse a PSI-MI annotation into an ID and name pair."""
    # Example: psi-mi:"MI:0018"(two hybrid)
    match = re.match(r'psi-mi:"(.+)"\((.+)\)', psi_mi_str)
    mi_id, name = match.groups()
    return mi_id, name
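
# For example, mirroring the annotation shown in the comment above:
#
# >>> parse_psi_mi('psi-mi:"MI:0018"(two hybrid)')
# ('MI:0018', 'two hybrid')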


def parse_text_refs(text_ref_str):
    """Parse a text reference annotation into a text_refs dict."""
    tr_ns, tr_id = text_ref_str.split(':')
    assert tr_ns == 'pubmed', text_ref_str
    if re.match(r'^\d+$', tr_id):
        return {'PMID': tr_id}
    else:
        # Handle entries where a DOI appears in the form
        # https(//doi.org/<doi>) instead of a numeric PubMed ID
        match = re.match(r'^https\(//doi.org/(.+)\)$', tr_id)
        if not match:
            logger.warning('Failed to parse text ref: %s' % text_ref_str)
            return {}
        doi = match.groups()[0]
        return {'DOI': doi}
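
# For example (the PMID shown is illustrative):
#
# >>> parse_text_refs('pubmed:12345678')
# {'PMID': '12345678'}
#
# An entry of the form 'pubmed:https(//doi.org/<doi>)' is returned as
# {'DOI': '<doi>'}; anything else is logged and yields an empty dict.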


def parse_source_ids(source_id_str):
    """Parse VirHostNet source id annotations into a dict."""
    ids = source_id_str.split('|')
    assert len(ids) == 2
    ids_dict = {entry.split(':')[0]: entry.split(':')[1]
                for entry in ids}
    return ids_dict
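
# For illustration (the second key name and both values are hypothetical;
# the function simply maps each prefix to its value):
#
# >>> parse_source_ids('virhostnet-rid:12345|virhostnet-nrid:67890')
# {'virhostnet-rid': '12345', 'virhostnet-nrid': '67890'}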