# Source code for indra.sources.gnbr.processor

"""This module contains the processor for GNBR. There are several, each
corresponding to different kinds of interactions."""
import re
import itertools as it
from typing import List
from copy import deepcopy
import pandas as pd
from indra.statements import *
from indra.databases import mesh_client
from indra.ontology.bio import bio_ontology
from indra.ontology.standardize import get_standard_agent


# Lookup tables from relation-type column names to the Statement classes
# imported from indra.statements, one table per supported pair of
# (first entity type, second entity type).
#
# GnbrProcessor.extract_stmts iterates over the items of the selected table
# in insertion order; for each key it keeps the rows of df1 whose column of
# that name is positive (and, when indicator_only is set, whose
# '<key>.ind' column equals 1) and builds Statements of the mapped class.
# NOTE(review): the keys look like GNBR theme codes -- confirm their meaning
# against the GNBR data documentation before changing any mapping.

# Used when both the first and the second entity are genes.
gene_gene_stmt_mappings = {
    'V+': Activation,
    'E+': IncreaseAmount,
    'Q':  IncreaseAmount,
    'H':  Complex
}

# Used when the first entity is a chemical and the second is a gene.
chem_gene_stmt_mappings = {
    'A+': Activation,
    'A-': Inhibition,
    'N':  Inhibition,
    'B':  Complex,
    'E-': DecreaseAmount
}

# Used when the first entity is a gene and the second is a disease.
gene_disease_stmt_mappings = {
    'Te': Inhibition,
    'G':  Activation
}

# Used by extract_stmts for every other combination of entity types
# (it is the fall-through branch there, not only chemical-disease).
chem_disease_stmt_mappings = {
    'T':  Inhibition,
    'C':  Inhibition,
    'Pr': Inhibition,
    'Pa': Inhibition
}


# Pre-compiled, fully anchored patterns used by the get_std_* functions to
# recognize the identifier formats found in the database ID columns.

# 'CHEBI:' followed by digits (note the variable is spelled 'cheby').
cheby_pattern = re.compile(r'^CHEBI:(\d+)$')

# MeSH identifiers: the letter C or D followed by digits, with and without a
# 'MESH:' prefix. The prefixed form captures the bare ID as group 1.
mesh_pattern = re.compile(r'^MESH:([CD]\d+)$')
mesh_no_prefix_pattern = re.compile(r'^[CD]\d+$')

# Gene identifiers: digits only, or digits followed by '(Tax:<digits>)' in
# which case group 1 is the gene ID and group 2 the number after 'Tax:'.
entrez_pattern = re.compile(r'^(\d+)$')
entrez_with_tax_pattern = re.compile(r'^(\d+)\(Tax:(\d+)\)$')

# OMIM identifiers: digits with or without an 'OMIM:' prefix. The unprefixed
# form is the same regular expression as entrez_pattern, so a digits-only
# string can only be told apart by which function is examining it.
omim_pattern = re.compile(r'^OMIM:(\d+)$')
omim_no_prefix_pattern = re.compile(r'^(\d+)$')


class GnbrProcessor:
    """A processor for interactions in the GNBR dataset.

    Parameters
    ----------
    df1 :
        Dataframe of dependency paths and themes.
    df2 :
        Dataframe of dependency paths and agents. It must have exactly 14
        columns. NOTE: this frame is modified in place: its columns are
        renamed and its ``path`` column is lower-cased.
    first_type :
        The type of the first entity in the data frame.
    second_type :
        The type of the second entity in the data frame.
    indicator_only :
        If True (the default), only rows of ``df1`` whose
        ``'<relation>.ind'`` column equals 1 are used for a given relation
        type, in addition to requiring a positive relation score.

    Attributes
    ----------
    statements :
        The list of Statements collected by :py:meth:`extract_stmts`.
    """

    def __init__(self, df1: pd.DataFrame, df2: pd.DataFrame,
                 first_type: str, second_type: str,
                 indicator_only: bool = True) -> None:
        self.df1 = df1
        self.df2 = df2
        self.df2.columns = ['id', 'sentence_num', 'nm_1_form', 'nm_1_loc',
                            'nm_2_form', 'nm_2_loc', 'nm_1_raw', 'nm_2_raw',
                            'nm_1_dbid', 'nm_2_dbid', '1_type', '2_type',
                            'path', 'sentence']
        # Lower-case the paths so that they can be used as the join key
        # against the paths of df1.
        self.df2['path'] = df2['path'].str.lower()
        self.first_type = first_type
        self.second_type = second_type
        self.indicator_only = indicator_only
        self.statements = []

    def extract_stmts(self):
        """Extend the statements list with mappings.

        The relation-type table is chosen from the pair of entity types;
        any pair other than gene-gene, chemical-gene or gene-disease falls
        through to the chemical-disease table. Each call appends to
        ``self.statements``.
        """
        if self.first_type == 'gene' and self.second_type == 'gene':
            statement_mappings = gene_gene_stmt_mappings
        elif self.first_type == 'chemical' and self.second_type == 'gene':
            statement_mappings = chem_gene_stmt_mappings
        elif self.first_type == 'gene' and self.second_type == 'disease':
            statement_mappings = gene_disease_stmt_mappings
        else:
            statement_mappings = chem_disease_stmt_mappings
        for rel_type, stmt_type in statement_mappings.items():
            # Keep rows with a positive score for this relation type and,
            # optionally, only those flagged by the indicator column.
            constraint = (self.df1[rel_type] > 0)
            if self.indicator_only:
                constraint &= (self.df1['%s.ind' % rel_type] == 1)
            df_part = self.df1[constraint]
            self.statements.extend(self._extract_stmts_by_class(df_part,
                                                                stmt_type))

    def _extract_stmts_by_class(self, df, stmt_class):
        """Make a given class of Statements from a subset of the dataframe.

        Parameters
        ----------
        df :
            Filtered dataframe to one particular relationship theme.
        stmt_class :
            Statement type matched to the type of the filtered dataframe.

        Yields
        ------
        stmt :
            Statements produced from the dataframes.
        """
        # An inner join is used on purpose. With the default left join, a
        # path of df that is missing from df2 produces a row whose df2
        # fields are all NaN; such a row can never yield a Statement (the
        # first agent list is empty), but its presence up-casts integer
        # columns such as 'id' to float for every row, which corrupts the
        # PMIDs ('12345.0'). The inner join keeps the order of df's rows.
        df_joint = df.join(self.df2.set_index('path'), on='path',
                           how='inner')
        for _, row in df_joint.iterrows():
            if self.first_type == 'gene':
                first_agents = get_std_gene(row['nm_1_raw'],
                                            row['nm_1_dbid'])
            else:
                first_agents = get_std_chemical(row['nm_1_raw'],
                                                row['nm_1_dbid'])
            if self.second_type == 'gene':
                second_agents = get_std_gene(row['nm_2_raw'],
                                             row['nm_2_dbid'])
            else:
                second_agents = get_std_disease(row['nm_2_raw'],
                                                row['nm_2_dbid'])
            evidence = get_evidence(row)
            # One identifier field can list several entities, so one
            # Statement is made for each pairing, each with its own copy
            # of the evidence.
            for first_agent, second_agent in it.product(first_agents,
                                                        second_agents):
                if stmt_class == Complex:
                    stmt = stmt_class([first_agent, second_agent],
                                      evidence=deepcopy(evidence))
                else:
                    stmt = stmt_class(first_agent, second_agent,
                                      evidence=deepcopy(evidence))
                yield stmt
def get_std_gene(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize gene names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        Identifier string of the agent. Several identifiers may be given
        separated by ``;``; each one is either all digits or digits
        followed by ``(Tax:<digits>)``.

    Returns
    -------
    :
        A list of Agent objects: empty if both arguments are missing, a
        single un-grounded Agent if only the name is given, and otherwise
        one standardized Agent per identifier.

    Raises
    ------
    ValueError
        If an identifier matches neither expected format.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)

    # Nothing to build an agent from.
    if not has_id and not has_text:
        return []

    # The raw text, when available, is kept under TEXT in every agent.
    base_refs = {'TEXT': raw_string} if has_text else {}

    # Only a name is known: return it as is, without any grounding.
    if not has_id:
        return [Agent(raw_string, db_refs=base_refs)]

    standardized = []
    for identifier in db_id.split(';'):
        if entrez_pattern.match(identifier):
            entrez_id = identifier
        else:
            with_tax = entrez_with_tax_pattern.match(identifier)
            if with_tax is None:
                raise ValueError('Unexpected gene identifier: %s'
                                 % identifier)
            # Keep only the gene ID, dropping the '(Tax:...)' suffix.
            entrez_id = with_tax.group(1)
        refs = dict(base_refs, EGID=entrez_id)
        # Fall back to the unprocessed identifier when there is no name.
        agent_name = raw_string if has_text else identifier
        standardized.append(get_standard_agent(agent_name, refs))
    return standardized
def get_std_chemical(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize chemical names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        Identifier string of the agent. Several identifiers may be given
        separated by ``|``; each one is either ``CHEBI:<digits>`` or a
        MeSH ID with or without a ``MESH:`` prefix.

    Returns
    -------
    :
        A list of Agent objects: empty if both arguments are missing, a
        single un-grounded Agent if only the name is given, and otherwise
        one standardized Agent per identifier, skipping MeSH IDs for which
        no MeSH name can be looked up.

    Raises
    ------
    ValueError
        If an identifier matches none of the expected formats.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)

    # Nothing to build an agent from.
    if not has_id and not has_text:
        return []

    # The raw text, when available, is kept under TEXT in every agent.
    base_refs = {'TEXT': raw_string} if has_text else {}

    # Only a name is known: return it as is, without any grounding.
    if not has_id:
        return [Agent(raw_string, db_refs=base_refs)]

    standardized = []
    for identifier in db_id.split('|'):
        if cheby_pattern.match(identifier):
            refs = dict(base_refs, CHEBI=identifier)
        else:
            if mesh_pattern.match(identifier):
                # Strip the leading 'MESH:' prefix.
                mesh_id = identifier[5:]
            elif mesh_no_prefix_pattern.match(identifier):
                mesh_id = identifier
            else:
                raise ValueError('Unexpected chemical identifier: %s'
                                 % identifier)
            # There are often non-existent MESH IDs here for some reason
            # that can be filtered out with this technique
            if not mesh_client.get_mesh_name(mesh_id, offline=True):
                continue
            refs = dict(base_refs, MESH=mesh_id)
        # Fall back to the unprocessed identifier when there is no name.
        agent_name = raw_string if has_text else identifier
        standardized.append(get_standard_agent(agent_name, refs))
    return standardized
def get_std_disease(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize disease names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        Identifier of the agent: an OMIM ID (all digits, or with an
        ``OMIM:`` prefix) or a MeSH ID (with or without a ``MESH:``
        prefix). An all-digit identifier is always treated as OMIM.

    Returns
    -------
    :
        A list of Agent objects: empty if both arguments are missing,
        otherwise a single standardized Agent.

    Raises
    ------
    ValueError
        If the identifier matches none of the expected formats.
    """
    # If neither a name nor a DB ID is given, we return empty. This matches
    # get_std_gene and get_std_chemical and avoids creating an Agent whose
    # name is NaN.
    if pd.isna(db_id) and pd.isna(raw_string):
        return []
    # We add TEXT to db_refs if there is a raw_string
    db_refs = {'TEXT': raw_string} if not pd.isna(raw_string) else {}
    # Without a raw_string the identifier itself serves as the name
    name = raw_string if not pd.isna(raw_string) else db_id

    if pd.isna(db_id):
        # Only a name is available; it is standardized without grounding
        pass
    elif omim_no_prefix_pattern.match(db_id):
        db_refs['OMIM'] = db_id
    elif omim_pattern.match(db_id):
        # Strip the leading 'OMIM:' prefix
        db_refs['OMIM'] = db_id[5:]
    elif mesh_no_prefix_pattern.match(db_id):
        db_refs['MESH'] = db_id
    elif mesh_pattern.match(db_id):
        # Strip the leading 'MESH:' prefix
        db_refs['MESH'] = db_id[5:]
    else:
        raise ValueError('Unexpected disease identifier: %s' % db_id)
    return [get_standard_agent(name, db_refs)]
def get_evidence(row: pd.Series) -> Evidence:
    """Return evidence for a Statement.

    Parameters
    ----------
    row :
        Currently investigated row of the dataframe. Its ``id`` entry is
        used as the PMID and its ``sentence`` entry as the evidence text.

    Returns
    -------
    :
        Evidence object with the source_api, the PMID and the original
        sentence. If the row has no usable ``id`` (missing, NaN, or
        otherwise falsy), the PMID is None and no PMID is put in the text
        references.
    """
    raw_id = row['id']
    if pd.isna(raw_id) or not raw_id:
        # NaN is truthy, so a missing ID has to be detected explicitly;
        # otherwise it would be stored as the string 'nan'.
        pmid = None
    elif isinstance(raw_id, float) and raw_id.is_integer():
        # Integer columns become float when a join introduces missing
        # values; render 12345.0 as '12345' rather than '12345.0'.
        pmid = str(int(raw_id))
    else:
        pmid = str(raw_id)
    # Only record a PMID text reference when there actually is a PMID.
    text_refs = {'PMID': pmid} if pmid else {}
    evidence = Evidence(source_api='gnbr', pmid=pmid, text=row['sentence'],
                        text_refs=text_refs)
    return evidence