# Source code for indra.sources.ubibrowser.processor

from indra.statements import *
from indra.ontology.standardize import get_standard_agent


class UbiBrowserProcessor:
    """Processor for UbiBrowser data.

    Parameters
    ----------
    e3_df :
        Table of E3-substrate rows, iterated with ``iterrows()``
        (presumably a pandas.DataFrame). Rows are turned into
        Ubiquitination statements.
    dub_df :
        Table of DUB-substrate rows, iterated with ``iterrows()``
        (presumably a pandas.DataFrame). Rows are turned into
        Deubiquitination statements.

    Attributes
    ----------
    statements : list
        Statements collected by :py:meth:`extract_statements`; empty until
        that method is called.
    """
    def __init__(self, e3_df, dub_df):
        self.e3_df = e3_df
        self.dub_df = dub_df
        self.statements = []

    def extract_statements(self):
        """Process every row of both tables and collect the statements.

        Each row that :py:meth:`_process_row` does not skip contributes one
        statement, which is appended to ``self.statements``. Nothing is
        returned, and the list is not cleared first, so calling this method
        more than once appends the same statements again.
        """
        tables = (
            (self.e3_df, Ubiquitination, 'E3'),
            (self.dub_df, Deubiquitination, 'DUB'),
        )
        for df, stmt_type, subj_suffix in tables:
            for _, row in df.iterrows():
                stmt = self._process_row(row, stmt_type, subj_suffix)
                if stmt:
                    self.statements.append(stmt)

    @staticmethod
    def _process_row(row, stmt_type, subj_suffix):
        """Return a statement for a single table row, or None if skipped.

        Parameters
        ----------
        row :
            Mapping from column name to value. The columns read are
            ``SwissProt AC (<subj_suffix>)``, ``Gene Symbol (<subj_suffix>)``,
            ``SwissProt AC (Substrate)``, ``Gene Symbol (Substrate)``,
            ``SOURCE``, ``SOURCEID`` and ``SENTENCE``.
        stmt_type :
            Callable used to build the statement; it is invoked as
            ``stmt_type(subj_agent, obj_agent, evidence=[ev])``.
        subj_suffix : str
            Suffix of the subject columns, 'E3' or 'DUB' as passed by
            :py:meth:`extract_statements`.

        Returns
        -------
        :
            The constructed statement, or None if the subject or substrate
            accession is missing or the subject denotes a complex.
        """
        # NOTE(review): an earlier comment here said that the DUB table also
        # labels its subject columns "E3"; the code looks the columns up
        # with the suffix it is given, so confirm against the data files.
        subj_id = row[f'SwissProt AC ({subj_suffix})']
        # Some of the subjects are missing entirely ('-'), we skip these.
        # A non-string value (e.g. a NaN from an empty cell) is handled the
        # same way, since the '#' check below would raise a TypeError on it.
        if not isinstance(subj_id, str) or subj_id == '-':
            return None
        # There are some examples where a complex is implied (e.g.,
        # BMI1-RNF2), for simplicity we just ignore these
        if '#' in subj_id:
            return None
        # Some of the same corner cases apply to the substrate as well
        obj_id = row['SwissProt AC (Substrate)']
        if not isinstance(obj_id, str) or obj_id == '-':
            return None

        subj_agent = get_standard_agent(row[f'Gene Symbol ({subj_suffix})'],
                                        {'UP': subj_id})
        obj_agent = get_standard_agent(row['Gene Symbol (Substrate)'],
                                       {'UP': obj_id})

        # Only MEDLINE-sourced rows carry a PMID and a supporting sentence
        if row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT':
            # Note: we sometimes get int here
            pmid = str(row['SOURCEID'])
            text = row['SENTENCE']
        else:
            pmid = None
            text = None
        ev = Evidence(source_api='ubibrowser', pmid=pmid, text=text)
        return stmt_type(subj_agent, obj_agent, evidence=[ev])