# Source code for indra.assemblers.index_card.assembler

import json
import logging
from indra.statements import *
from indra.literature import id_lookup
from indra.databases import hgnc_client, uniprot_client, chebi_client, \
    go_client

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Value written into the 'submitter' field of every assembled index card.
global_submitter = 'indra'


class IndexCardAssembler(object):
    """Assembler creating index cards from a set of INDRA Statements.

    Parameters
    ----------
    statements : list
        A list of INDRA statements to be assembled.
    pmc_override : Optional[str]
        A PMC ID to assign to the index card.

    Attributes
    ----------
    statements : list
        A list of INDRA statements to be assembled.
    cards : list
        The index card objects produced by the most recent call to
        make_model.
    pmc_override : Optional[str]
        If not None, this value is used as the 'pmc_id' of every card
        instead of looking one up from the Statement's evidence.
    """

    def __init__(self, statements=None, pmc_override=None):
        if statements is None:
            self.statements = []
        else:
            self.statements = statements
        self.cards = []
        self.pmc_override = pmc_override

    def add_statements(self, statements):
        """Add statements to the assembler.

        Parameters
        ----------
        statements : list[indra.statements.Statement]
            The list of Statements to add to the assembler.
        """
        self.statements.extend(statements)

    def make_model(self):
        """Assemble statements into index cards.

        The list of cards is rebuilt from scratch on every call, so calling
        this method repeatedly does not accumulate duplicate cards.

        Returns
        -------
        cards : list
            The assembled index card objects, one per Statement for which
            a card could be produced.
        """
        # Start from an empty list so repeated calls are idempotent
        self.cards = []
        for stmt in self.statements:
            card = self.assemble_one_card(stmt, self.pmc_override)
            if card is not None:
                self.cards.append(card)
        return self.cards

    @staticmethod
    def assemble_one_card(stmt, pmc_override=None):
        """Assemble a single Statement into an index card.

        Parameters
        ----------
        stmt : indra.statements.Statement
            The Statement to assemble.
        pmc_override : Optional[str]
            A PMC ID to assign to the card. If None, the PMC ID is looked
            up from the Statement's evidence.

        Returns
        -------
        card : IndexCard or None
            The assembled card, or None if the Statement type is not
            handled or the type-specific assembly returned None.
        """
        if isinstance(stmt, Modification):
            card = assemble_modification(stmt)
        elif isinstance(stmt, SelfModification):
            card = assemble_selfmodification(stmt)
        elif isinstance(stmt, Complex):
            card = assemble_complex(stmt)
        elif isinstance(stmt, Translocation):
            card = assemble_translocation(stmt)
        elif isinstance(stmt, RegulateActivity):
            card = assemble_regulate_activity(stmt)
        elif isinstance(stmt, RegulateAmount):
            card = assemble_regulate_amount(stmt)
        else:
            return None
        if card is not None:
            # Attach the metadata that is common to all card types
            card.card['meta'] = {'id': stmt.uuid, 'belief': stmt.belief}
            ev_info = get_evidence_info(stmt)
            card.card['interaction']['hypothesis_information'] = \
                ev_info['hypothesis']
            card.card['interaction']['context'] = ev_info['context']
            card.card['evidence'] = ev_info['text']
            card.card['submitter'] = global_submitter
            if pmc_override is not None:
                card.card['pmc_id'] = pmc_override
            else:
                card.card['pmc_id'] = get_pmc_id(stmt)
        return card

    def print_model(self):
        """Return the assembled cards as a JSON string.

        Returns
        -------
        cards_json : str
            The JSON string representing the assembled cards.
        """
        cards = [c.card for c in self.cards]
        # If there is only one card, print it as a single
        # card not as a list
        if len(cards) == 1:
            cards = cards[0]
        cards_json = json.dumps(cards, indent=1)
        return cards_json

    def save_model(self, file_name='index_cards.json'):
        """Save the assembled cards into a file.

        Parameters
        ----------
        file_name : Optional[str]
            The name of the file to save the cards into. Default:
            index_cards.json
        """
        with open(file_name, 'wt') as fh:
            fh.write(self.print_model())


class IndexCard(object):
    """Container for one index card, stored as a nested dict in `card`.

    A new instance holds an empty template: a top-level PMC ID and
    submitter plus an interaction with two blank participants.
    """

    def __init__(self):
        participant_keys = ('entity_type', 'entity_text', 'identifier')

        interaction = {}
        interaction['negative_information'] = False
        interaction['hypothesis_information'] = None
        interaction['interaction_type'] = None
        # Each participant gets its own dict so they can be edited separately
        interaction['participant_a'] = dict.fromkeys(participant_keys)
        interaction['participant_b'] = dict.fromkeys(participant_keys)

        template = dict.fromkeys(('pmc_id', 'submitter'))
        template['interaction'] = interaction
        self.card = template

    def get_string(self):
        """Return the card serialized as a JSON string."""
        serialized = json.dumps(self.card)
        return serialized


def assemble_complex(stmt):
    """Return an index card for a Complex statement.

    The card has a single participant of type 'complex' whose 'entities'
    list holds one participant entry per member of the statement.
    """
    card = IndexCard()
    interaction = card.card['interaction']
    interaction['interaction_type'] = 'complexes_with'
    # A complex card carries only one (composite) participant
    interaction.pop('participant_b', None)

    complex_part = interaction['participant_a']
    complex_part['entity_type'] = 'complex'
    # NOTE: fill out entity_text
    complex_part['entity_text'] = ['']
    complex_part.pop('identifier', None)
    complex_part['entities'] = [get_participant(member)
                                for member in stmt.members]
    return card


def assemble_regulate_activity(stmt):
    """Return an index card for a RegulateActivity statement, or None.

    The subject is the top-level participant_a and the top-level
    participant_b is an embedded interaction describing the object's
    activity. Only the 'kinase' and 'transcription' activities are
    represented; for any other activity None is returned.
    """
    card = IndexCard()
    top_level = card.card['interaction']
    top_level['interaction_type'] = \
        'increases' if stmt.is_activation else 'decreases'
    top_level['participant_a'] = get_participant(stmt.subj)

    # The regulated activity is expressed as a nested interaction
    embedded = {
        'negative_information': False,
        'participant_a': get_participant(stmt.obj),
        }
    activity = stmt.obj_activity
    if activity == 'kinase':
        # Kinase activity: the object phosphorylates a generic protein
        embedded['participant_b'] = get_generic('protein')
        embedded['interaction_type'] = 'adds_modification'
        embedded['modifications'] = [{
            'feature_type': 'modification_feature',
            'modification_type': 'phosphorylation',
            }]
    elif activity == 'transcription':
        # Transcriptional activity: the object increases a generic gene
        embedded['participant_b'] = get_generic('gene')
        embedded['interaction_type'] = 'increases'
    else:
        return None
    top_level['participant_b'] = embedded
    return card


def assemble_regulate_amount(stmt):
    """Return an index card for a RegulateAmount statement.

    The interaction type is 'increases' for an IncreaseAmount statement
    and 'decreases' otherwise; subject and object become participant_a
    and participant_b respectively.
    """
    card = IndexCard()
    interaction = card.card['interaction']
    interaction['interaction_type'] = \
        'increases' if isinstance(stmt, IncreaseAmount) else 'decreases'
    interaction['participant_a'] = get_participant(stmt.subj)
    interaction['participant_b'] = get_participant(stmt.obj)
    return card


def assemble_modification(stmt):
    """Return an index card for a Modification statement.

    If the statement is direct, or has no enzyme, the card is a single
    modification interaction between the enzyme (participant_a) and the
    substrate (participant_b). Otherwise the card says that the enzyme
    'increases' an embedded interaction in which a generic protein
    modifies the substrate.
    """
    card = IndexCard()

    mod_type = modclass_to_modtype[stmt.__class__]
    is_removal = isinstance(stmt, RemoveModification)
    if is_removal:
        # Removals are described in terms of the modification being removed
        mod_type = modtype_to_inverse[mod_type]

    mod_feature = {
        'feature_type': 'modification_feature',
        'modification_type': mod_type,
        }
    if stmt.position is not None:
        mod_feature['location'] = int(stmt.position)
    if stmt.residue is not None:
        mod_feature['aa_code'] = stmt.residue

    interaction = {
        'negative_information': False,
        'interaction_type': ('removes_modification' if is_removal
                             else 'adds_modification'),
        'modifications': [mod_feature],
        }

    # Direct statement (or no enzyme): one flat interaction
    if _get_is_direct(stmt) or stmt.enz is None:
        interaction['participant_a'] = get_participant(stmt.enz)
        interaction['participant_b'] = get_participant(stmt.sub)
        card.card['interaction'] = interaction
        return card

    # Indirect statement, generating an index card of the form:
    # ENZ increases (GENERIC <modification> SUB)
    interaction['participant_a'] = get_participant(None)
    interaction['participant_b'] = get_participant(stmt.sub)
    top_level = card.card['interaction']
    top_level['interaction_type'] = 'increases'
    top_level['negative_information'] = False
    top_level['participant_a'] = get_participant(stmt.enz)
    top_level['participant_b'] = interaction
    return card


def assemble_selfmodification(stmt):
    """Return an index card for a SelfModification statement, or None.

    Only statements whose class name ends in 'phosphorylation' are
    handled; for any other class None is returned. The enzyme is used
    as both participant_a and participant_b.
    """
    card = IndexCard()

    class_name = stmt.__class__.__name__.lower()
    if not class_name.endswith('phosphorylation'):
        return None

    mod_feature = {
        'feature_type': 'modification_feature',
        'modification_type': 'phosphorylation',
        }
    if stmt.position is not None:
        mod_feature['location'] = int(stmt.position)
    if stmt.residue is not None:
        mod_feature['aa_code'] = stmt.residue

    # NOTE(review): when the statement is indirect and has an enzyme, the
    # card is returned with its blank template interaction untouched --
    # confirm whether that is intended.
    if _get_is_direct(stmt) or stmt.enz is None:
        card.card['interaction'] = {
            'negative_information': False,
            'interaction_type': 'adds_modification',
            'modifications': [mod_feature],
            'participant_a': get_participant(stmt.enz),
            'participant_b': get_participant(stmt.enz),
            }

    return card


def assemble_translocation(stmt):
    """Return an index card for a Translocation statement, or None.

    None is returned when the statement has no to_location. Location
    names are stored together with the ID obtained from
    go_client.get_go_id_from_label_or_synonym. participant_a is a generic
    protein and participant_b is the translocated agent.
    """
    # Index cards don't allow missing to_location
    destination = stmt.to_location
    if destination is None:
        return None

    card = IndexCard()
    interaction = {
        'negative_information': False,
        'interaction_type': 'translocates',
        }
    # The origin is optional
    if stmt.from_location is not None:
        interaction['from_location_text'] = stmt.from_location
        interaction['from_location_id'] = \
            go_client.get_go_id_from_label_or_synonym(stmt.from_location)
    interaction['to_location_text'] = destination
    interaction['to_location_id'] = \
        go_client.get_go_id_from_label_or_synonym(destination)
    interaction['participant_a'] = get_participant(None)
    interaction['participant_b'] = get_participant(stmt.agent)

    card.card['interaction'] = interaction
    return card


def get_generic(entity_type='protein'):
    """Return a placeholder participant dict of the given entity type.

    The participant has a single empty entity text and the identifier
    'GENERIC'. A new dict is created on every call.
    """
    generic = {}
    generic['entity_text'] = ['']
    generic['entity_type'] = entity_type
    generic['identifier'] = 'GENERIC'
    return generic


def _get_participant_features(agent):
    """Return the (features, not_features) lists describing an Agent.

    Bound conditions and modification conditions go into `features` when
    they hold (is_bound / is_modified) and into `not_features` otherwise.
    Mutations are always added to `features`.
    """
    features = []
    not_features = []
    # Binding features
    for bc in agent.bound_conditions:
        feature = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bc.agent.name],
                'identifier': ''
                }
            }
        (features if bc.is_bound else not_features).append(feature)
    # Modification features
    for mc in agent.mods:
        feature = {
            'feature_type': 'modification_feature',
            'modification_type': mc.mod_type.lower(),
            }
        if mc.position is not None:
            feature['location'] = int(mc.position)
        if mc.residue is not None:
            feature['aa_code'] = mc.residue
        (features if mc.is_modified else not_features).append(feature)
    # Mutation features
    for mut in agent.mutations:
        feature = {'feature_type': 'mutation_feature'}
        if mut.residue_from is not None:
            feature['from_aa'] = mut.residue_from
        if mut.residue_to is not None:
            feature['to_aa'] = mut.residue_to
        if mut.position is not None:
            feature['location'] = int(mut.position)
        features.append(feature)
    return features, not_features


def get_participant(agent):
    """Return the index card participant dict for an Agent.

    Parameters
    ----------
    agent : indra.statements.Agent or None
        The Agent to represent. None is represented as a generic protein.

    Returns
    -------
    participant : dict
        A dict with 'entity_text' and 'entity_type' and, depending on the
        available grounding, an 'identifier' (UniProt, PubChem or empty)
        or, for a PFAM-DEF grounded family, an 'entities' list of member
        proteins. 'features' and 'not_features' are added when the Agent
        has bound conditions, modifications or mutations.
    """
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')
    # The Agent is not missing
    text_name = agent.db_refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {}
    participant['entity_text'] = [text_name]
    hgnc_id = agent.db_refs.get('HGNC')
    uniprot_id = agent.db_refs.get('UP')
    chebi_id = agent.db_refs.get('CHEBI')
    pfam_def_ids = agent.db_refs.get('PFAM-DEF')
    # If HGNC grounding is available, that is the first choice. When the
    # HGNC lookup yields nothing, keep any UP grounding the Agent already
    # has rather than discarding it.
    if hgnc_id:
        mapped_uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if mapped_uniprot_id:
            uniprot_id = mapped_uniprot_id
    if uniprot_id:
        uniprot_mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        for entry in pfam_def_ids.split('|'):
            # Split on the first colon only, so that an entry with no
            # colon or with a colon inside the ID does not raise
            dbname, _, dbid = entry.partition(':')
            # TODO: handle non-uniprot protein IDs here
            if dbname != 'UP' or not dbid:
                continue
            uniprot_mnemonic = str(uniprot_client.get_mnemonic(dbid))
            gene_name = uniprot_client.get_gene_name(dbid)
            if gene_name is None:
                gene_name = ""
            entity_dict = {}
            entity_dict['entity_text'] = [gene_name]
            entity_dict['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
            entity_dict['entity_type'] = 'protein'
            participant['entities'].append(entity_dict)
    else:
        # No usable grounding: ungrounded protein
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    features, not_features = _get_participant_features(agent)
    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant


def get_pmc_id(stmt):
    """Return a PMC ID for a Statement based on its Evidence PMIDs.

    The Evidences are checked in order and the first PMC ID that can be
    looked up is returned, so an Evidence without a PMC ID cannot erase
    one found for another Evidence. Evidences without a PMID are skipped.

    Parameters
    ----------
    stmt : indra.statements.Statement
        The Statement whose evidence is used for the lookup.

    Returns
    -------
    pmc_id : str
        The PMC ID, prefixed with 'PMC', or an empty string if none of
        the Evidences yields one.
    """
    for ev in stmt.evidence:
        # Nothing to look up without a PMID
        if not ev.pmid:
            continue
        # Tolerate a lookup result that is empty or lacks a 'pmcid' entry
        ids = id_lookup(ev.pmid, 'pmid') or {}
        pmc_id = ids.get('pmcid')
        if not pmc_id:
            continue
        pmc_id = str(pmc_id)
        if not pmc_id.startswith('PMC'):
            pmc_id = 'PMC' + pmc_id
        return pmc_id
    return ''


def get_evidence_info(stmt):
    """Collect text, context and hypothesis information from evidence.

    Both the Statement's `evidence` and, if the attribute exists, its
    `partial_evidence` are processed; texts from the latter are prefixed
    with 'PARTIAL: '. The three returned lists are parallel, with one
    entry per Evidence.

    Returns
    -------
    dict
        A dict with keys 'text' (list of str), 'context' (list of species
        dicts or None) and 'hypothesis' (list of the 'hypothesis'
        epistemics values).
    """
    texts, species_contexts, hypothesis_flags = [], [], []
    evidence_groups = (
        ('', stmt.evidence),
        ('PARTIAL: ', getattr(stmt, 'partial_evidence', [])),
        )
    for prefix, group in evidence_groups:
        for ev in group:
            # Evidence text, or a placeholder naming the source entry
            if ev.text is not None:
                texts.append('%s%s' % (prefix, ev.text))
            else:
                texts.append(
                    '%sEvidence text not available for %s entry: %s' %
                    (prefix, ev.source_api, ev.source_id))

            # Species context, if any
            species = ev.context.species if ev.context else None
            if species:
                species_info = {'name': species.name}
                if species.db_refs is None:
                    species_info['taxonomy'] = None
                else:
                    species_info['taxonomy'] = \
                        species.db_refs.get('TAXONOMY')
                species_contexts.append(species_info)
            else:
                species_contexts.append(None)

            hypothesis_flags.append(ev.epistemics.get('hypothesis'))
    return {'text': texts,
            'context': species_contexts,
            'hypothesis': hypothesis_flags}


def _get_is_direct(stmt):
    """Return True if the statement should be treated as a direct
    interaction.

    If any evidence is explicitly marked direct, the result is True.
    Otherwise the result is False only if some evidence is explicitly
    marked indirect; with no information either way the default is True.
    """
    saw_indirect = False
    for ev in stmt.evidence:
        direct_flag = ev.epistemics.get('direct')
        if direct_flag is True:
            # A single piece of direct evidence is enough
            return True
        if direct_flag is False:
            # Remember that at least one evidence says indirect
            saw_indirect = True
    return not saw_indirect