Source code for indra.sources.rlimsp.processor

import logging
import tqdm
from collections import Counter
from indra.statements.validate import assert_valid_statements
from indra.databases import hgnc_client, uniprot_client
from indra.statements import Agent, Phosphorylation, Autophosphorylation, \
    Evidence, BioContext, RefContext, get_valid_residue, \
    InvalidResidueError, MutCondition

logger = logging.getLogger(__name__)


class RlimspProcessor(object):
    """Convert RLIMS-P JSON into INDRA Statements."""

    def __init__(self, rlimsp_json, doc_id_type=None):
        self._json = rlimsp_json
        self.statements = []
        self.doc_id_type = doc_id_type
        self.processed_texts = []
        return

    def extract_statements(self):
        """Extract the statements from the json."""
        for p_info in tqdm.tqdm(self._json, desc='Processing RLIMS-P JSON'):
            para = RlimspParagraph(p_info, self.doc_id_type)
            if para._text not in self.processed_texts:
                self.processed_texts.append(para._text)
                stmts = para.get_statements()
                assert_valid_statements(stmts)
                self.statements.extend(stmts)
        return
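# A minimal usage sketch (illustrative only, not part of the module): load an
# RLIMS-P JSON output and run the processor over it. The file name
# 'rlimsp_output.json' and the 'pmid' doc_id_type are assumptions made for the
# example, not values required by RLIMS-P.
#
#     import json
#     with open('rlimsp_output.json', 'r') as fh:
#         rlimsp_json = json.load(fh)
#     rp = RlimspProcessor(rlimsp_json, doc_id_type='pmid')
#     rp.extract_statements()
#     stmts = rp.statements  # Phosphorylation/Autophosphorylation Statements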
class RlimspParagraph(object):
    """An object that represents a single RLIMS-P paragraph."""

    def __init__(self, p_info, doc_id_type):
        self._text = p_info['text']
        self._sentences = []
        self._sentence_starts = []
        for s in p_info['sentence']:
            start = s['charStart']
            stop = s['charEnd']
            self._sentences.append(self._text[start:stop])
            self._sentence_starts.append(start)
        if 'pmid' in p_info and 'pmcid' in p_info:
            self._text_refs = {n.upper(): p_info[n] for n in ['pmid', 'pmcid']
                               if p_info[n]}
        elif doc_id_type:
            self._text_refs = {doc_id_type.upper(): p_info['docId']}
        else:
            logger.info('Could not establish text refs for paragraph.')
            self._text_refs = {}
        self._relations = p_info['relation']
        self._entity_dict = p_info['entity']
        return

    def _get_agent(self, entity_id):
        """Convert the entity dictionary into an INDRA Agent."""
        if entity_id is None:
            return None
        entity_info = self._entity_dict.get(entity_id)
        if entity_info is None:
            logger.warning("Entity key did not resolve to entity.")
            return None
        return get_agent_from_entity_info(entity_info)

    def _get_site(self, site_id):
        def get_aa_code(residue_str):
            try:
                res = get_valid_residue(residue_str)
                return res
            except InvalidResidueError as e:
                logger.info('%s' % e)
                return None

        if site_id is None:
            return None, None, None
        site_info = self._entity_dict[site_id]
        site_text = site_info['attribute'][0]['value']
        site_parts = site_text.split('-')
        position = None
        if len(site_parts) == 2:
            residue_str = site_parts[0]
            residue = get_aa_code(residue_str)
            position = site_parts[1]
        else:
            residue = get_aa_code(site_text)
        coords = (site_info['charStart'], site_info['charEnd'])
        return residue, position, coords

    def _get_evidence(self, trigger_info, args, agent_coords, site_coords):
        """Get the evidence using the info in the trigger entity."""
        # Get the sentence index from the trigger word.
        s_idx_set = {self._entity_dict[eid]['sentenceIndex']
                     for eid in args.values()
                     if 'sentenceIndex' in self._entity_dict[eid]}
        if s_idx_set:
            i_min = min(s_idx_set)
            i_max = max(s_idx_set)
            text = '. '.join(self._sentences[i_min:(i_max + 1)]) + '.'
            s_start = self._sentence_starts[i_min]
            annotations = {
                'agents': {'coords': [_fix_coords(coords, s_start)
                                      for coords in agent_coords]},
                'trigger': {'coords': _fix_coords([trigger_info['charStart'],
                                                   trigger_info['charEnd']],
                                                  s_start),
                            'text': trigger_info['entityText']}
                }
        else:
            logger.info('Unable to get sentence index')
            annotations = {}
            text = None
        # Only annotate the site if a sentence start was found, since the
        # site coordinates are offset relative to that sentence.
        if site_coords and s_idx_set:
            annotations['site'] = {'coords': _fix_coords(site_coords, s_start)}

        return Evidence(text_refs=self._text_refs.copy(), text=text,
                        source_api='rlimsp',
                        pmid=self._text_refs.get('PMID'),
                        annotations=annotations)

    def get_statements(self):
        stmts = []
        for rel_key, rel_info in self._relations.items():
            # Turn the arguments into a dict.
            args = {e['role']: e['entity_duid'] for e in rel_info['argument']}
            entity_args = args.copy()

            # Remove some special cases.
            trigger_id = entity_args.pop('TRIGGER')
            trigger_info = self._entity_dict[trigger_id]
            site_id = entity_args.pop('SITE', None)

            # Get the entity agents keyed by role.
            entities = {role: self._get_agent(eid)
                        for role, eid in entity_args.items()}

            rel_type = rel_info['relationType']
            if rel_type == 'PHOSPHORYLATION':
                # Get the agents.
                enz, enz_coords = entities.get('KINASE', (None, None))
                sub, sub_coords = entities.get('SUBSTRATE', (None, None))
                if sub is None:
                    continue

                # Detect autophosphorylation: same enzyme and substrate with
                # an "auto" trigger word.
                trigger_text = trigger_info.get('entityText')
                if enz is not None and enz.name == sub.name and \
                        trigger_text and 'auto' in trigger_text:
                    is_autophos = True
                else:
                    is_autophos = False

                # Get the site.
                residue, position, site_coords = self._get_site(site_id)

                # Get the evidence.
                ev = self._get_evidence(trigger_info, args,
                                        [enz_coords, sub_coords],
                                        site_coords)

                # Turn taxonomy into context; the substrate TAX takes
                # precedence over the enzyme TAX.
                tax = None
                if enz and 'TAX' in enz.db_refs:
                    tax = enz.db_refs.pop('TAX')
                if sub and 'TAX' in sub.db_refs:
                    tax = sub.db_refs.pop('TAX')
                if tax is not None:
                    context = \
                        BioContext(species=RefContext(tax, {'TAXONOMY': tax}))
                    ev.context = context

                if is_autophos:
                    stmt = Autophosphorylation(sub, residue=residue,
                                               position=position,
                                               evidence=[ev])
                else:
                    stmt = Phosphorylation(enz, sub, residue=residue,
                                           position=position,
                                           evidence=[ev])
                stmts.append(stmt)
            else:
                logger.warning("Unhandled statement type: %s" % rel_type)

        return stmts
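# For reference, a hedged sketch of the paragraph-level JSON that
# RlimspParagraph consumes, inferred from the dictionary accesses above; the
# field values shown are made up for illustration:
#
#     p_info = {
#         'text': 'MEK phosphorylates ERK.',
#         'sentence': [{'charStart': 0, 'charEnd': 23}],
#         'pmid': '12345', 'pmcid': 'PMC67890',
#         'entity': {...},    # keyed by entity_duid; see
#                             # get_agent_from_entity_info below
#         'relation': {...},  # keyed by relation id, each with
#                             # 'relationType' and 'argument'
#     }
#     para = RlimspParagraph(p_info, doc_id_type=None)
#     stmts = para.get_statements()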
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict."""
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}
    entries = entity_info['entityId']
    if entries is None or entries == {'$undefined': True}:
        entries = []
    ref_counts = Counter([entry['source'] for entry in entries])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entries:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs.keys():
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                            (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            hgnc_id = uniprot_client.get_hgnc_id(id_dict['idString'])
            if hgnc_id:
                # Check to see if we have a conflict with an HGNC id
                # found from the Entrez id. If so, overwrite with this
                # one, in which we have greater faith.
                if 'HGNC' in refs.keys() and refs['HGNC'] != hgnc_id:
                    msg = ('Inferred HGNC:%s from UP:%s does not'
                           ' match HGNC:%s from EGID:%s') % \
                        (refs['HGNC'], refs['UP'], hgnc_id, refs['EGID'])
                    logger.info(msg)
                refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(id_dict['idString'])
                if gene_name is not None:
                    name = gene_name
        elif id_dict['source'] in ('Tax', 'NCBI'):
            # Note that TAX is non-standard but it's popped out later in the
            # extraction process.
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is.
        elif id_dict['source'] in ('MESH', 'OMIM'):
            if ';' in id_dict['idString']:
                refs[id_dict['source']] = id_dict['idString'].split(';')[0]
            else:
                refs[id_dict['source']] = id_dict['idString']
        # CTD is sometimes used for MESH chemical IDs but can also be just '-'.
        elif id_dict['source'] == 'CTD':
            if id_dict['idString'] != '-':
                refs['MESH'] = id_dict['idString']
        # Handle mutations.
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # Example, corresponding to Mpk1(Y268A):
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                except Exception:
                    logger.info('Could not process mutation %s'
                                % id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
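# A hedged example of calling get_agent_from_entity_info directly with a
# minimal entity_info dict; the identifiers and coordinates below are
# illustrative, not taken from a real RLIMS-P output:
#
#     entity_info = {
#         'entityText': 'MAPK1',
#         'entityId': [{'source': 'UniProt', 'idString': 'P28482'}],
#         'charStart': 19, 'charEnd': 24,
#     }
#     agent, coords = get_agent_from_entity_info(entity_info)
#     # agent.name and agent.db_refs are resolved via UniProt/HGNC lookups;
#     # coords == (19, 24)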
def _fix_coords(coords, offset):
    """Adjust the entity coordinates to the beginning of the sentence."""
    if coords is None:
        return None
    return tuple([n - offset for n in coords])
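# Example of the coordinate adjustment performed by _fix_coords (the values
# are illustrative and follow directly from the implementation above):
#
#     _fix_coords((120, 128), 100)  # -> (20, 28)
#     _fix_coords(None, 100)        # -> None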