Source code for indra.sources.reach.processor

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import logging
import os
import re
import objectpath
from indra.statements import *
from indra.util import read_unicode_csv
from indra.databases import hgnc_client
import indra.databases.uniprot_client as up_client

# Module-level logger; the code in this file reports warnings and debug
# messages through it under the 'reach' logger name.
logger = logging.getLogger('reach')

class ReachProcessor(object):
    """The ReachProcessor extracts INDRA Statements from REACH parser output.

    Parameters
    ----------
    json_dict : dict
        A JSON dictionary containing the REACH extractions.
    pmid : Optional[str]
        The PubMed ID associated with the extractions. This can be passed
        in case the PMID cannot be determined from the extractions alone.

    Attributes
    ----------
    tree : objectpath.Tree
        The objectpath Tree object representing the extractions.
    statements : list[indra.statements.Statement]
        A list of INDRA Statements that were extracted by the processor.
    citation : str
        The PubMed ID associated with the extractions.
    all_events : dict[str, list[str]]
        The frame IDs of all events in the REACH extraction, keyed by
        event type.
    """

    # Section names that are returned unchanged by _get_section
    _section_list = ['title', 'abstract', 'introduction', 'background',
                     'results', 'methods', 'discussion', 'conclusion',
                     'supplementary', 'figure']

    # Non-standard section IDs that come up in practice, mapped to the
    # standard name they stand for
    _section_aliases = {
        'article-title': 'title',
        'subjects|methods': 'methods',
        'methods|subjects': 'methods',
        'conclusions': 'conclusion',
        'intro': 'introduction',
    }

    # xref namespaces whose ID is copied into db_refs under a fixed key
    # without any further lookup
    _simple_xref_namespaces = {
        'chebi': 'CHEBI',
        'pubchem': 'PUBCHEM',
        'go': 'GO',
        'mesh': 'MESH',
        'hmdb': 'HMDB',
    }

    def __init__(self, json_dict, pmid=None):
        self.tree = objectpath.Tree(json_dict)
        self.statements = []
        self.citation = pmid
        # Without an explicit PMID, fall back on the document ID that is
        # stored in the extractions themselves
        if pmid is None:
            self.citation = self.tree.execute("$.events.object_meta.doc_id")
        self.get_all_events()

    def print_event_statistics(self):
        """Print the number of events in the REACH output by type."""
        logger.info('All events by type')
        logger.info('-------------------')
        for k, v in self.all_events.items():
            logger.info('%s, %s', k, len(v))
        logger.info('-------------------')

    def get_all_events(self):
        """Gather all event IDs in the REACH output by type.

        These IDs are stored in the self.all_events dict.
        """
        self.all_events = {}
        events = self.tree.execute("$.events.frames")
        if events is None:
            return
        for e in events:
            event_type = e.get('type')
            self.all_events.setdefault(event_type, []).append(
                e.get('frame_id'))

    def print_regulations(self):
        """Print the subtype and the arguments of each regulation event."""
        qstr = "$.events.frames[(@.type is 'regulation')]"
        res = self.tree.execute(qstr)
        if res is None:
            return
        for r in res:
            print(r['subtype'])
            for a in r['arguments']:
                print(a['type'], '/', a['argument-type'], ':', a['text'])

    def get_modifications(self):
        """Extract Modification INDRA Statements.

        One Statement is created for each regulation event whose first
        argument is a non-negated protein-modification event.
        """
        qstr = "$.events.frames[(@.type is 'protein-modification')]"
        res = self.tree.execute(qstr)
        if res is None:
            return
        for r in res:
            modification_type = r.get('subtype')
            epistemics = self._get_epistemics(r)
            if epistemics.get('negative'):
                continue
            context = self._get_context(r)

            # The theme is the modified entity, the site is free text
            # describing the modified residue and/or position
            site = None
            theme = None
            for a in r['arguments']:
                arg_type = self._get_arg_type(a)
                if arg_type == 'theme':
                    theme = a['arg']
                elif arg_type == 'site':
                    site = a['text']
            theme_agent = self._get_agent_from_entity(theme)
            if site is not None:
                residue, pos = self._parse_site_text(site)
            else:
                residue, pos = None, None

            # Here ModStmt is a sub-class of Modification
            ModStmt = modtype_to_modclass.get(modification_type)
            for reg in self._get_regulations(r['frame_id']):
                if ModStmt is None:
                    logger.warning('Unhandled modification type: %s',
                                   modification_type)
                    continue
                controller_agent = self._find_controller_agent(reg)
                ev = Evidence(source_api='reach', text=reg['verbose-text'],
                              annotations=context, pmid=self.citation,
                              epistemics=epistemics)
                # Handle this special case here because only
                # enzyme argument is needed
                if modification_type == 'autophosphorylation':
                    args = [theme_agent, residue, pos, ev]
                else:
                    args = [controller_agent, theme_agent, residue, pos, ev]
                self.statements.append(ModStmt(*args))

    def get_regulate_amounts(self):
        """Extract RegulateAmount INDRA Statements.

        Transcription and amount events are collected, and one Statement
        is created for each regulation event acting on them.
        """
        all_res = []
        for event_type in ('transcription', 'amount'):
            qstr = "$.events.frames[(@.type is '%s')]" % event_type
            res = self.tree.execute(qstr)
            if res is not None:
                all_res += list(res)

        for r in all_res:
            epistemics = self._get_epistemics(r)
            if epistemics.get('negative'):
                continue
            context = self._get_context(r)
            theme = None
            for a in r['arguments']:
                if self._get_arg_type(a) == 'theme':
                    theme = a['arg']
                    break
            if theme is None:
                continue
            theme_agent = self._get_agent_from_entity(theme)
            for reg in self._get_regulations(r['frame_id']):
                controller_agent = self._find_controller_agent(reg)
                ev = Evidence(source_api='reach', text=reg['verbose-text'],
                              annotations=context, pmid=self.citation,
                              epistemics=epistemics)
                # Any subtype other than positive-regulation is treated
                # as a decrease
                if reg.get('subtype') == 'positive-regulation':
                    st = IncreaseAmount(controller_agent, theme_agent, ev)
                else:
                    st = DecreaseAmount(controller_agent, theme_agent, ev)
                self.statements.append(st)

    def get_complexes(self):
        """Extract INDRA Complex Statements."""
        qstr = "$.events.frames[@.type is 'complex-assembly']"
        res = self.tree.execute(qstr)
        if res is None:
            return
        for r in res:
            epistemics = self._get_epistemics(r)
            if epistemics.get('negative'):
                continue
            context = self._get_context(r)
            members = [self._get_agent_from_entity(a['arg'])
                       for a in r['arguments']]
            ev = Evidence(source_api='reach', text=r['verbose-text'],
                          annotations=context, pmid=self.citation,
                          epistemics=epistemics)
            self.statements.append(Complex(members, ev))

    def get_activation(self):
        """Extract INDRA Activation Statements.

        Events for which no controller or no controlled agent can be
        determined are skipped.
        """
        qstr = "$.events.frames[@.type is 'activation']"
        res = self.tree.execute(qstr)
        if res is None:
            return
        for r in res:
            epistemics = self._get_epistemics(r)
            if epistemics.get('negative'):
                continue
            # Both agents are reset for every event so that a frame with a
            # missing argument cannot pick up the previous event's agent
            controller_agent = None
            controlled_agent = None
            for a in r['arguments']:
                arg_type = self._get_arg_type(a)
                if arg_type == 'controller':
                    controller = a.get('arg')
                    if controller is not None:
                        controller_agent = \
                            self._get_agent_from_entity(controller)
                    # When the controller is not a simple entity
                    elif a['argument-type'] == 'complex':
                        controller_agent = \
                            self._get_complex_controller_agent(a)
                elif arg_type == 'controlled':
                    controlled_agent = self._get_agent_from_entity(a['arg'])
            if controller_agent is None or controlled_agent is None:
                logger.debug('Skipping activation %s with missing agent',
                             r.get('frame_id'))
                continue
            ev = Evidence(source_api='reach', text=r['verbose-text'],
                          pmid=self.citation, annotations=context_or_none(
                              self, r) if False else self._get_context(r),
                          epistemics=epistemics)
            if r['subtype'] == 'positive-activation':
                st = Activation(controller_agent, controlled_agent,
                                evidence=ev)
            else:
                st = Inhibition(controller_agent, controlled_agent,
                                evidence=ev)
            self.statements.append(st)

    def get_translocation(self):
        """Extract INDRA Translocation Statements.

        Events without a resolvable theme agent are skipped.
        """
        qstr = "$.events.frames[@.type is 'translocation']"
        res = self.tree.execute(qstr)
        if res is None:
            return
        for r in res:
            epistemics = self._get_epistemics(r)
            if epistemics.get('negative'):
                continue
            # Reset per event; see the equivalent note in get_activation
            agent = None
            from_location = None
            to_location = None
            for a in r['arguments']:
                arg_type = self._get_arg_type(a)
                if arg_type == 'theme':
                    agent = self._get_agent_from_entity(a['arg'])
                elif arg_type == 'source':
                    from_location = self._get_location_by_id(a['arg'])
                elif arg_type == 'destination':
                    to_location = self._get_location_by_id(a['arg'])
            if agent is None:
                logger.debug('Skipping translocation %s with missing agent',
                             r.get('frame_id'))
                continue
            ev = Evidence(source_api='reach', text=r['verbose-text'],
                          pmid=self.citation,
                          annotations=self._get_context(r),
                          epistemics=epistemics)
            st = Translocation(agent, from_location, to_location,
                               evidence=ev)
            self.statements.append(st)

    def _get_frame(self, collection, frame_id):
        """Return the first frame with the given ID, or None if not found.

        Parameters
        ----------
        collection : str
            The top-level key to search in, e.g. 'entities' or 'sentences'.
        frame_id : str
            The frame_id to look for.
        """
        qstr = "$.%s.frames[(@.frame_id is '%s')]" % (collection, frame_id)
        res = self.tree.execute(qstr)
        if not res:
            return None
        return next(iter(res), None)

    def _get_regulations(self, frame_id):
        """Return the regulation events whose first argument is frame_id."""
        qstr = ("$.events.frames[(@.type is 'regulation') and "
                "(@.arguments[0].arg is '%s')]") % frame_id
        res = self.tree.execute(qstr)
        return [] if res is None else list(res)

    def _find_controller_agent(self, reg):
        """Return the Agent for the first controller of a regulation event.

        Controller arguments without an 'arg' entry are passed over.
        None is returned if no controller can be resolved.
        """
        for a in reg['arguments']:
            if self._get_arg_type(a) != 'controller':
                continue
            controller = a.get('arg')
            if controller is not None:
                return self._get_agent_from_entity(controller)
        return None

    def _get_complex_controller_agent(self, arg):
        """Return an Agent for a controller argument that is a complex.

        The first member becomes the Agent and the remaining members are
        attached to it as bound conditions. None is returned if the first
        member cannot be resolved.
        """
        controllers = list((arg.get('args') or {}).values())
        if not controllers:
            return None
        controller_agent = self._get_agent_from_entity(controllers[0])
        if controller_agent is None:
            return None
        bound_agents = [self._get_agent_from_entity(c)
                        for c in controllers[1:]]
        controller_agent.bound_conditions = \
            [BoundCondition(ba, True) for ba in bound_agents
             if ba is not None]
        return controller_agent

    def _get_location_by_id(self, loc_id):
        """Return a valid location for an entity frame ID, or None."""
        entity_term = self._get_frame('entities', loc_id)
        if entity_term is None:
            logger.debug(' %s is not an entity' % loc_id)
            return None
        name = entity_term.get('text')
        go_id = None
        for xr in entity_term['xrefs']:
            if xr['namespace'] == 'go':
                go_id = xr['id']
        # Try to get valid location based on GO id
        if go_id is not None:
            try:
                return get_valid_location(go_id)
            except InvalidLocationError:
                pass
        # See if the raw name is a valid cellular component
        if name is not None:
            try:
                return get_valid_location(name.lower())
            except InvalidLocationError:
                pass
        return None

    def _get_agent_from_entity(self, entity_id):
        """Return an Agent built from an entity frame ID, or None.

        The Agent's name, db_refs, modification conditions and mutation
        conditions are derived from the entity frame's text, xrefs and
        modifications.
        """
        entity_term = self._get_frame('entities', entity_id)
        if entity_term is None:
            logger.debug(' %s is not an entity' % entity_id)
            return None
        # This is the default name, which can be overwritten
        # below for specific database entries
        agent_name = self._get_valid_name(entity_term['text'])
        db_refs = {}
        for xr in entity_term['xrefs']:
            ns = xr['namespace']
            if ns == 'uniprot':
                up_id = xr['id']
                db_refs['UP'] = up_id
                # Look up official names in UniProt
                gene_name = up_client.get_gene_name(up_id)
                if gene_name is not None:
                    agent_name = self._get_valid_name(gene_name)
                    # If the gene name corresponds to an HGNC ID, add it
                    # to the db_refs
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        db_refs['HGNC'] = hgnc_id
            elif ns == 'hgnc':
                hgnc_id = xr['id']
                db_refs['HGNC'] = hgnc_id
                # Look up the standard gene symbol and set as name
                hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
                if hgnc_name:
                    agent_name = hgnc_name
                # Look up the corresponding uniprot id
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
            elif ns == 'pfam':
                be_id = bioentities_map.get(('PF', xr['id']))
                if be_id:
                    db_refs['BE'] = be_id
                    agent_name = be_id
                db_refs['PF'] = xr['id']
            elif ns == 'interpro':
                be_id = bioentities_map.get(('IP', xr['id']))
                if be_id:
                    db_refs['BE'] = be_id
                    agent_name = be_id
                db_refs['IP'] = xr['id']
            elif ns in self._simple_xref_namespaces:
                db_refs[self._simple_xref_namespaces[ns]] = xr['id']
            elif ns == 'simple_chemical':
                if xr['id'].startswith('HMDB'):
                    db_refs['HMDB'] = xr['id']
            elif ns == 'be':
                db_refs['BE'] = xr['id']
                agent_name = db_refs['BE']
            # These name spaces are ignored
            elif ns in ['uaz']:
                pass
            else:
                logger.warning('Unhandled xref namespace: %s' % ns)

        db_refs['TEXT'] = entity_term['text']

        mods = []
        muts = []
        for m in entity_term.get('modifications') or []:
            if m['type'].lower() == 'mutation':
                # Evidence is usually something like "V600E", or a generic
                # word such as "mutant"; both are handled by
                # _parse_mutation
                mut = self._parse_mutation(m.get('evidence'))
                if mut is not None:
                    muts.append(mut)
            else:
                mc = self._get_mod_condition(m)
                if mc is not None:
                    mods.append(mc)

        agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts)
        return agent

    def _get_mod_condition(self, mod_term):
        """Return a ModCondition for an entity modification term, or None."""
        site = mod_term.get('site')
        if site is not None:
            mod_res, mod_pos = self._parse_site_text(site)
        else:
            mod_res, mod_pos = None, None
        mod_type_str = mod_term['type'].lower()
        mod_state = agent_mod_map.get(mod_type_str)
        if mod_state is None:
            logger.warning('Unhandled entity modification type: %s' %
                           mod_type_str)
            return None
        return ModCondition(mod_state[0], residue=mod_res, position=mod_pos,
                            is_modified=mod_state[1])

    def _get_context(self, frame_term):
        """Return a dict of context annotations for an event frame.

        The dict always contains 'found_by'. The species, cell type, cell
        line, location, tissue and organ entries are only added if the
        frame refers to a context that can be resolved.
        """
        context = {'found_by': frame_term['found_by']}
        try:
            context_id = frame_term['context']
        except KeyError:
            return context
        # For backwards compatibility with older versions
        # of REACH
        if isinstance(context_id, dict):
            species = context_id.get('Species')
            cell_type = context_id.get('CellType')
            cell_line = None
            location = None
            tissue = None
            organ = None
        else:
            # An empty context list or a dangling context ID is treated
            # the same way as a missing context
            if not context_id:
                return context
            context_frame = self._get_frame('entities', context_id[0])
            if context_frame is None:
                return context
            facets = context_frame['facets']
            cell_line = facets.get('cell-line')
            cell_type = facets.get('cell-type')
            species = facets.get('organism')
            location = facets.get('location')
            tissue = facets.get('tissue_type')
            organ = facets.get('organ')
        context['species'] = species
        context['cell_type'] = cell_type
        context['cell_line'] = cell_line
        context['location'] = location
        context['tissue'] = tissue
        context['organ'] = organ
        return context

    def _get_epistemics(self, event):
        """Return a dict of epistemic annotations for an event frame."""
        epistemics = {}
        # Check whether information is negative
        if event.get('is_negated') is True:
            epistemics['negative'] = True
        # Check if it is a hypothesis
        if event.get('is_hypothesis') is True:
            epistemics['hypothesis'] = True
        # Check if it is direct
        if 'is_direct' in event:
            epistemics['direct'] = event['is_direct']
        # Get the section of the paper it comes from
        epistemics['section_type'] = self._get_section(event)
        return epistemics

    def _get_section(self, event):
        """Get the section of the paper that the event is from.

        Returns None if the event has no sentence, if the sentence or its
        passage cannot be found, or if the section ID is not recognized.
        """
        section = None
        sentence_id = event.get('sentence')
        if sentence_id:
            sentence_frame = self._get_frame('sentences', sentence_id)
            passage_id = None
            if sentence_frame is not None:
                passage_id = sentence_frame.get('passage')
            if passage_id:
                passage_frame = self._get_frame('sentences', passage_id)
                if passage_frame is not None:
                    section = passage_frame.get('section-id')
        if not section:
            return None
        # If the section is in the standard list, return as is
        if section in self._section_list:
            return section
        # Next, handle a few special cases that come up in practice
        if section.startswith('fig'):
            return 'figure'
        if section.startswith('supm'):
            return 'supplementary'
        return self._section_aliases.get(section)

    @staticmethod
    def _get_arg_type(arg):
        """Return the type of the argument with backwards compatibility."""
        if arg.get('argument_label') is not None:
            return arg.get('argument_label')
        return arg.get('type')

    @staticmethod
    def _get_valid_name(txt):
        """Produce valid agent name from string."""
        name = ''.join(ch if ch.isalnum() else '_' for ch in txt)
        # A name may not start with a digit, so prefix it
        if name and name[0].isdigit():
            name = 'p' + name
        return name

    @staticmethod
    def _parse_mutation(s):
        """Return a MutCondition parsed from a mutation string, or None.

        Strings like "V600E" give a fully specified condition, generic
        words like "mutant" give a condition with no position or residues.
        None is returned for a missing or unrecognized string.
        """
        if s is None:
            logger.warning('Unhandled mutation string: %s' % s)
            return None
        m = re.match(r'([A-Z])([0-9]+)([A-Z])', s.upper())
        if m is not None:
            parts = [str(g) for g in m.groups()]
            residue_from = get_valid_residue(parts[0])
            residue_to = get_valid_residue(parts[2])
            position = parts[1]
            return MutCondition(position, residue_from, residue_to)
        if s.lower() in ('mutation', 'mutations', 'mutant', 'mutants',
                         'mutational'):
            return MutCondition(None, None, None)
        logger.warning('Unhandled mutation string: %s' % s)
        return None

    @staticmethod
    def _parse_site_text(s):
        """Return a (residue, position) tuple parsed from site text.

        Either element is None if it cannot be determined; (None, None)
        is returned if the text matches none of the site patterns.
        """
        text = s.upper()
        # Residue followed by position
        for p in (_site_pattern1, _site_pattern2, _site_pattern3):
            m = re.match(p, text)
            if m is not None:
                residue = get_valid_residue(m.groups()[0])
                site = m.groups()[1]
                return residue, site
        # Position followed by residue
        m = re.match(_site_pattern4, text)
        if m is not None:
            site = m.groups()[0]
            residue = m.groups()[1]
            return residue, site
        # Residue only
        for p in (_site_pattern5, _site_pattern6, _site_pattern7):
            m = re.match(p, text)
            if m is not None:
                residue = get_valid_residue(m.groups()[0])
                return residue, None
        # Position only
        m = re.match(_site_pattern8, text)
        if m is not None:
            return None, m.groups()[0]
        logger.warning('Could not parse site text %s' % s)
        return None, None
# Regular expression fragments built once from the amino acid table. The
# keys are placed inside a character class, so each one is treated as a
# single character (presumably the one-letter residue codes).
_aa_letters = ''.join(list(amino_acids.keys()))
_aa_short_names = '|'.join([v['short_name'].upper()
                            for v in amino_acids.values()])
_aa_indra_names = '|'.join([v['indra_name'].upper()
                            for v in amino_acids.values()])

# Site text patterns, tried in this order by ReachProcessor._parse_site_text
# against the upper-cased text.
# 1-3: a residue (key, short name or INDRA name) followed by a position
_site_pattern1 = '([' + _aa_letters + '])[-]?([0-9]+)$'
_site_pattern2 = '(' + _aa_short_names + ')[- ]?([0-9]+)$'
_site_pattern3 = '(' + _aa_indra_names + ')[^0-9]*([0-9]+)$'
# 4: a position followed by a residue key
_site_pattern4 = '([0-9]+)[ ]?([' + _aa_letters + '])$'
# 5-7: a residue without a position
_site_pattern5 = '^([' + _aa_letters + '])$'
_site_pattern6 = '^(' + _aa_short_names + ')$'
_site_pattern7 = '.*(' + _aa_indra_names + ').*'
# 8: a position without a residue
_site_pattern8 = '([0-9]+)$'

# Maps a lower-cased REACH entity modification type to a tuple of the
# modification type and whether the entity is in the modified state.
# Subtypes that exist but we don't handle: hydrolysis
agent_mod_map = {
    'phosphorylation': ('phosphorylation', True),
    'phosphorylated': ('phosphorylation', True),
    'dephosphorylation': ('phosphorylation', False),
    'acetylation': ('acetylation', True),
    'deacetylation': ('acetylation', False),
    'ubiquitination': ('ubiquitination', True),
    'deubiquitination': ('ubiquitination', False),
    'hydroxylation': ('hydroxylation', True),
    'dehydroxylation': ('hydroxylation', False),
    'sumoylation': ('sumoylation', True),
    'desumoylation': ('sumoylation', False),
    'glycosylation': ('glycosylation', True),
    'deglycosylation': ('glycosylation', False),
    'farnesylation': ('farnesylation', True),
    'defarnesylation': ('farnesylation', False),
    'ribosylation': ('ribosylation', True),
    'deribosylation': ('ribosylation', False),
    'methylation': ('methylation', True),
    'demethylation': ('methylation', False),
    'unknown': ('modification', True),
}


def _read_bioentities_map():
    """Load the mapping from external IDs to Bioentities IDs.

    The mapping is read from the tab-separated file
    resources/bioentities_map.tsv, located relative to this module.

    Returns
    -------
    dict
        Maps a (source namespace, source ID) tuple, taken from the first
        two columns of each row, to the ID in the third column.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(here, '../../resources/bioentities_map.tsv')
    bioentities_map = {}
    for row in read_unicode_csv(fname, delimiter='\t'):
        source_ns, source_id, be_id = row[0], row[1], row[2]
        bioentities_map[(source_ns, source_id)] = be_id
    return bioentities_map


# Loaded once at import time; used by ReachProcessor to map pfam and
# interpro xrefs to Bioentities IDs
bioentities_map = _read_bioentities_map()