Source code for indra.sources.biofactoid.processor

from copy import deepcopy
from collections import defaultdict
from indra.statements import *
from indra.util import flatten


class BioFactoidProcessor:
    """Processor which extracts INDRA Statements from BioFactoid JSON.

    Parameters
    ----------
    biofactoid_json : json
        BioFactoid JSON to process. It is iterated as a sequence of
        document dicts.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements extracted from the BioFactoid JSON.
    """

    # Target-group labels in the order they are tried. This is a tuple (not
    # a set) so the group picked is the same on every run, independent of
    # string hash randomization.
    _POLARITIES = ('positive', 'unsigned', 'negative')

    def __init__(self, biofactoid_json):
        self.biofactoid_json = biofactoid_json
        self.statements = []

    def extract_statements(self):
        """Process every document and append the results to `statements`."""
        for document in self.biofactoid_json:
            self.statements += self.process_document(document)

    def process_document(self, document):
        """Return the list of Statements extracted from one document.

        Parameters
        ----------
        document : dict
            A single BioFactoid document; its ``'elements'`` list holds both
            the participants and the interactions that reference them by id.

        Returns
        -------
        list[indra.statements.Statement]
            Statements built from the document's interactions, each carrying
            its own copy of the document-level evidence.
        """
        ev = self.get_evidence(document)
        elements = document['elements']
        stmts = []
        for interaction in self.find_interactions(elements):
            # Bucket participating agents by the entry's group label: None
            # for the source participant, a polarity label for the target.
            groups = defaultdict(list)
            for entry in interaction['entries']:
                element = self.find_element(elements, entry['id'])
                groups[entry['group']].append(
                    self.agent_from_element(element))
            # Freeze into a plain dict so lookups cannot create empty groups.
            groups = dict(groups)

            int_type = interaction['type']
            # NOTE(review): types accepted by find_interactions that match
            # no branch below (e.g. 'interaction') yield no statement.
            if int_type in mod_types:
                stmts += self._modification_statements(int_type, groups, ev)
            elif int_type == 'transcription-translation':
                stmts += self._amount_statements(groups, ev)
            elif int_type == 'binding':
                stmts += self._binding_statements(groups, ev)
        return stmts

    def _source_and_target(self, groups):
        """Return (source agent, target agent, polarity) for an interaction.

        Missing parts are returned as None.
        """
        source = groups[None][0] if None in groups else None
        for polarity in self._POLARITIES:
            if polarity in groups:
                return source, groups[polarity][0], polarity
        return source, None, None

    def _modification_statements(self, int_type, groups, ev):
        """Build the modification statement plus a signed regulation."""
        enz, sub, polarity = self._source_and_target(groups)
        if enz is None or sub is None:
            return []
        stmt_type = interaction_types[int_type]
        stmts = [stmt_type(deepcopy(enz), deepcopy(sub),
                           evidence=deepcopy(ev))]
        # A signed target additionally yields Activation / Inhibition; an
        # unsigned one yields only the modification itself.
        signed_type = {'positive': Activation,
                       'negative': Inhibition}.get(polarity)
        if signed_type is not None:
            stmts.append(signed_type(deepcopy(enz), deepcopy(sub),
                                     evidence=deepcopy(ev)))
        return stmts

    def _amount_statements(self, groups, ev):
        """Build IncreaseAmount / DecreaseAmount for a signed regulation."""
        subj, obj, polarity = self._source_and_target(groups)
        # Resolved per interaction so that an unsigned target can neither
        # hit an unbound name nor reuse a type from an earlier interaction;
        # without a sign there is no direction to assert, so it is skipped.
        stmt_type = {'positive': IncreaseAmount,
                     'negative': DecreaseAmount}.get(polarity)
        if subj is None or obj is None or stmt_type is None:
            return []
        return [stmt_type(deepcopy(subj), deepcopy(obj),
                          evidence=deepcopy(ev))]

    def _binding_statements(self, groups, ev):
        """Build a Complex over all participants of a binding interaction."""
        members = flatten(list(groups.values()))
        # Skip the interaction if it has no participants or if any of them
        # could not be turned into an Agent, mirroring the other branches.
        if not members or any(member is None for member in members):
            return []
        return [Complex(deepcopy(members), evidence=deepcopy(ev))]

    def agent_from_element(self, element):
        """Return an Agent for a participant element.

        Parameters
        ----------
        element : dict or None
            A participant element, or None when the lookup by id failed.

        Returns
        -------
        indra.statements.Agent or None
            None when the element is missing or not marked as completed.
        """
        if element is None or not element.get('completed'):
            return None
        name = element['name']
        association = element['association']
        db_refs = {}
        mapped_ns, mapped_id = process_db_refs(association['namespace'],
                                               association['id'])
        if mapped_ns and mapped_id:
            db_refs[mapped_ns] = mapped_id
        # Cross-references are applied after the primary grounding, so they
        # overwrite it when they map to the same namespace.
        for ref in association.get('dbXrefs', []):
            mapped_ns, mapped_id = process_db_refs(ref['db'], ref['id'])
            if mapped_ns and mapped_id:
                db_refs[mapped_ns] = mapped_id
        return Agent(name, db_refs=db_refs)

    def find_element(self, elements, element_id):
        """Return the first element whose id is `element_id`, else None."""
        for element in elements:
            if element['id'] == element_id:
                return element
        return None

    def find_interactions(self, elements):
        """Return the elements whose type is a key of `interaction_types`."""
        return [e for e in elements if e['type'] in interaction_types]

    def get_evidence(self, document):
        """Return the Evidence object shared by a document's Statements."""
        text_refs = self.get_text_refs(document)
        pmid = text_refs.get('PMID')
        last_edited = document.get('lastEditedDate')
        annotations = {
            'biofactoid_document': document['id'],
            'created_date': document.get('createdDate'),
            'last_edited_date': last_edited,
            # Misspelled key emitted by earlier versions; kept so that
            # existing consumers of the annotations keep working.
            'lsatEditedDate': last_edited,
        }
        ev = Evidence(source_api='biofactoid', pmid=pmid,
                      text_refs=text_refs, text=document['text'],
                      annotations=annotations)
        return ev

    def get_text_refs(self, document):
        """Return a dict of text reference identifiers for a document.

        Identifiers are read from the document's ``'citation'`` entry first
        and then from the PubMed article id list, which takes precedence.
        Absent sections and None-valued identifiers are ignored.
        """
        text_refs = {}
        cit = document.get('citation') or {}
        if cit.get('pmid') is not None:
            text_refs['PMID'] = cit['pmid']
        if cit.get('doi') is not None:
            text_refs['DOI'] = cit['doi']

        id_type_to_key = {'pubmed': 'PMID', 'doi': 'DOI',
                          'pii': 'PII', 'pmc': 'PMCID'}
        article = document.get('article') or {}
        pmd = article.get('PubmedData') or {}
        for entry in pmd.get('ArticleIdList') or []:
            key = id_type_to_key.get(entry['IdType'])
            if key is not None:
                text_refs[key] = entry['id']
        return text_refs
# BioFactoid namespaces whose identifiers are passed through unchanged,
# mapped to the corresponding INDRA db_refs key.
_PASSTHROUGH_NAMESPACES = {
    'Ensembl': 'ENSEMBL',
    'ncbi': 'EGID',
    'MGI': 'MGI',
    'RGD': 'RGD',
}


def process_db_refs(db_ns, db_id):
    """Map a BioFactoid namespace/identifier pair to an INDRA db_refs entry.

    Parameters
    ----------
    db_ns : str
        The namespace label as it appears in the BioFactoid JSON.
    db_id : str
        The identifier within that namespace.

    Returns
    -------
    tuple
        The ``(namespace, identifier)`` pair to store in db_refs, or
        ``(None, None)`` when the namespace is not recognized.
    """
    if db_ns == 'HGNC':
        # db_refs stores the bare HGNC identifier, without the prefix.
        return 'HGNC', db_id.replace('HGNC:', '')
    if db_ns == 'ChEBI':
        # db_refs stores the prefixed form; avoid doubling the prefix when
        # the incoming identifier already carries it.
        if db_id.startswith('CHEBI:'):
            return 'CHEBI', db_id
        return 'CHEBI', 'CHEBI:%s' % db_id
    mapped_ns = _PASSTHROUGH_NAMESPACES.get(db_ns)
    if mapped_ns is not None:
        return mapped_ns, db_id
    return None, None


# TODO: there's probably more but this is what is visible so far
# Maps a BioFactoid interaction type to the Statement class it produces.
# Element types that are not keys here are not treated as interactions.
interaction_types = {
    'binding': Complex,
    'interaction': Complex,
    'modification': AddModification,
    'phosphorylation': Phosphorylation,
    # Listed in mod_types below; without an entry here such interactions
    # would never be recognized as interactions at all.
    'dephosphorylation': Dephosphorylation,
    'methylation': Methylation,
    'demethylation': Demethylation,
    'ubiquitination': Ubiquitination,
    'deubiquitination': Deubiquitination,
    'transcription-translation': IncreaseAmount,
}

# Interaction types that are handled as enzyme -> substrate modifications.
mod_types = {'phosphorylation', 'dephosphorylation',
             'ubiquitination', 'deubiquitination',
             'modification', 'methylation',
             'demethylation'}