Source code for indra.sources.tkg.processor

__all__ = ["TkgProcessor"]

import re
import logging
from typing import Dict, List

from indra.sources.bel import process_bel_stmt

logger = logging.getLogger(__name__)


class TkgProcessor:
    """Processor extracting INDRA Statements from textToKnowledgeGraph output.

    Each BEL statement found in the output is normalized and parsed into
    INDRA Statements via PyBEL. BEL statements that fail to parse, or that
    yield no INDRA Statements, are collected in ``skipped``.

    NOTE(review): the code in this class does not itself attach metadata
    (confidence, text, pmid, pmcid, etc.) to Evidence objects -- confirm
    whether that is expected to happen elsewhere.

    Parameters
    ----------
    results : Dict
        Output data structure of textToKnowledgeGraph to be processed

    Attributes
    ----------
    statements : List[indra.statements.Statement]
        A list of INDRA Statements extracted from the results.
    skipped : List[str]
        Normalized BEL statements that raised during parsing or produced
        no INDRA Statements.
    """

    def __init__(self, results: Dict):
        self.results = results
        self.statements: List = []
        self.skipped: List = []

    # Alternative processing mode (not used by V1 tests but available)
    def extract_statements(self):
        """Run BEL to INDRA pipeline for all entries in LLM_extractions.

        Every entry under each extraction's ``Results`` list has its
        ``bel_statement`` normalized and parsed. Resulting INDRA Statements
        are appended to ``self.statements``; BEL statements that fail to
        parse or yield nothing are appended to ``self.skipped``.

        Raises
        ------
        KeyError
            If an entry has no ``bel_statement`` key.
        """
        # Number of BEL entries seen, reported as "total" in the summary log.
        n_entries = 0
        for extraction in self.results.get('LLM_extractions', []):
            for entry in extraction.get('Results', []):
                n_entries += 1
                bel_stmt = normalize_bel(entry['bel_statement'])
                try:
                    pp = process_bel_stmt(bel_stmt)
                except Exception as e:
                    # Best-effort: one unparseable statement must not abort
                    # the whole run, but keep a trace of why it was skipped.
                    logger.debug("Skipping BEL statement %r: %s", bel_stmt, e)
                    self.skipped.append(bel_stmt)
                    continue

                if pp and pp.statements:
                    self.statements.extend(pp.statements)
                else:
                    self.skipped.append(bel_stmt)

        logger.debug(
            "textToKnowledgeGraph processor finished: extracted=%d "
            "skipped=%d total=%d",
            len(self.statements), len(self.skipped), n_entries
        )
# Fix GO Biological Process names that contain spaces
GO_BP_PATTERN = re.compile(r'GO:([A-Za-z0-9\-\s]+)')


def normalize_go_terms(bel: str) -> str:
    """Normalize GO terms like:
        GO:DNA-templated transcription
    into:
        GO:"DNA-templated transcription"
    so PyBEL can parse them.

    Only the name itself is quoted: whitespace captured around it is kept
    out of the quotes. Terms whose name has no interior space (including
    terms that are already quoted) are returned untouched.

    Parameters
    ----------
    bel : str
        A BEL statement string.

    Returns
    -------
    str
        The BEL statement with space-containing GO names quoted.
    """
    def replacer(match):
        content = match.group(1)
        # The pattern's character class includes whitespace, so the capture
        # may carry leading/trailing whitespace that is not part of the name.
        name = content.strip()
        # Nothing to quote: either a single-token name, or only whitespace
        # was captured (e.g. 'GO: "already quoted"'). The pattern can never
        # capture a quote character, so no explicit quote check is needed.
        if ' ' not in name:
            return match.group(0)
        # Keep trailing whitespace after the closing quote so the spacing
        # of the rest of the statement is preserved.
        trailing = content[len(content.rstrip()):]
        return f'GO:"{name}"{trailing}'

    return GO_BP_PATTERN.sub(replacer, bel)


def normalize_bel(bel: str) -> str:
    """Apply all normalization steps.

    Parameters
    ----------
    bel : str
        A raw BEL statement string.

    Returns
    -------
    str
        The normalized BEL statement.
    """
    # For now just normalizing GO terms which appears to be an existing issue.
    # Can be extended with other processing steps later.
    bel = normalize_go_terms(bel)
    return bel