__all__ = ["TkgProcessor"]
import re
import logging
from typing import Dict, List
from indra.sources.bel import process_bel_stmt
logger = logging.getLogger(__name__)
[docs]
class TkgProcessor:
    """Processor that extracts INDRA Statements from textToKnowledgeGraph output.

    The BEL content of the results is parsed into INDRA Statements via
    PyBEL, after which the processor attaches metadata (confidence, text,
    pmid, pmcid, etc.) to the Evidence objects of those Statements.

    Parameters
    ----------
    results : Dict
        Output data structure of textToKnowledgeGraph to be processed.

    Attributes
    ----------
    results : Dict
        The textToKnowledgeGraph output handed to the constructor, stored
        as-is (no copy is made).
    statements : List[indra.statements.Statement]
        A list of INDRA Statements extracted from the results. Empty on
        construction.
    skipped : list
        Empty on construction. Presumably collects the entries that could
        not be turned into Statements -- TODO confirm against the code
        that fills it.
    """

    def __init__(self, results):
        # Keep a reference to the raw input for later processing.
        self.results = results
        # Both accumulators start out as fresh, independent empty lists.
        self.statements, self.skipped = [], []
# Alternative processing mode (not used by V1 tests but available)
# Fix GO Biological Process names that contain spaces
# Matches ``GO:`` followed by a run of letters, digits, hyphens and
# whitespace. Quote characters are not in the class, so a name that is
# already quoted (``GO:"..."``) is never matched.
GO_BP_PATTERN = re.compile(r'GO:([A-Za-z0-9\-\s]+)')


def normalize_go_terms(bel: str) -> str:
    """Wrap unquoted GO names that contain spaces in double quotes.

    A term such as
    GO:DNA-templated transcription
    is rewritten to
    GO:"DNA-templated transcription"
    so PyBEL can parse it.

    Only the quote characters are inserted; every other character of the
    input is left exactly where it was. Whitespace that merely pads a name
    (e.g. the blank in ``GO:apoptosis )``) stays outside the quotes and
    does not by itself cause a name to be quoted.

    Parameters
    ----------
    bel : str
        A BEL statement string.

    Returns
    -------
    str
        The BEL string with multi-word GO names quoted. Single-word names,
        already-quoted names and strings without GO terms are returned
        unchanged.
    """
    def replacer(match):
        captured = match.group(1)
        # The pattern's character class includes whitespace, so the capture
        # may carry padding on either side of the actual name.
        name = captured.strip()
        if ' ' not in name:
            # Single-word (or empty) name: nothing to quote.
            return match.group(0)
        lead_len = len(captured) - len(captured.lstrip())
        leading = captured[:lead_len]
        trailing = captured[lead_len + len(name):]
        # Re-emit the padding outside the quotes so it is not part of the name.
        return f'GO:{leading}"{name}"{trailing}'

    return GO_BP_PATTERN.sub(replacer, bel)
def normalize_bel(bel: str) -> str:
    """Run every BEL normalization step over ``bel`` and return the result.

    Parameters
    ----------
    bel : str
        The BEL string to normalize.

    Returns
    -------
    str
        The string produced by applying each step in order.
    """
    # GO-term normalization is the only step so far (it addresses an
    # existing issue); further steps can be appended to this tuple later.
    steps = (normalize_go_terms,)
    for step in steps:
        bel = step(bel)
    return bel