Source code for indra.sources.tkg.api

__all__ = ["process_json_file", "process_json", "process_pmc"]
"""
This module implements an API for the textToKnowledgeGraph
method which extracts BEL statements from publications via an LLM.

This module provides two integration modes:

Offline processing
    In this mode, a JSON file output from textToKnowledgeGraph
    is used as the starting point from which INDRA Statements are produced.

Live processing
    If the `texttoknowledgegraph` package is installed, calls
    the LLM extraction pipeline, processes the returned BEL relations
    and produces INDRA Statements.

Both modes produce an TkgProcessor instance containing INDRA
Statements derived from BEL expressions.
"""

import os
import json
import logging
from pathlib import Path
from typing import Dict, Union

from indra import get_config
from .processor import TkgProcessor

logger = logging.getLogger(__name__)



[docs] def process_json_file(path: Union[str, Path]): """Process a single textToKnowledgeGraph JSON results file. Parameters ---------- path : str or Path Path to a JSON file containing BEL relations. Returns ------- TkgProcessor Processor containing the converted INDRA Statements. """ path = Path(path) logger.debug("Processing LLM-BEL results file: %s", path) with open(path, "r") as fh: data = json.load(fh) return process_json(data)
[docs] def process_json(data: Dict): """Process BEL relations returned directly from the LLM engine. Parameters ---------- data : dict Dictionary containing at least a ``"relations"`` field. Returns ------- TkgProcessor Processor with INDRA Statements derived from BEL. """ processor = TkgProcessor(data) processor.extract_statements() return processor
[docs] def process_pmc(pmc_id: str, output_base_path, **kwargs): """Run live BEL extraction using textToKnowledgeGraph, if installed. Parameters ---------- pmc_id : str PMCID such as 'PMC3898398'. kwargs : Additional keyword arguments passed to textToKnowledgeGraph.main(). Returns ------- TkgProcessor Processor containing INDRA Statements derived from live BEL output. Raises ------ ImportError If textToKnowledgeGraph is not installed. ValueError If the returned data structure is unexpected. """ try: from textToKnowledgeGraph import main as tkg_main except ImportError: raise ImportError( "The 'textToKnowledgeGraph' package is not installed. " "Install it or run textToKnowledgeGraph separately to " "produce output files and then use one of the functions like " "process_json_file to process the outputs." ) api_key = get_config('OPENAI_API_KEY', failure_ok=False) logger.debug("Running live textToKnowledgeGraph extraction for %s", pmc_id) success = tkg_main( api_key=api_key, pmc_ids=[pmc_id], upload_to_ndex=False, # Note: this assumes https://github.com/ndexbio/llm-text-to-knowledge-graph/pull/27 # will be merged output_base_path=output_base_path, **kwargs, ) if success: # TKG doesn't explicitly say where the results will be put so we need to # construct this path ourselves output_path = os.path.join(output_base_path, 'results', pmc_id, 'llm_results.json') return process_json_file(output_path) return None