# -*- coding: utf-8 -*-
"""Processors for CREEDS data."""
from copy import copy
import logging
from typing import (
Any,
ClassVar,
Iterable,
List,
Mapping,
Optional,
Tuple,
Type,
)
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
from indra import statements
from indra.databases import hgnc_client
from indra.ontology.bio import bio_ontology
from indra.ontology.standardize import get_standard_agent
from indra.sources.utils import Processor
from indra.statements import Agent, BioContext, Evidence, RefContext, Statement
from protmapper import uniprot_client
from protmapper.uniprot_client import get_id_from_mgi_name, get_id_from_rgd_name
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Public API of this module: only the three concrete processors are listed;
# the base ``CREEDSProcessor`` and the underscore-prefixed helpers are not.
__all__ = [
    "CREEDSGeneProcessor",
    "CREEDSChemicalProcessor",
    "CREEDSDiseaseProcessor",
]
#: NCBI Taxonomy identifier (as a string) for each organism label
ORGANISMS = dict(
    mouse="10090",
    human="9606",
    rat="10116",
)
#: Organism-specific gene nomenclature namespace for each species
#: label used in CREEDS
ORGANISMS_TO_NS = dict(
    mouse="MGI",
    human="HGNC",
    rat="RGD",
)
#: A mapping of strings used in the "pert_type" entries in
#: CREEDS data to normalized keys. Several are not curated
#: because they do not readily map to an INDRA statement.
#:
#: Keys are written in lower case with no surrounding whitespace because
#: ``_process_pert_type`` strips and lower-cases a label before looking it up.
PERTURBATIONS = {
    # knockout
    "ko": "knockout",
    "deletion": "knockout",
    "null mutation": "knockout",
    # knockdown
    "kd": "knockdown",
    "deficiency (mutation)": "knockdown",
    "silencing": "knockdown",
    # NOTE(review): this entry sits in the knockdown group but is normalized
    # to "knockout". Both keys map to the same statement types in ``UP_MAP``
    # and ``DOWN_MAP``, so the produced statements are unaffected either way.
    "heterozygotic knockout (nf1+/-)": "knockout",
    "shrna": "knockdown",
    # increase
    "induction": "increase",
    "knock-in": "increase",
    "oe": "increase",
    "overexpression": "increase",
    "stimulation of gene product": "increase",
    # activation
    "agonist activation": "activation",
    "drugactivation": "activation",
    "activemutant": "activation",
    "activation (deltanb-cateniner transgenics)": "activation",
    # inhibition
    "druginhibition": "inhibition",
    "drug inhibition": "inhibition",
    "inhibition": "inhibition",
    "inactivation (ikk inhibition)": "inhibition",
    "deficiency": "inhibition",
    "depletion": "inhibition",
    "depletion - sirna simut12": "inhibition",
    "small molecule inhibition": "inhibition",
    "defectivemutant": "inhibition",
    "inactivation": "inhibition",
    "drug": "inhibition",
    # misc
    # "mutation" has no entry in ``UP_MAP``/``DOWN_MAP``, so gene records
    # normalized to it are skipped by ``CREEDSGeneProcessor.process_record``.
    "mutant": "mutation",
    "rd1 mutation": "mutation",
    "natural variation": "mutation",
    "mutation (htt exon1 142q)": "mutation",
    "g93a mutation": "mutation",
}
#: Statement type to emit, per normalized perturbation type of the subject
#: gene, for target genes whose expression *increases* under that
#: perturbation. Loss-of-function perturbations yield ``DecreaseAmount``;
#: gain-of-function perturbations yield ``IncreaseAmount``.
UP_MAP: Mapping[str, Type[statements.RegulateAmount]] = {
    **dict.fromkeys(
        ("knockout", "knockdown", "inhibition"),
        statements.DecreaseAmount,
    ),
    **dict.fromkeys(
        ("increase", "activation"),
        statements.IncreaseAmount,
    ),
}
#: Statement type to emit, per normalized perturbation type of the subject
#: gene, for target genes whose expression *decreases* under that
#: perturbation. Loss-of-function perturbations yield ``IncreaseAmount``;
#: gain-of-function perturbations yield ``DecreaseAmount``.
DOWN_MAP: Mapping[str, Type[statements.RegulateAmount]] = {
    **dict.fromkeys(
        ("knockout", "knockdown", "inhibition"),
        statements.IncreaseAmount,
    ),
    **dict.fromkeys(
        ("increase", "activation"),
        statements.DecreaseAmount,
    ),
}
def _process_pert_type(s: str) -> str:
    """Normalize a raw CREEDS ``pert_type`` label.

    The label is stripped of surrounding whitespace and lower-cased, then
    translated through ``PERTURBATIONS``. A label with no entry there is
    returned in its cleaned (stripped, lower-cased) form.
    """
    cleaned = s.strip().lower()
    if cleaned in PERTURBATIONS:
        return PERTURBATIONS[cleaned]
    return cleaned
#: ``(prefix, symbol)`` pairs whose failed name lookup has already been
#: logged by ``_get_genes``; used so that each miss is logged only once
MISSING_NAMES = set()
def _get_genes(
record: Mapping[str, Any],
prefix: str,
key: str,
) -> List[Tuple[str, str, str]]:
rv = []
#: A list of 2-tuples with the gene symbol then the expression value
expressions = record[key]
for symbol, _ in expressions:
if prefix == "HGNC":
current_id = hgnc_client.get_current_hgnc_id(symbol)
# We may get no current IDs or more than one current IDs
# in which case we skip this gene
if not current_id or isinstance(current_id, list):
identifier = None
else:
identifier = current_id
_prefix = "HGNC"
elif prefix == "MGI":
_prefix, identifier = "UP", get_id_from_mgi_name(symbol)
elif prefix == "RGD":
_prefix, identifier = "UP", get_id_from_rgd_name(symbol)
else:
raise ValueError(f"Invalid prefix: {prefix} ! {symbol}")
if identifier is None:
if (prefix, symbol) not in MISSING_NAMES:
logger.debug(
f"Could not look up {symbol} by name in {prefix}",
)
MISSING_NAMES.add((prefix, symbol))
continue
rv.append((_prefix, identifier, symbol))
return rv
def _get_evidence(record: Mapping[str, Any]) -> Evidence:
    """Build the evidence object describing one CREEDS record.

    Reads the ``"geo_id"``, ``"cell_type"`` and ``"organism"`` entries of
    the record. The GEO identifier and the cell type are stored as
    annotations; the organism becomes the species of the evidence context,
    with its taxonomy identifier taken from ``ORGANISMS``.

    :raises KeyError: If one of the record entries is missing or the
        organism has no entry in ``ORGANISMS``.
    """
    # TODO how to use the following metadata?
    geo_id = record["geo_id"]
    cell_type = record["cell_type"]
    organism = record["organism"]

    # TODO use Gilda for grounding and put in BioContext?
    annotations = {
        "cell_type": cell_type,
        "geo": geo_id,
    }
    species = RefContext(
        name=organism,
        db_refs={"TAXONOMY": ORGANISMS[organism]},
    )
    return Evidence(
        source_api="creeds",
        annotations=annotations,
        context=BioContext(species=species),
    )
def _get_regulations(
    record: Mapping[str, Any],
) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
    """Return the looked-up up- and down-regulated genes of a record.

    The nomenclature used for the lookup is chosen from the record's
    ``"organism"`` entry via ``ORGANISMS_TO_NS``.

    :returns: A pair ``(up_genes, down_genes)``, each a list of
        ``(namespace, identifier, symbol)`` triples from ``_get_genes``.
    """
    namespace = ORGANISMS_TO_NS[record["organism"]]
    return (
        _get_genes(record, namespace, "up_genes"),
        _get_genes(record, namespace, "down_genes"),
    )
def _process_record_helper(
    record, subject, up_stmt_cls, down_stmt_cls
) -> Iterable[Statement]:
    """Yield one statement per looked-up target gene of a record.

    :param record: The CREEDS record being processed.
    :param subject: The agent used as the subject of every statement.
    :param up_stmt_cls: Statement class instantiated for each gene listed
        under ``"up_genes"``.
    :param down_stmt_cls: Statement class instantiated for each gene listed
        under ``"down_genes"``.
    :returns: Statements for all up genes first, then for all down genes.
        Each statement receives its own (shallow) copy of the evidence.
    """
    up_genes, down_genes = _get_regulations(record)
    evidence = _get_evidence(record)
    batches = (
        (up_stmt_cls, up_genes),
        (down_stmt_cls, down_genes),
    )
    for stmt_cls, genes in batches:
        for prefix, identifier, name in genes:
            target = get_standard_agent(name, {prefix: identifier})
            yield stmt_cls(subject, target, copy(evidence))
class CREEDSProcessor(Processor):
    """A base processor for CREEDS, which takes records as input.

    Subclasses provide a ``name`` and implement :meth:`process_record`.
    """

    #: Short label of the processor, shown in the progress bar
    name: ClassVar[str]

    #: The processed statements (after ``extract_statements()`` is run)
    statements: List[Statement]

    #: Whether ``extract_statements()`` has already run the extraction.
    #: Defined on the class so the attribute exists even when a subclass
    #: overrides ``__init__`` without calling this one.
    _extracted: bool = False

    def __init__(self, records: List[Mapping[str, Any]]):
        """Store the CREEDS records to be processed."""
        self.records = records
        self.statements = []
        self._extracted = False

    def extract_statements(self) -> List[Statement]:
        """Generate/store statements if not pre-cached, then return them.

        The extraction runs at most once per processor. An explicit flag is
        used because an extraction can legitimately produce no statements;
        testing only for an empty list would repeat the whole run on every
        call in that case. Statements that are already present are returned
        unchanged.
        """
        if not self.statements and not self._extracted:
            self.statements = list(self.iter_statements())
            self._extracted = True
        return self.statements

    def iter_statements(self) -> Iterable[Statement]:
        """Lazily yield the statements of all records, with a progress bar."""
        for record in tqdm(self.records, desc=f"Processing {self.name}"):
            yield from self.process_record(record)

    @classmethod
    def process_record(cls, record: Mapping[str, Any]) -> Iterable[Statement]:
        """Yield the statements for a single record.

        :raises NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError
#: Normalized perturbation types already logged by ``CREEDSGeneProcessor``
#: as having no entry in ``UP_MAP``/``DOWN_MAP``; used so that each one is
#: logged only once
LOGGED_MISSING_PART = set()
class CREEDSGeneProcessor(CREEDSProcessor):
    """A processor for single gene perturbation experiments in CREEDS."""

    name = "creeds_gene"

    @staticmethod
    def get_subject(record: Mapping[str, Any]) -> Optional[Agent]:
        """Return the agent for the perturbed gene, if it can be resolved.

        The record's ``"id"`` entry is read as ``gene:<Entrez gene ID>``;
        the Entrez ID is converted to a UniProt ID, whose gene name is used
        for the agent.

        :returns: The standardized agent, or ``None`` when the Entrez ID has
            no UniProt mapping.
        """
        # Drop the leading "gene:" to get the bare Entrez gene ID
        ncbigene_id = record["id"][len("gene:") :]
        uniprot_id = uniprot_client.get_id_from_entrez(ncbigene_id)
        if uniprot_id is None:
            logger.debug(f"Could not convert ncbigene:{ncbigene_id} to UniProt")
            return None
        name = uniprot_client.get_gene_name(uniprot_id)
        return get_standard_agent(
            name,
            {
                "EGID": ncbigene_id,
                "UP": uniprot_id,
            },
        )

    @classmethod
    def process_record(cls, record: Mapping[str, Any]) -> Iterable[Statement]:
        """Yield statements for one single-gene perturbation record.

        Nothing is yielded when the subject gene cannot be resolved or when
        the record's perturbation type has no entry in ``UP_MAP`` and
        ``DOWN_MAP``.
        """
        subject = cls.get_subject(record)
        if subject is None:
            return

        pert_type = _process_pert_type(record["pert_type"])
        up_stmt_cls = UP_MAP.get(pert_type)
        down_stmt_cls = DOWN_MAP.get(pert_type)
        if up_stmt_cls is None or down_stmt_cls is None:
            # Log each unmapped perturbation type a single time
            if pert_type not in LOGGED_MISSING_PART:
                logger.debug(f"Could not look up pert_type {record['pert_type']}")
                LOGGED_MISSING_PART.add(pert_type)
            return

        yield from _process_record_helper(
            record,
            subject,
            up_stmt_cls,
            down_stmt_cls,
        )
class CREEDSDiseaseProcessor(CREEDSProcessor):
    """A processor for disease perturbation experiments in CREEDS."""

    name = "creeds_disease"

    @staticmethod
    def get_subject(record: Mapping[str, Any]) -> Agent:
        """Return the agent for the disease described by the record.

        The agent is named after the record's ``"disease_name"`` entry. The
        ``"do_id"`` and ``"umls_cui"`` entries are added as ``DOID`` and
        ``UMLS`` references when they are non-empty.
        """
        db_refs = {}
        doid = record["do_id"]
        if doid:
            db_refs["DOID"] = doid
        umls_id = record["umls_cui"]
        if umls_id:
            db_refs["UMLS"] = umls_id
        name = record["disease_name"]
        return get_standard_agent(name, db_refs)

    @classmethod
    def process_record(cls, record: Mapping[str, Any]) -> Iterable[Statement]:
        """Yield statements for one disease record.

        Up-regulated genes give ``IncreaseAmount`` statements and
        down-regulated genes give ``DecreaseAmount`` statements, with the
        disease as subject.
        """
        subject = cls.get_subject(record)
        yield from _process_record_helper(
            record,
            subject,
            up_stmt_cls=statements.IncreaseAmount,
            down_stmt_cls=statements.DecreaseAmount,
        )
class CREEDSChemicalProcessor(CREEDSProcessor):
    """A processor for chemical perturbation experiments in CREEDS."""

    name = "creeds_chemical"

    @staticmethod
    def get_subject(record: Mapping[str, Any]) -> Agent:
        """Return the agent for the chemical described by the record.

        The agent is named after the record's ``"drug_name"`` entry. The
        ``"smiles"``, ``"pubchem_cid"`` and ``"drugbank_id"`` entries are
        added as ``SMILES``, ``PUBCHEM`` and ``DRUGBANK`` references when
        they are non-empty.
        """
        db_refs = {}
        smiles = record["smiles"]
        if smiles:
            db_refs["SMILES"] = smiles
        pubchem_compound_id = record["pubchem_cid"]
        if pubchem_compound_id:
            # Stored as a string regardless of the type found in the record
            db_refs["PUBCHEM"] = str(pubchem_compound_id)
        drugbank_id = record["drugbank_id"]
        if drugbank_id:
            db_refs["DRUGBANK"] = drugbank_id
        name = record["drug_name"]
        return get_standard_agent(name, db_refs)

    @classmethod
    def process_record(cls, record: Mapping[str, Any]) -> Iterable[Statement]:
        """Yield statements for one chemical record.

        Up-regulated genes give ``IncreaseAmount`` statements and
        down-regulated genes give ``DecreaseAmount`` statements, with the
        chemical as subject.
        """
        subject = cls.get_subject(record)
        yield from _process_record_helper(
            record,
            subject,
            up_stmt_cls=statements.IncreaseAmount,
            down_stmt_cls=statements.DecreaseAmount,
        )