import logging
import numpy as np
import pandas as pd
from collections import Counter
from typing import Union, Sequence, Optional, List, Any
from sklearn.base import BaseEstimator
from indra.statements import Evidence, Statement, Modification, \
get_all_descendants
from indra.belief import BeliefScorer, check_extra_evidence, \
get_stmt_evidence, SimpleScorer
logger = logging.getLogger(__name__)
class SklearnScorer(BeliefScorer):
"""Use a pre-trained Sklearn classifier to predict belief scores.
An implementing instance of this base class has two personalities: as a
subclass of BeliefScorer, it implements the functions required by the
BeliefEngine, `score_statements` and `check_prior_probs`. It also behaves
like an sklearn model by composition, implementing methods `fit`,
`predict`, `predict_proba`, and `predict_log_proba`, which are passed
through to an internal sklearn model.
A key role of this wrapper class is to implement the preprocessing of
statement properties into a feature matrix in a standard way, so that
a classifier trained on one corpus of statement data will still work when
used on another corpus.
Implementing subclasses must implement at least one of the methods for
building the feature matrix, `stmts_to_matrix` or `df_to_matrix`.
Parameters
----------
model :
Any instance of a classifier object supporting the methods `fit`,
`predict_proba`, `predict`, and `predict_log_proba`.
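
    Example
    -------
    A minimal sketch of an implementing subclass; the single evidence-count
    feature and the `train_stmts`, `test_stmts`, and `y_arr` variables are
    illustrative, not part of the INDRA API:

    .. code-block:: python

        import numpy as np
        from sklearn.linear_model import LogisticRegression

        class EvidenceCountScorer(SklearnScorer):
            # Toy featurizer: one column holding the number of evidences.
            def stmts_to_matrix(self, stmts, extra_evidence=None):
                return np.array([[len(stmt.evidence)] for stmt in stmts])

        scorer = EvidenceCountScorer(LogisticRegression())
        scorer.fit(train_stmts, y_arr)
        beliefs = scorer.score_statements(test_stmts)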
"""
def __init__(
self,
model: BaseEstimator,
):
self.model = model
    def check_prior_probs(
self,
statements: Sequence[Statement],
) -> None:
"""Empty implementation for now."""
pass
    def score_statements(
        self,
        statements: Sequence[Statement],
        extra_evidence: Optional[List[List[Evidence]]] = None,
    ) -> Sequence[float]:
        """Return the belief score for each statement, i.e., the predicted
        probability (column 1 of `predict_proba`) that it is correct."""
        return self.predict_proba(statements, extra_evidence)[:, 1]
    def stmts_to_matrix(
self,
stmts: Sequence[Statement],
extra_evidence: Optional[List[List[Evidence]]] = None,
) -> np.ndarray:
"""Convert a list of Statements to a feature matrix."""
raise NotImplementedError('Need to implement the stmts_to_matrix '
'method')
    def df_to_matrix(
self,
df: pd.DataFrame,
) -> np.ndarray:
"""Convert a statement DataFrame to a feature matrix."""
raise NotImplementedError('Need to implement the df_to_matrix '
'method')
    def to_matrix(self,
stmt_data: Union[np.ndarray, Sequence[Statement], pd.DataFrame],
extra_evidence: Optional[List[List[Evidence]]] = None,
) -> np.ndarray:
"""Get stmt feature matrix by calling appropriate method.
If `stmt_data` is already a matrix (e.g., obtained after performing a
train/test split on a matrix generated for a full statement corpus), it
is returned directly; if a DataFrame of Statement metadata,
`self.df_to_matrix` is called; if a list of Statements,
`self.stmts_to_matrix` is called.
Parameters
----------
stmt_data :
Statement content to be used to generate a feature matrix.
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
Returns
-------
:
Feature matrix for the statement data.
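
        Example
        -------
        A sketch of the three accepted input types, assuming `scorer` is an
        implementing instance, `stmts` is a list of Statements, and
        `stmts_df` is a DataFrame of statement metadata:

        .. code-block:: python

            x1 = scorer.to_matrix(stmts)      # list of Statements
            x2 = scorer.to_matrix(stmts_df)   # DataFrame of metadata
            x3 = scorer.to_matrix(x1)         # ndarray is returned directly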
"""
# If we got a Numpy array, just use it!
if isinstance(stmt_data, np.ndarray):
stmt_arr = stmt_data
# Otherwise check if we have a dataframe or a list of statements
# and call the appropriate *_to_matrix method
elif isinstance(stmt_data, pd.DataFrame):
if extra_evidence is not None:
raise NotImplementedError(
'extra_evidence cannot be used with a statement DataFrame.')
stmt_arr = self.df_to_matrix(stmt_data)
# Check if stmt_data is a list/tuple (i.e., of Statements):
elif isinstance(stmt_data, (list, tuple)):
# Check that the first entry is a Statement
if not isinstance(stmt_data[0], Statement):
raise ValueError('stmt_data must contain Statements.')
stmt_arr = self.stmts_to_matrix(stmt_data, extra_evidence)
# If it's something else, error
else:
raise TypeError(f'stmt_data is type {type(stmt_data)}: '
'must be a numpy array, DataFrame, or '
'list/tuple of Statements')
return stmt_arr
    def fit(self,
stmt_data: Union[np.ndarray, Sequence[Statement], pd.DataFrame],
y_arr: Sequence[float],
extra_evidence: Optional[List[List[Evidence]]] = None,
*args,
**kwargs,
):
"""Preprocess stmt data and run sklearn model `fit` method.
Additional `args` and `kwargs` are passed to the `fit` method of the
wrapped sklearn model.
Parameters
----------
stmt_data :
Statement content to be used to generate a feature matrix.
y_arr :
Class values for the statements (e.g., a vector of 0s and 1s
indicating correct or incorrect).
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
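
        Example
        -------
        A sketch of fitting on a precomputed feature matrix after a
        train/test split; `scorer`, `stmts`, and `y_arr` are assumed to
        already exist:

        .. code-block:: python

            from sklearn.model_selection import train_test_split
            x_arr = scorer.to_matrix(stmts)
            x_train, x_test, y_train, y_test = train_test_split(
                x_arr, y_arr, test_size=0.2)
            scorer.fit(x_train, y_train)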
"""
# Check dimensions of stmts (x) and y_arr
if len(stmt_data) != len(y_arr):
raise ValueError("Number of stmts/rows must match length of y_arr.")
# Get the data matrix based on the stmt list or stmt DataFrame
stmt_arr = self.to_matrix(stmt_data, extra_evidence)
# Call the fit method of the internal sklearn model
self.model.fit(stmt_arr, y_arr, *args, **kwargs)
    def predict_proba(
self,
stmt_data: Union[np.ndarray, Sequence[Statement], pd.DataFrame],
extra_evidence: Optional[List[List[Evidence]]] = None,
*args,
**kwargs,
) -> np.ndarray:
"""Preprocess stmt data and run sklearn model `predict_proba` method.
Additional `args` and `kwargs` are passed to the `predict_proba` method
of the wrapped sklearn model.
Parameters
----------
stmt_data :
Statement content to be used to generate a feature matrix.
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
"""
# Call the prediction method of the internal sklearn model
stmt_arr = self.to_matrix(stmt_data, extra_evidence)
return self.model.predict_proba(stmt_arr, *args, **kwargs)
    def predict(
self,
stmt_data: Union[np.ndarray, Sequence[Statement], pd.DataFrame],
extra_evidence: Optional[List[List[Evidence]]] = None,
*args,
**kwargs,
) -> np.ndarray:
"""Preprocess stmt data and run sklearn model `predict` method.
Additional `args` and `kwargs` are passed to the `predict` method of
the wrapped sklearn model.
Parameters
----------
stmt_data :
Statement content to be used to generate a feature matrix.
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
"""
stmt_arr = self.to_matrix(stmt_data, extra_evidence)
return self.model.predict(stmt_arr, *args, **kwargs)
    def predict_log_proba(
self,
stmt_data: Union[np.ndarray, Sequence[Statement], pd.DataFrame],
extra_evidence: Optional[List[List[Evidence]]] = None,
*args,
**kwargs,
) -> np.ndarray:
"""Preprocess stmt data and run sklearn model `predict_log_proba`.
        Additional `args` and `kwargs` are passed to the `predict_log_proba`
        method of the wrapped sklearn model.
Parameters
----------
stmt_data :
Statement content to be used to generate a feature matrix.
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
"""
stmt_arr = self.to_matrix(stmt_data, extra_evidence)
return self.model.predict_log_proba(stmt_arr, *args, **kwargs)
class CountsScorer(SklearnScorer):
"""Belief model learned from evidence counts and other stmt properties.
If using a DataFrame for Statement data, it should have the following
columns:
* `stmt_type`
* `source_counts`
Alternatively, if the DataFrame doesn't have a `source_counts` column, it
should have columns with names matching the sources in `self.source_list`.
Parameters
----------
model :
Any instance of a classifier object supporting the methods `fit`,
`predict_proba`, `predict`, and `predict_log_proba`.
source_list :
List of strings denoting the evidence sources (evidence.source_api
values) to be used for prediction.
include_more_specific :
If True, will add extra columns to the statement data matrix for the
source counts drawn from more specific evidences; if use_num_pmids is
True, will also add an additional column for the number of PMIDs from
more specific evidences. If False, these columns will not be included
even if the `extra_evidence` argument is passed to the
`stmts_to_matrix` method. This is to ensure that the featurization of
statements is consistent between training and prediction.
use_stmt_type :
Whether to include statement type as a feature.
use_num_members :
Whether to include a feature denoting the number of members of the
statement. Primarily for stratifying belief predictions about Complex
statements with more than two members. Cannot be used for statement
data passed in as a DataFrame.
use_num_pmids :
Whether to include a feature for the total number of unique PMIDs
        supporting each statement. Cannot be used for statements passed in as
        a DataFrame.
use_promoter :
Whether to include a feature giving the fraction of evidence (0 to 1)
containing the (case-insensitive) word "promoter". Tends to improve
misclassification of Complex statements that actually refer to
protein-DNA binding.
use_avg_evidence_len :
Whether to include a feature giving the average evidence sentence
length (in space-separated tokens).
use_residue_position :
Whether to include a feature indicating that a Statement has a
(not-None) residue and position (i.e., for Modification Statements).
When used to train and predict on site-mapped Statements, allows
the correspondence between the residue/position and the target
substrate to be exploited in predicting overall correctness.
    Example
    -------
    .. code-block:: python

        from sklearn.linear_model import LogisticRegression
        from indra.belief import BeliefEngine

        clf = LogisticRegression()
        all_stmt_sources = CountsScorer.get_all_sources(stmts)
        scorer = CountsScorer(clf, all_stmt_sources, use_stmt_type=True,
                              use_num_pmids=True)
        scorer.fit(stmts, y_arr)
        be = BeliefEngine(scorer)
        be.set_hierarchy_probs(stmts)
"""
def __init__(
self,
model: BaseEstimator,
source_list: List[str],
include_more_specific: bool = False,
use_stmt_type: bool = False,
use_num_members: bool = False,
use_num_pmids: bool = False,
use_promoter: bool = False,
use_avg_evidence_len: bool = False,
use_residue_position: bool = False,
):
# Call superclass constructor to store the model
super(CountsScorer, self).__init__(model)
self.source_list = source_list
self.include_more_specific = include_more_specific
self.use_stmt_type = use_stmt_type
self.use_num_members = use_num_members
self.use_num_pmids = use_num_pmids
self.use_promoter = use_promoter
self.use_avg_evidence_len = use_avg_evidence_len
self.use_residue_position = use_residue_position
# Build dictionary mapping INDRA Statement types to integers
if use_stmt_type:
all_stmt_types = get_all_descendants(Statement)
self.stmt_type_map = {t.__name__: ix
for ix, t in enumerate(all_stmt_types)}
    @staticmethod
def get_all_sources(
stmts: Sequence[Statement],
include_more_specific: bool = True,
include_less_specific: bool = True,
) -> List[str]:
"""Get a list of all the source_apis supporting the given statements.
Useful for determining the set of sources to be used for fitting
and prediction.
Parameters
----------
stmts :
A list of INDRA Statements to collect source APIs for.
include_more_specific :
If True (default), then includes the source APIs for the more
specific statements in the `supports` attribute of each statement.
include_less_specific :
If True (default), then includes the source APIs for the less
specific statements in the `supported_by` attribute of each
statement.
Returns
-------
:
A list of (unique) source_apis found in the set of statements.
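
        Example
        -------
        A sketch restricting collection to each statement's own evidences
        (flag values are illustrative):

        .. code-block:: python

            sources = CountsScorer.get_all_sources(
                stmts, include_more_specific=False,
                include_less_specific=False)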
"""
stmt_sources = set([ev.source_api for s in stmts for ev in s.evidence])
if include_more_specific:
stmt_sources.update([ev.source_api
for stmt in stmts
for supp_stmt in stmt.supports
for ev in supp_stmt.evidence])
if include_less_specific:
stmt_sources.update([ev.source_api
for stmt in stmts
for supp_by_stmt in stmt.supported_by
for ev in supp_by_stmt.evidence])
return list(stmt_sources)
    def stmts_to_matrix(
self,
stmts: Sequence[Statement],
extra_evidence: Optional[List[List[Evidence]]] = None,
) -> np.ndarray:
"""Convert a list of Statements to a feature matrix.
Features are encoded as follows:
* One column for every source listed in `self.source_list`, containing
the number of statement evidences from that source. If
`self.include_more_specific` is True and `extra_evidence` is
provided, these are used in combination with the Statement's own
evidence in determining source counts.
* If `self.use_stmt_type` is set, statement type is included via
one-hot encoding, with one column for each statement type.
* If `self.use_num_members` is set, a column is added for the number
of agents in the Statement.
        * If `self.use_num_pmids` is set, a column is added with the total
          number of unique PMIDs supporting the Statement. If
`extra_evidence` is provided, these are used in combination with the
Statement's own evidence in determining the number of PMIDs.
Parameters
----------
stmts :
A list or tuple of INDRA Statements to be used to generate a
feature matrix.
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
Returns
-------
:
Feature matrix for the statement data.
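
        Example
        -------
        A sketch of the resulting column layout for a scorer with source
        list `['reach', 'sparser']` and `include_more_specific` left off
        (the classifier `clf` and statements `stmts` are assumed):

        .. code-block:: python

            scorer = CountsScorer(clf, ['reach', 'sparser'])
            x_arr = scorer.stmts_to_matrix(stmts)
            # x_arr[:, 0] -> evidence counts from 'reach'
            # x_arr[:, 1] -> evidence counts from 'sparser'
            # columns for any categorical features (e.g., statement type)
            # follow the source count columns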
"""
# Check arguments for including more specific evidences
if self.include_more_specific and extra_evidence is None:
logger.info("CountScorer is set to include_more_specific "
"evidences but no extra_evidence was included.")
extra_evidence = [[] for stmt in stmts]
elif not self.include_more_specific and extra_evidence is not None:
logger.warning("extra_evidence was included but CountScorer "
"instance is not set to include_more_specific "
"evidences so extra_evidence will be ignored.")
# Check our list of extra evidences
check_extra_evidence(extra_evidence, len(stmts))
# Add categorical features and collect source_apis
cat_features = []
stmt_sources = set()
for ix, stmt in enumerate(stmts):
# Collect all source_apis from stmt evidences
dir_pmids = set()
promoter_ct = 0
evidence_lens = []
for ev in stmt.evidence:
stmt_sources.add(ev.source_api)
dir_pmids.add(ev.pmid)
if ev.text is not None:
evidence_lens.append(len(ev.text.split()))
if 'promoter' in ev.text.lower():
promoter_ct += 1
indir_pmids = set()
if self.include_more_specific and extra_evidence:
for ev in extra_evidence[ix]:
stmt_sources.add(ev.source_api)
indir_pmids.add(ev.pmid)
# Collect non-source count features (e.g. type) from stmts
feature_row: List[Any] = [] # Appease the Type Hint Gods
# One-hot encoding of stmt type
if self.use_stmt_type:
stmt_type_ix = self.stmt_type_map[type(stmt).__name__]
type_features = [1 if ix == stmt_type_ix else 0
for ix in range(len(self.stmt_type_map))]
feature_row.extend(type_features)
if self.use_residue_position:
if (isinstance(stmt, Modification) and stmt.residue and
stmt.position):
has_res_pos = True
else:
has_res_pos = False
feature_row.append(has_res_pos)
# Add field for number of members
if self.use_num_members:
feature_row.append(len(stmt.agent_list()))
# Add field with number of unique PMIDs
if self.use_num_pmids:
feature_row.append(len(dir_pmids))
if self.include_more_specific and extra_evidence:
feature_row.append(len(indir_pmids))
# Add a field specifying the percentage of evidences containing
# the word "promoter":
if self.use_promoter:
promoter_pct = promoter_ct / len(stmt.evidence) \
if len(stmt.evidence) > 0 else 0
feature_row.append(promoter_pct)
            # Add a field giving the average evidence sentence length in words
if self.use_avg_evidence_len:
avg_evidence_len = np.mean(evidence_lens) if evidence_lens \
else 0
feature_row.append(avg_evidence_len)
# Only add a feature row if we're using some of the features.
if feature_row:
cat_features.append(feature_row)
# Before proceeding, check whether all source_apis are in
# source_list
if stmt_sources.difference(set(self.source_list)):
logger.info("source_list does not include all source_apis "
"in the statement data.")
# Get source count features
# If we have extra_evidence, we double the source count features
if self.include_more_specific:
num_cols = len(self.source_list) * 2
else:
num_cols = len(self.source_list)
num_rows = len(stmts)
x_arr = np.zeros((num_rows, num_cols))
for stmt_ix, stmt in enumerate(stmts):
# Source from the stmt itself
direct_sources = [ev.source_api for ev in stmt.evidence]
dsrc_ctr = Counter(direct_sources)
for src_ix, src in enumerate(self.source_list):
x_arr[stmt_ix, src_ix] = dsrc_ctr.get(src, 0)
# Get indirect evidences
if self.include_more_specific and extra_evidence:
indirect_sources = [ev.source_api
for ev in extra_evidence[stmt_ix]]
idsrc_ctr = Counter(indirect_sources)
for src_ix, src in enumerate(self.source_list):
x_arr[stmt_ix, src_ix + len(self.source_list)] = \
idsrc_ctr.get(src, 0)
# If we have any categorical features, turn them into an array and
# add them to matrix
if cat_features:
cat_arr = np.array(cat_features)
x_arr = np.hstack((x_arr, cat_arr))
return x_arr
    def df_to_matrix(
self,
df: pd.DataFrame,
) -> np.ndarray:
"""Convert a DataFrame of statement data to a feature matrix.
Based on information available in a DataFrame of statement data, this
implementation uses only source counts and statement type in building a
feature matrix, and will raise a ValueError if either
`self.use_num_members` or `self.use_num_pmids` is set.
Features are encoded as follows:
        * One column for every source listed in `self.source_list`, containing
          the number of statement evidences from that source (`extra_evidence`
          is not supported for DataFrame input).
* If `self.use_stmt_type` is set, statement type is included via
one-hot encoding, with one column for each statement type.
Parameters
----------
df :
A pandas DataFrame with statement metadata. It should have columns
`stmt_type` and `source_counts`; alternatively, if it doesn't have
a `source_counts` column, it should have columns with names
matching the sources in `self.source_list`.
Returns
-------
:
Feature matrix for the statement data.
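
        Example
        -------
        A sketch of the two accepted DataFrame layouts, assuming
        `scorer.source_list == ['reach', 'sparser']` (column values are
        illustrative):

        .. code-block:: python

            import pandas as pd
            # Layout 1: a source_counts column of dicts
            df1 = pd.DataFrame({
                'stmt_type': ['Phosphorylation', 'Complex'],
                'source_counts': [{'reach': 2}, {'reach': 1, 'sparser': 3}],
            })
            # Layout 2: one column per source in scorer.source_list
            df2 = pd.DataFrame({
                'stmt_type': ['Phosphorylation', 'Complex'],
                'reach': [2, 1],
                'sparser': [0, 3],
            })
            x_arr = scorer.df_to_matrix(df1)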
"""
required_cols = {'stmt_type'}
# Currently, statement DataFrames are not expected to contain
# number of members or num_pmids as a data column, hence we raise a
        # ValueError if either of these is set
if self.use_num_members:
raise ValueError('use_num_members not supported for statement '
'DataFrames.')
if self.use_num_pmids:
raise ValueError('use_num_pmids not supported for statement '
'DataFrames.')
# Make sure that the dataframe contains at least all of the above
# columns
if not required_cols.issubset(set(df.columns)):
raise ValueError('Statement DataFrame is missing required '
'columns.')
# Check for the source_counts column. If it's there, we're good
if 'source_counts' in df.columns:
has_sc_col = True
# If it's not, make sure that we have columns named for sources in
# self.source_list:
else:
has_sc_col = False
for source in self.source_list:
if source not in df.columns:
raise ValueError(f'Expected column "{source}" not in the '
'given statement DataFrame')
# Add categorical features and collect source_apis
cat_features = []
stmt_sources = set()
# For every statement entry in the dataframe...
for rowtup in df.itertuples():
# Collect statement sources
# ...if there's a source_counts col with dicts
if has_sc_col:
stmt_sources |= set(rowtup.source_counts.keys())
# Collect non-source count features (e.g. type) from stmts
feature_row = []
# One-hot encoding of stmt type
if self.use_stmt_type:
stmt_type_ix = self.stmt_type_map[rowtup.stmt_type]
type_features = [1 if ix == stmt_type_ix else 0
for ix in range(len(self.stmt_type_map))]
feature_row.extend(type_features)
# Only add a feature row if we're using some of the features.
if feature_row:
cat_features.append(feature_row)
# Before proceeding, check whether all source_apis are in
# source_list. If we don't have a source_counts dict, we don't look
# for columns beyond the sources in the source list, and we are
# guaranteed to have all of them because of the check performed above
source_diff = stmt_sources.difference(set(self.source_list))
if has_sc_col and source_diff:
logger.warning("source_list does not include all source_apis "
f"in the statement data: {str(source_diff)}")
# Get source count features
num_cols = len(self.source_list)
num_rows = len(df)
x_arr = np.zeros((num_rows, num_cols))
for stmt_ix, rowtup in enumerate(df.itertuples()):
for src_ix, src in enumerate(self.source_list):
# Get counts from the source_count dictionary
if has_sc_col:
x_arr[stmt_ix, src_ix] = rowtup.source_counts.get(src, 0)
# ...or get counts from named source column
else:
x_arr[stmt_ix, src_ix] = rowtup._asdict()[src]
# If we have any categorical features, turn them into an array and
# add them to matrix
if cat_features:
cat_arr = np.array(cat_features)
x_arr = np.hstack((x_arr, cat_arr))
return x_arr
class HybridScorer(BeliefScorer):
"""Use CountsScorer for known sources, SimpleScorer priors for any others.
Allows the use of a CountsScorer to make belief predictions based on
sources seen in training data, while falling back to SimpleScorer
priors for any sources not accounted for by the CountsScorer.
Like the SimpleScorer, uses an independence assumption to combine
beliefs from the two scorers (i.e., `hybrid_bel = 1 - (1 - cs_bel) *
(1 - ss_bel)`).
Parameters
----------
counts_scorer :
Instance of CountsScorer.
simple_scorer :
Instance of SimpleScorer.
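
    Example
    -------
    A sketch of combining the two scorers; the classifier, source list, and
    training data are illustrative:

    .. code-block:: python

        from sklearn.linear_model import LogisticRegression
        from indra.belief import BeliefEngine, SimpleScorer

        counts_scorer = CountsScorer(LogisticRegression(),
                                     ['reach', 'sparser'])
        counts_scorer.fit(train_stmts, y_arr)
        hybrid = HybridScorer(counts_scorer, SimpleScorer())
        be = BeliefEngine(hybrid)
        be.set_prior_probs(test_stmts)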
"""
def __init__(
self,
counts_scorer: CountsScorer,
simple_scorer: SimpleScorer,
):
self.counts_scorer = counts_scorer
self.simple_scorer = simple_scorer
    def check_prior_probs(
self,
statements: Sequence[Statement],
) -> None:
"""Check that sources in the set of statements are accounted for."""
# Get all sources for the set of statements
sources = CountsScorer.get_all_sources(statements,
include_more_specific=True)
non_cs_sources = set(sources).difference(
set(self.counts_scorer.source_list))
return self.simple_scorer._check_sources(non_cs_sources)
    def score_statements(
        self,
        statements: Sequence[Statement],
        extra_evidence: Optional[List[List[Evidence]]] = None,
    ) -> Sequence[float]:
        """Compute hybrid belief scores for the given statements.

Parameters
----------
statements :
INDRA Statements whose belief scores are to be calculated.
extra_evidence :
A list corresponding to the given list of statements, where
each entry is a list of Evidence objects providing additional
support for the corresponding statement (i.e., Evidences that
aren't already included in the Statement's own evidence list).
Returns
-------
:
The computed probabilities for each statement.
"""
# Get beliefs from the sklearn model, using the sources in the
        # CountsScorer source_list as features
skl_beliefs = self.counts_scorer.predict_proba(statements,
extra_evidence)[:, 1]
skl_sources = self.counts_scorer.source_list
hybrid_beliefs = []
# Iterate over the statements...
for ix, stmt in enumerate(statements):
# ...get both the statement's own evidence and the more-specific
# (extra) evidences
all_evidence = get_stmt_evidence(stmt, ix, extra_evidence)
# Next, filter out any evidences that have sources in the skl
# model source list, leaving behind the rest. At the same time,
            # record whether we've found any sources in the skl model list.
filt_evidence = []
has_skl_source = False
for ev in all_evidence:
if ev.source_api in skl_sources:
has_skl_source = True
else:
filt_evidence.append(ev)
# Get the simple belief
simple_bel = self.simple_scorer.score_evidence_list(filt_evidence)
            # Calculate hybrid belief: the probability that all sources, both
            # those evaluated by the sklearn model and the SimpleScorer, are
            # not jointly incorrect. If there are no sources from the skl
            # model list, we set the skl belief to 0 so the probability comes
            # only from the SimpleScorer.
skl_bel = skl_beliefs[ix] if has_skl_source else 0
hybrid_bel = 1 - (1 - skl_bel) * (1 - simple_bel)
hybrid_beliefs.append(hybrid_bel)
return hybrid_beliefs