import csv
import logging
from itertools import groupby
from collections import Counter
from indra.databases import uniprot_client
from indra.util import write_unicode_csv
logger = logging.getLogger(__name__)
# Some useful functions for analyzing the grounding of sets of statements
# Put together all agent texts along with their grounding
def all_agents(stmts):
    """Return a list of all of the agents from a list of statements.

    Only agents that are not None and have a TEXT entry are returned.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`

    Returns
    -------
    agents : list of :py:class:`indra.statements.Agent`
        List of agents that appear in the input list of INDRA Statements.
    """
agents = []
for stmt in stmts:
for agent in stmt.agent_list():
# Agents don't always have a TEXT db_refs entry (for instance
# in the case of Statements from databases) so we check for this.
if agent is not None and agent.db_refs.get('TEXT') is not None:
agents.append(agent)
return agents

def agent_texts(agents):
    """Return a list of all agent texts from a list of agents.

    None values are returned for agents without an agent text.

    Parameters
    ----------
    agents : list of :py:class:`indra.statements.Agent`

    Returns
    -------
    list of str/None
        Agent texts from the input list of agents.
    """
return [ag.db_refs.get('TEXT') for ag in agents]
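
# Usage sketch (illustrative, not part of the module's API): assuming `stmts`
# is a list of INDRA Statements obtained elsewhere, the two helpers above can
# be chained to collect the raw agent texts, e.g.:
#     >>> agents = all_agents(stmts)
#     >>> texts = agent_texts(agents)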

def get_sentences_for_agent(text, stmts, max_sentences=None):
    """Return evidence sentences with a given agent text from a list of
    statements.

    Parameters
    ----------
    text : str
        An agent text.
    stmts : list of :py:class:`indra.statements.Statement`
        INDRA Statements to search for evidence sentences.
    max_sentences : Optional[int]
        Cap on the number of evidence sentences to return. Default: None

    Returns
    -------
    sentences : list of tuple
        List of (PMID, sentence) tuples taken from the evidence of
        statements containing the given agent text.
    """
sentences = []
for stmt in stmts:
for agent in stmt.agent_list():
if agent is not None and agent.db_refs.get('TEXT') == text:
sentences.append((stmt.evidence[0].pmid,
stmt.evidence[0].text))
if max_sentences is not None and \
len(sentences) >= max_sentences:
return sentences
return sentences
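
# Usage sketch (illustrative): with `stmts` a list of INDRA Statements and
# 'EGFR' a hypothetical agent text, this collects up to 10 (PMID, sentence)
# tuples for evidence mentioning that text:
#     >>> sentences = get_sentences_for_agent('EGFR', stmts, max_sentences=10)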

def agent_texts_with_grounding(stmts):
    """Return agent text groundings in a list of statements with their counts.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`

    Returns
    -------
    list of tuple
        List of tuples of the form
        (text: str, ((name_space: str, ID: str, count: int)...),
        total_count: int),
        where the counts within the tuple of groundings give the number of
        times an agent with the given text appears grounded with the
        particular name space and ID, and total_count gives the total number
        of times an agent with that text appears in the list of statements.
    """
allag = all_agents(stmts)
# Convert PFAM-DEF lists into tuples so that they are hashable and can
# be tabulated with a Counter
for ag in allag:
pfam_def = ag.db_refs.get('PFAM-DEF')
if pfam_def is not None:
ag.db_refs['PFAM-DEF'] = tuple(pfam_def)
refs = [tuple(ag.db_refs.items()) for ag in allag]
refs_counter = Counter(refs)
refs_counter_dict = [(dict(entry[0]), entry[1])
for entry in refs_counter.items()]
# First, sort by text so that we can do a groupby
refs_counter_dict.sort(key=lambda x: x[0].get('TEXT'))
# Then group by text
grouped_by_text = []
for k, g in groupby(refs_counter_dict, key=lambda x: x[0].get('TEXT')):
# Total occurrences of this agent text
total = 0
entry = [k]
db_ref_list = []
for db_refs, count in g:
# Check if TEXT is our only key, indicating no grounding
if list(db_refs.keys()) == ['TEXT']:
db_ref_list.append((None, None, count))
# Add any other db_refs (not TEXT)
for db, db_id in db_refs.items():
if db == 'TEXT':
continue
else:
db_ref_list.append((db, db_id, count))
total += count
# Sort the db_ref_list by the occurrences of each grounding
entry.append(tuple(sorted(db_ref_list, key=lambda x: x[2],
reverse=True)))
# Now add the total frequency to the entry
entry.append(total)
# And add the entry to the overall list
grouped_by_text.append(tuple(entry))
# Sort the list by the total number of occurrences of each unique key
grouped_by_text.sort(key=lambda x: x[2], reverse=True)
return grouped_by_text
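
# Usage sketch (illustrative): each entry of the returned list pairs an agent
# text with its groundings and counts; the entry shown below is hypothetical
# and only illustrates the shape of the output:
#     >>> twg = agent_texts_with_grounding(stmts)
#     >>> twg[0]  # doctest: +SKIP
#     ('ERK', (('FPLX', 'ERK', 42), (None, None, 3)), 45)
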
# List of all ungrounded entities by number of mentions

def ungrounded_texts(stmts):
    """Return a list of all ungrounded entities ordered by number of mentions.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`

    Returns
    -------
    ungroundc : list of tuple
        List of tuples of the form (text: str, count: int) sorted in
        descending order by count.
    """
ungrounded = [ag.db_refs['TEXT']
for s in stmts
for ag in s.agent_list()
if ag is not None and list(ag.db_refs.keys()) == ['TEXT']]
ungroundc = Counter(ungrounded)
ungroundc = ungroundc.items()
ungroundc = sorted(ungroundc, key=lambda x: x[1], reverse=True)
return ungroundc
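
# Usage sketch (illustrative): the result can be fed directly into
# save_sentences below; the counts shown are hypothetical:
#     >>> ungrounded = ungrounded_texts(stmts)
#     >>> ungrounded[:2]  # doctest: +SKIP
#     [('cell', 120), ('pathway', 87)]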

def get_agents_with_name(name, stmts):
"""Return all agents within a list of statements with a particular name."""
return [ag for stmt in stmts for ag in stmt.agent_list()
if ag is not None and ag.name == name]

def save_base_map(filename, grouped_by_text):
    """Dump a list of agents along with groundings and counts into a CSV file.

    Parameters
    ----------
    filename : str
        Filepath for the output file.
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding.
    """
rows = []
for group in grouped_by_text:
text_string = group[0]
for db, db_id, count in group[1]:
if db == 'UP':
name = uniprot_client.get_mnemonic(db_id)
else:
name = ''
row = [text_string, db, db_id, count, name]
rows.append(row)
write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
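
# Usage sketch (illustrative): writes one row per (text, grounding) pair; the
# output filename here is hypothetical:
#     >>> grouped = agent_texts_with_grounding(stmts)
#     >>> save_base_map('grounding_base_map.csv', grouped)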

def protein_map_from_twg(twg):
    """Build a map of entity texts to validate protein grounding.

    Looks at the grounding of the entity texts extracted from the statements
    and finds cases where an entity text is grounded to a human protein whose
    gene name is an exact match to the entity text. Returns a dict that can
    be used to update/expand the grounding map.

    Parameters
    ----------
    twg : list of tuple
        List of tuples of the form output by agent_texts_with_grounding.

    Returns
    -------
    protein_map : dict
        Dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. Entries are included for
        agent texts that are grounded to a human protein in UniProt whose
        gene name exactly matches the agent text.
    """
protein_map = {}
unmatched = 0
matched = 0
logger.info('Building grounding map for human proteins')
for agent_text, grounding_list, _ in twg:
# If 'UP' (Uniprot) not one of the grounding entries for this text,
# then we skip it.
if 'UP' not in [entry[0] for entry in grounding_list]:
continue
# Otherwise, collect all the Uniprot IDs for this protein.
uniprot_ids = [entry[1] for entry in grounding_list
if entry[0] == 'UP']
# For each Uniprot ID, look up the species
for uniprot_id in uniprot_ids:
# If it's not a human protein, skip it
mnemonic = uniprot_client.get_mnemonic(uniprot_id)
if mnemonic is None or not mnemonic.endswith('_HUMAN'):
continue
# Otherwise, look up the gene name in HGNC and match against the
# agent text
gene_name = uniprot_client.get_gene_name(uniprot_id)
if gene_name is None:
unmatched += 1
continue
if agent_text.upper() == gene_name.upper():
matched += 1
protein_map[agent_text] = {'TEXT': agent_text,
'UP': uniprot_id}
else:
unmatched += 1
logger.info('Exact matches for %d proteins' % matched)
logger.info('No match (or no gene name) for %d proteins' % unmatched)
return protein_map
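
# Usage sketch (illustrative): the resulting dict can be merged into an
# existing grounding map; 'EGFR' is used here purely as an example text,
# and P00533 is the UniProt ID of human EGFR:
#     >>> twg = agent_texts_with_grounding(stmts)
#     >>> protein_map = protein_map_from_twg(twg)
#     >>> protein_map.get('EGFR')  # doctest: +SKIP
#     {'TEXT': 'EGFR', 'UP': 'P00533'}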

def save_sentences(twg, stmts, filename, agent_limit=300):
    """Write evidence sentences for statements with ungrounded agents to a
    CSV file.

    Parameters
    ----------
    twg : list of tuple
        List of tuples of ungrounded agent texts with counts of the number
        of times they are mentioned in the list of statements, sorted in
        descending order by count. This is of the form output by the
        function ungrounded_texts.
    stmts : list of :py:class:`indra.statements.Statement`
    filename : str
        Path to the output file.
    agent_limit : Optional[int]
        Number of agents to include in the output file. Takes the top agents
        by count. Default: 300
    """
sentences = []
unmapped_texts = [t[0] for t in twg]
counter = 0
logger.info('Getting sentences for top %d unmapped agent texts.' %
agent_limit)
for text in unmapped_texts:
agent_sentences = get_sentences_for_agent(text, stmts)
sentences += map(lambda tup: (text,) + tup, agent_sentences)
counter += 1
if counter >= agent_limit:
break
# Write sentences to CSV file
write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
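
# Usage sketch (illustrative): a typical end-to-end pass over a list of
# statements, with a hypothetical output filename:
#     >>> ungrounded = ungrounded_texts(stmts)
#     >>> save_sentences(ungrounded, stmts, 'ungrounded_sentences.csv',
#     ...                agent_limit=100)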