"""
Search and get metadata for articles in Pubmed.
"""
from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import xml.etree.ElementTree as ET
import requests
import logging
# Python 3
try:
from functools import lru_cache
# Python 2
except ImportError:
from functools32 import lru_cache
from indra.databases import hgnc_client
from indra.util import UnicodeXMLTreeBuilder as UTB
logger = logging.getLogger('pubmed')
pubmed_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
pubmed_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
# send_request can't be cached by lru_cache because it takes a dict
# (a mutable/unhashable type) as an argument. We cache its callers instead.
def send_request(url, data):
    """Send a GET request and parse the response body as XML.

    Parameters
    ----------
    url : str
        The address the request is sent to.
    data : dict
        Query parameters, handed to `requests.get` as `params`.

    Returns
    -------
    The root of the XML tree parsed from the response content, or None if
    the response status code is anything other than 200.
    """
    response = requests.get(url, params=data)
    if response.status_code != 200:
        return None
    # UTB is the UnicodeXMLTreeBuilder imported from indra.util
    return ET.XML(response.content, parser=UTB())
@lru_cache(maxsize=100)
def get_ids(search_term, **kwargs):
    """Search Pubmed for paper IDs given a search term.

    Search options are passed as keyword arguments and override the
    defaults set in this function. For details on parameters that can be
    used, see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

    Some useful parameters to pass are:

    - db='pmc' to search PMC instead of pubmed
    - reldate=2 to search for papers within the last 2 days
    - mindate='2016/03/01', maxdate='2016/03/31' to search for papers in
      March 2016

    Parameters
    ----------
    search_term : str
        The term to search for; sent as the `term` query parameter.
    **kwargs
        Extra query parameters. Their values must be hashable because the
        function is wrapped in lru_cache.

    Returns
    -------
    list of str
        The text of every `IdList/Id` element in the response. An empty list
        is returned if the request fails or the response contains an ERROR
        element. Because results are cached, repeated calls with the same
        arguments return the same list object; copy it before mutating.
    """
    params = {'term': search_term,
              'retmax': 1000,
              'retstart': 0,
              'db': 'pubmed',
              'sort': 'pub+date'}
    params.update(kwargs)
    tree = send_request(pubmed_search, params)
    if tree is None:
        return []
    error = tree.find('ERROR')
    if error is not None:
        logger.error(error.text)
        return []
    # findall always returns a list (possibly empty), never None
    ids = [id_tag.text for id_tag in tree.findall('IdList/Id')]
    # Only compare against the reported total if the response carries one;
    # a missing Count element should not turn a usable result into a crash.
    count_tag = tree.find('Count')
    if count_tag is not None and count_tag.text is not None:
        if int(count_tag.text) != len(ids):
            # %s rather than %d: retmax may have been passed in as a string
            logger.warning('Not all ids were retrieved for search %s;\n'
                           'limited at %s.', search_term, params['retmax'])
    return ids
@lru_cache(maxsize=100)
def get_ids_for_gene(hgnc_name, **kwargs):
    """Get the curated set of articles for a gene in the Entrez database.

    Search parameters for the Gene database query can be passed in as
    keyword arguments; they override the defaults set in this function.

    Parameters
    ----------
    hgnc_name : string
        The HGNC name of the gene. This is used to obtain the HGNC ID
        (using the hgnc_client module) and in turn used to obtain the Entrez
        ID associated with the gene. Entrez is then queried for that ID.

    Returns
    -------
    list
        The distinct texts of all `PubMedId` elements found in the response,
        in no particular order. An empty list is returned if the request
        fails or the response contains an ERROR element.

    Raises
    ------
    ValueError
        If no HGNC ID is found for the name, or no Entrez ID is found for
        the HGNC ID.
    """
    # Resolve HGNC name -> HGNC ID -> Entrez ID
    hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
    if hgnc_id is None:
        raise ValueError('Invalid HGNC name.')
    entrez_id = hgnc_client.get_entrez_id(hgnc_id)
    if entrez_id is None:
        raise ValueError('Entrez ID not found in HGNC table.')
    # Fetch the gene record from the Entrez Gene database
    query = {'db': 'gene', 'retmode': 'xml', 'id': entrez_id}
    query.update(kwargs)
    root = send_request(pubmed_fetch, query)
    if root is None:
        return []
    error_tag = root.find('ERROR')
    if error_tag is not None:
        logger.error(error_tag.text)
        return []
    # Collect PMIDs from anywhere in the tree; the set drops duplicates
    # (findall yields an empty list when nothing matches)
    unique_pmids = {pmid_tag.text for pmid_tag in root.findall('.//PubMedId')}
    return list(unique_pmids)
@lru_cache(maxsize=100)
def get_article_xml(pubmed_id):
    """Get the XML metadata for a single article from the Pubmed database.

    Parameters
    ----------
    pubmed_id : str
        The Pubmed ID of the article. A leading 'PMID' (in any letter case)
        is removed before the query is sent.

    Returns
    -------
    The `PubmedArticle/MedlineCitation/Article` element of the response, or
    None if the request fails or the response has no such element.
    """
    has_prefix = pubmed_id.upper().startswith('PMID')
    # Drop exactly the four prefix characters; the rest is sent unchanged
    bare_id = pubmed_id[4:] if has_prefix else pubmed_id
    query = {'db': 'pubmed', 'retmode': 'xml', 'id': bare_id}
    root = send_request(pubmed_fetch, query)
    if root is None:
        return None
    # find() gives None when the element is absent
    return root.find('PubmedArticle/MedlineCitation/Article')
def get_title(pubmed_id):
    """Get the title of an article in the Pubmed database.

    Parameters
    ----------
    pubmed_id : str
        The Pubmed ID of the article, passed on to get_article_xml.

    Returns
    -------
    The text of the article's `ArticleTitle` element, or None if the article
    could not be retrieved or has no `ArticleTitle` element.
    """
    article = get_article_xml(pubmed_id)
    if article is None:
        return None
    title_tag = article.find('ArticleTitle')
    # Guard against a record without a title element, as get_abstract does,
    # instead of failing with an AttributeError on None
    if title_tag is None:
        return None
    return title_tag.text
def get_abstract(pubmed_id, prepend_title=True):
    """Get the abstract of an article in the Pubmed database.

    Parameters
    ----------
    pubmed_id : str
        The Pubmed ID of the article, passed on to get_article_xml.
    prepend_title : bool
        If True (default), the article title is put in front of the
        abstract, terminated by a period if it does not already end in one.

    Returns
    -------
    The texts of all `Abstract/AbstractText` elements joined by single
    spaces, optionally preceded by the title, or None if the article could
    not be retrieved.
    """
    article = get_article_xml(pubmed_id)
    if article is None:
        return None
    sections = []
    # findall returns an empty list when there is no abstract
    for abst in article.findall('Abstract/AbstractText'):
        # Element.text stops at the first child element, so a section with
        # inline markup would be cut short; itertext() also yields the text
        # inside and after child elements.
        section_text = ''.join(abst.itertext())
        # Sections without any text keep contributing a single space
        sections.append(section_text if section_text else ' ')
    abstract_text = ' '.join(sections)
    title_tag = article.find('ArticleTitle')
    if prepend_title and title_tag is not None:
        title = title_tag.text
        if title is not None:
            if not title.endswith('.'):
                title += '.'
            abstract_text = title + ' ' + abstract_text
    return abstract_text
@lru_cache(maxsize=1000)
def get_issns_for_journal(nlm_id):
    """Get a list of the ISSN numbers for a journal given its NLM ID.

    The `nlmcatalog` database is queried for the given ID, and the text of
    every `ISSN` element followed by the text of every `ISSNLinking` element
    found anywhere in the response is returned.

    Parameters
    ----------
    nlm_id : str
        The NLM ID of the journal; sent as the `id` query parameter.

    Returns
    -------
    list or None
        The ISSN texts, or None if the request fails or the response holds
        neither `ISSN` nor `ISSNLinking` elements.

    Notes
    -----
    Element names in the XML output returned by the NLM Catalog query
    (listed flat here; the nesting of the elements is not shown)::

        NLMCatalogRecordSet
        NLMCatalogRecord
        NlmUniqueID
        DateCreated
        DateRevised
        DateAuthorized
        DateCompleted
        DateRevisedMajor
        TitleMain
        MedlineTA
        TitleAlternate +
        AuthorList
        ResourceInfo
        TypeOfResource
        Issuance
        ResourceUnit
        PublicationTypeList
        PublicationInfo
        Country
        PlaceCode
        Imprint
        PublicationFirstYear
        PublicationEndYear
        Language
        PhysicalDescription
        IndexingSourceList
        IndexingSource
        IndexingSourceName
        Coverage
        GeneralNote +
        LocalNote
        MeshHeadingList
        Classification
        ELocationList
        LCCN
        ISSN +
        ISSNLinking
        Coden
        OtherID +
    """
    query = {'db': 'nlmcatalog', 'retmode': 'xml', 'id': nlm_id}
    root = send_request(pubmed_fetch, query)
    if root is None:
        return None
    # ISSN elements first, then the linking ISSNs
    issn_tags = root.findall('.//ISSN') + root.findall('.//ISSNLinking')
    issn_texts = [issn_tag.text for issn_tag in issn_tags]
    # An empty result is reported as None rather than as an empty list
    return issn_texts or None
def expand_pagination(pages):
    """Convert a page number to long form, e.g., from 456-7 to 456-457.

    Parameters
    ----------
    pages : str
        A single page ('456') or a hyphenated page range ('456-7').

    Returns
    -------
    The page range with the end page written out in full. The input is
    returned unchanged if it is None or empty, contains no hyphen, already
    has an end page at least as long as the start page, or contains more
    than one hyphen (in which case a warning is logged).
    """
    # A missing or empty page field has nothing to expand
    if not pages:
        return pages
    parts = pages.split('-')
    # No hyphen, so it's a single page, and we're good to go
    if len(parts) == 1:
        return pages
    # More than one hyphen, something weird happened
    if len(parts) > 2:
        logger.warning("Multiple hyphens in page number: %s", pages)
        return pages
    start, end = parts
    # An end page that is missing, or not shorter than the start page, is
    # not an abbreviation, so there is nothing to fill in
    if not end or len(end) >= len(start):
        return pages
    # Otherwise, replace the last digits of start with the digits of end
    new_end = start[:-len(end)] + end
    return '%s-%s' % (start, new_end)
"""
Note
----
Structure of the XML output returned by queries to Pubmed database::
PubmedArticleSet
PubmedArticle
MedlineCitation
PMID
DateCreated
DateCompleted
DateRevised
MedlineJournalInfo
Country
MedlineTA
NlmUniqueID
ISSNLinking
ChemicalList
CitationSubset
CommentsCorrectionsList
MeshHeadingList
OtherID
Article
Journal
ISSN
JournalIssue
Title
ISOAbbreviation
ArticleTitle
Pagination
MedlinePgn
ELocationID
Abstract
AuthorList
Author
LastName
ForeName
Initials
AffiliationInfo
Language
PublicationTypeList
PublicationType
ArticleDate
PubmedData
History
PublicationStatus
ArticleIdList
"""