Source code for indra.literature.pmc_client

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import xml.etree.ElementTree as ET
import os.path
import requests
import logging
from indra.literature import pubmed_client
from indra.util import UnicodeXMLTreeBuilder as UTB

logger = logging.getLogger('pmc')

pmc_url = 'https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi'
pmid_convert_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

# Paths to resource files
pmids_fulltext_path = os.path.join(os.path.dirname(__file__),
                                   'pmids_fulltext.txt')
pmids_oa_xml_path = os.path.join(os.path.dirname(__file__),
                                   'pmids_oa_xml.txt')
pmids_oa_txt_path = os.path.join(os.path.dirname(__file__),
                                   'pmids_oa_txt.txt')
pmids_auth_xml_path = os.path.join(os.path.dirname(__file__),
                                   'pmids_auth_xml.txt')
# Define global dict containing lists of PMIDs among mineable PMCs
# to be lazily initialized
pmids_fulltext_dict = {}

[docs]def id_lookup(paper_id, idtype=None): """This function takes a Pubmed ID, Pubmed Central ID, or DOI and use the Pubmed ID mapping service and looks up all other IDs from one of these. The IDs are returned in a dictionary.""" if idtype is not None and idtype not in ('pmid', 'pmcid', 'doi'): raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', " "or 'doi'." % idtype) if paper_id.upper().startswith('PMC'): idtype = 'pmcid' # Strip off any prefix if paper_id.upper().startswith('PMID'): paper_id = paper_id[4:] elif paper_id.upper().startswith('DOI'): paper_id = paper_id[3:] data = {'ids': paper_id} if idtype is not None: data['idtype'] = idtype try: tree = pubmed_client.send_request(pmid_convert_url, data) except Exception as e: logger.error('Error looking up PMID in PMC: %s' % e) return {} if tree is None: return {} record = tree.find('record') if record is None: return {} doi = record.attrib.get('doi') pmid = record.attrib.get('pmid') pmcid = record.attrib.get('pmcid') ids = {'doi': doi, 'pmid': pmid, 'pmcid': pmcid} return ids
def get_ids(search_term, retmax=1000): return pubmed_client.get_ids(search_term, retmax=retmax, db='pmc') def get_xml(pmc_id): if pmc_id.upper().startswith('PMC'): pmc_id = pmc_id[3:] # Request params params = {} params['verb'] = 'GetRecord' params['identifier'] = 'oai:pubmedcentral.nih.gov:%s' % pmc_id params['metadataPrefix'] = 'pmc' # Submit the request res = requests.get(pmc_url, params) if not res.status_code == 200: logger.warning("Couldn't download %s" % pmc_id) return None # Read the bytestream xml_bytes = res.content # Check for any XML errors; xml_str should still be bytes tree = ET.XML(xml_bytes, parser=UTB()) xmlns = "http://www.openarchives.org/OAI/2.0/" err_tag = tree.find('{%s}error' % xmlns) if err_tag is not None: err_code = err_tag.attrib['code'] err_text = err_tag.text logger.warning('PMC client returned with error %s: %s' % (err_code, err_text)) return None # If no error, return the XML as a unicode string else: return xml_bytes.decode('utf-8')
[docs]def filter_pmids(pmid_list, source_type): """Filter a list of PMIDs for ones with full text from PMC. Parameters ---------- pmid_list : list List of PMIDs to filter. source_type : string One of 'fulltext', 'oa_xml', 'oa_txt', or 'auth_xml'. Returns ------- list of PMIDs available in the specified source/format type. """ global pmids_fulltext_dict # Check args if source_type not in ('fulltext', 'oa_xml', 'oa_txt', 'auth_xml'): raise ValueError("source_type must be one of: 'fulltext', 'oa_xml', " "'oa_txt', or 'auth_xml'.") # Check if we've loaded this type, and lazily initialize if pmids_fulltext_dict.get(source_type) is None: fulltext_list_path = os.path.join(os.path.dirname(__file__), 'pmids_%s.txt' % source_type) with open(fulltext_list_path, 'rb') as f: fulltext_list = set([line.strip('\n').decode('utf-8') for line in f.readlines()]) pmids_fulltext_dict[source_type] = fulltext_list return list(set(pmid_list).intersection( pmids_fulltext_dict.get(source_type)))