Source code for indra.literature.pubmed_client

"""
Search and get metadata for articles in Pubmed.
"""
import csv
import glob
import gzip
import os
import re
from datetime import datetime

import time
from concurrent.futures import ThreadPoolExecutor
from io import StringIO

import tqdm
import logging
import random
import subprocess
import requests
from time import sleep
from typing import List, Dict, Optional
from pathlib import Path
from functools import lru_cache
import xml.etree.ElementTree as ET
from lxml import etree as lxml_etree
from indra.resources import RESOURCES_PATH
from indra.util import UnicodeXMLTreeBuilder as UTB
from indra.util import batch_iter, pretty_save_xml


logger = logging.getLogger(__name__)

pubmed_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
pubmed_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
pubmed_archive = "https://ftp.ncbi.nlm.nih.gov/pubmed"
pubmed_archive_baseline = pubmed_archive + "/baseline/"
pubmed_archive_update = pubmed_archive + "/updatefiles/"
# As of 2026-04-13, PMC moved these legacy FTP files under /deprecated/
# and the service is slated for full removal in August 2026. A follow-up
# migration to the PMC Cloud S3 bucket (s3://pmc-oa-opendata/) is planned.
pmc_ftp_base_url = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/deprecated"
pmid_to_pmc_download_url = f"{pmc_ftp_base_url}/oa_file_list.csv"
RETRACTIONS_FILE = os.path.join(RESOURCES_PATH, "pubmed_retractions.tsv")


# Send request can't be cached by lru_cache because it takes a dict
# (a mutable/unhashable type) as an argument. We cache the callers instead.
def send_request(url, data, retry_pause=1, max_tries=3):
    try:
        res = requests.get(url, params=data)
    except requests.exceptions.Timeout as e:
        logger.error('PubMed request timed out')
        logger.error('url: %s, data: %s' % (url, data))
        logger.error(e)
        return None
    except requests.exceptions.RequestException as e:
        logger.error('PubMed request exception')
        logger.error('url: %s, data: %s' % (url, data))
        logger.error(e)
        return None
    if res.status_code in {400, 429, 502, 503} and max_tries > 0:
        sleep(retry_pause)
        # Increase the sleep time at random to avoid multiple clients
        # retrying at the same time for e.g. tests
        retry_pause += 0.5 + 1.5 * random.random()
        return send_request(url, data, retry_pause, max_tries - 1)
    if not res.status_code == 200:
        logger.error('Got return code %d from pubmed client.'
                     % res.status_code)
        return None
    tree = ET.XML(res.content, parser=UTB())
    return tree



[docs]
@lru_cache(maxsize=100)
def get_ids(search_term, **kwargs):
    """Search Pubmed for paper IDs given a search term.

    Search options can be passed as keyword arguments, some of which are
    custom keywords identified by this function, while others are passed on
    as parameters for the request to the PubMed web service
    For details on parameters that can be used in PubMed searches, see
    https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch Some useful
    parameters to pass are db='pmc' to search PMC instead of pubmed reldate=2
    to search for papers within the last 2 days mindate='2016/03/01',
    maxdate='2016/03/31' to search for papers in March 2016.

    PubMed, by default, limits returned PMIDs to a small number, and this
    number can be controlled by the "retmax" parameter. This function
    uses a retmax value of 10,000 by default (the maximum supported by PubMed)
    that can be changed via the corresponding keyword argument. Note also
    the retstart argument along with retmax to page across batches of IDs.

    PubMed's REST API makes it difficult to retrieve more than 10k
    PMIDs systematically. See the `get_all_ids` function in this module
    for a way to retrieve more than 10k IDs using the PubMed edirect CLI.

    Parameters
    ----------
    search_term : str
        A term for which the PubMed search should be performed.
    use_text_word : Optional[bool]
        If True, the "[tw]" string is appended to the search term to constrain
        the search to "text words", that is words that appear as whole
        in relevant parts of the PubMed entry (excl. for instance the journal
        name or publication date) like the title and abstract. Using this
        option can eliminate spurious search results such as all articles
        published in June for a search for the "JUN" gene, or journal names
        that contain Acad for a search for the "ACAD" gene.
        See also: https://www.nlm.nih.gov/bsd/disted/pubmedtutorial/020_760.html
        Default : True
    kwargs : kwargs
        Additional keyword arguments to pass to the PubMed search as
        parameters.
    """
    use_text_word = kwargs.pop('use_text_word', True)
    if use_text_word:
        search_term += '[tw]'
    params = {'term': search_term,
              'retmax': 10000,
              'retstart': 0,
              'db': 'pubmed',
              'sort': 'pub+date'}
    params.update(kwargs)
    tree = send_request(pubmed_search, params)
    if tree is None:
        return []
    if tree.find('ERROR') is not None:
        logger.error(tree.find('ERROR').text)
        return []
    if tree.find('ErrorList') is not None:
        for err in tree.find('ErrorList'):
            logger.error('Error - %s: %s' % (err.tag, err.text))
        return []
    count = int(tree.find('Count').text)
    id_terms = tree.findall('IdList/Id')
    if id_terms is None:
        return []
    ids = [idt.text for idt in id_terms]
    if count != len(ids):
        logger.warning('Not all ids were retrieved for search %s;\n'
                       'limited at %d.' % (search_term, params['retmax']))
    return ids




[docs]
def get_id_count(search_term):
    """Get the number of citations in Pubmed for a search query.

    Parameters
    ----------
    search_term : str
        A term for which the PubMed search should be performed.

    Returns
    -------
    int or None
        The number of citations for the query, or None if the query fails.
    """
    params = {'term': search_term,
              'rettype': 'count',
              'db': 'pubmed'}
    tree = send_request(pubmed_search, params)
    if tree is None:
        return None
    else:
        count = list(tree)[0].text
        return int(count)




[docs]
@lru_cache(maxsize=100)
def get_ids_for_gene(hgnc_name, **kwargs):
    """Get the curated set of articles for a gene in the Entrez database.

    Search parameters for the Gene database query can be passed in as
    keyword arguments.

    Parameters
    ----------
    hgnc_name : str
        The HGNC name of the gene. This is used to obtain the HGNC ID
        (using the hgnc_client module) and in turn used to obtain the Entrez
        ID associated with the gene. Entrez is then queried for that ID.
    """
    from indra.databases import hgnc_client
    # Get the HGNC ID for the HGNC name
    hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
    if hgnc_id is None:
        raise ValueError('Invalid HGNC name.')
    # Get the Entrez ID
    entrez_id = hgnc_client.get_entrez_id(hgnc_id)
    if entrez_id is None:
        raise ValueError('Entrez ID not found in HGNC table.')
    # Query the Entrez Gene database
    params = {'db': 'gene',
              'retmode': 'xml',
              'id': entrez_id}
    params.update(kwargs)
    tree = send_request(pubmed_fetch, params)
    if tree is None:
        return []
    if tree.find('ERROR') is not None:
        logger.error(tree.find('ERROR').text)
        return []
    # Get all PMIDs from the XML tree
    id_terms = tree.findall('.//PubMedId')
    if id_terms is None:
        return []
    # Use a set to remove duplicate IDs
    ids = list(set([idt.text for idt in id_terms]))
    return ids




[docs]
def get_mesh_term_search_str(mesh_id, major_topic=False):
    """Return a search string for a given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, the given MeSH ID is considered as a major topic.
        Default: False
    """
    from indra.databases import mesh_client
    mesh_name = mesh_client.get_mesh_name(mesh_id)
    if not mesh_name:
        logger.error('Could not get MeSH name for ID %s' % mesh_id)
        return None
    if mesh_id.startswith('C') and not major_topic:
        # Get pmids for supplementary concepts as well
        search_term = f'{mesh_name} [nm]'
        return search_term
    suffix = 'majr' if major_topic else 'mh'
    search_term = '%s [%s]' % (mesh_name, suffix)
    return search_term




[docs]
def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
    """Return PMIDs that are annotated with a given MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, only papers for which the given MeSH ID is annotated as
        a major topic are returned. Otherwise all annotations are considered.
        Default: False
    **kwargs
        Any further PudMed search arguments that are passed to
        get_ids.
    """
    search_str = get_mesh_term_search_str(mesh_id, major_topic)
    ids = get_ids(search_str, use_text_word=False, **kwargs)
    return ids




[docs]
def get_ids_for_mesh_terms(mesh_terms, major_topics=None, **kwargs):
    """Return PMIDs that are annotated with a given list of MeSH terms.

    Parameters
    ----------
    mesh_terms : list of str
        A list of MeSH IDs of terms to search for, e.g., ['D009101', 'D009102'].
    major_topics : Optional[list of bool]
        A list of booleans indicating whether the corresponding MeSH term
        should be considered as a major topic. If None, all terms are considered
        as major topics.
    **kwargs
        Any further PudMed search arguments that are passed to
        get_ids.
    """
    if major_topics is None:
        major_topics = [False] * len(mesh_terms)
    search_strs = [get_mesh_term_search_str(mesh_id, major_topic)
                   for mesh_id, major_topic in zip(mesh_terms, major_topics)]
    search_strs = [s for s in search_strs if s is not None]
    if not search_strs:
        return []
    search_str = ' AND '.join([f'({s})' for s in search_strs])
    ids = get_ids(search_str, use_text_word=False, **kwargs)
    return ids



def _get_article_from_full_xml(full_xml_tree):
    if full_xml_tree is None:
        return None
    article = full_xml_tree.find('PubmedArticle/MedlineCitation/Article')
    return article



[docs]
def get_article_xml(pubmed_id):
    """Get the Article subtree a single article from the Pubmed database.

    Parameters
    ----------
    pubmed_id : str
        A PubMed ID.

    Returns
    -------
    xml.etree.ElementTree.Element
        The XML ElementTree Element that represents the Article portion of the
        PubMed entry.
    """
    full_xml_tree = get_full_xml(pubmed_id)
    article = _get_article_from_full_xml(full_xml_tree)
    return article  # May be none




[docs]
@lru_cache(maxsize=100)
def get_full_xml(pubmed_id, fname=None):
    """Get the full XML tree of a single article from the Pubmed database.

    Parameters
    ----------
    pubmed_id : str
        A PubMed ID.
    fname : Optional[str]
        If given, the XML is saved to the given file name.

    Returns
    -------
    xml.etree.ElementTree.Element
        The root element of the XML tree representing the PubMed entry.
        The root is a PubmedArticleSet with a single PubmedArticle element
        that contains the article metadata.
    """
    if pubmed_id.upper().startswith('PMID'):
        pubmed_id = pubmed_id[4:]
    params = {'db': 'pubmed',
              'retmode': 'xml',
              'id': pubmed_id}
    tree = send_request(pubmed_fetch, params)
    if fname:
        pretty_save_xml(tree, fname)
    return tree




[docs]
def get_full_xml_by_pmids(
    pubmed_ids: List[str],
    fname: Optional[str] = None
) -> ET.Element:
    """Get the full XML tree for multiple articles from PubMed using edirect CLI.

    Parameters
    ----------
    pubmed_ids : list[str]
        A list of PubMed IDs.
    fname : Optional[str]
        If given, the XML is saved to the given file name.

    Returns
    -------
    xml.etree.ElementTree.Element
        The root element of the XML tree representing the PubMed entries.
        The root is a PubmedArticleSet containing multiple PubmedArticle elements.

    Raises
    ------
    RuntimeError
        If the edirect CLI utilities are not installed or not found on PATH.

    Notes
    -----
    - This function requires the edirect command line utilities to be installed
      and visible on your PATH. See https://www.ncbi.nlm.nih.gov/books/NBK179288/
      for instructions.
    - Note that the output is sorted by PMID numerically e.g.,
      10, 11, 20, 22, 1000 (and not lexicographically e.g., 10, 1000, 11, 20, 22)
      without regard to the order in which the pmids are passed in.
    """
    # Have to use lxml.etree because the XML returned by efetch is not properly
    # formatted for ET.XML
    parser = lxml_etree.XMLParser(recover=True, encoding='utf-8')
    pmid_list = ','.join(pubmed_ids)
    cmd = f'efetch -db pubmed -id {pmid_list} -format xml'.split()
    try:
        xml_bytes = subprocess.check_output(cmd)
    except FileNotFoundError:
        # subprocess.check_output will raise FileNotFoundError if the command
        # is not found
        raise RuntimeError("The efetch utility could not be found. "
                           "This function only works if edirect is "
                           "installed and is visible on your PATH. "
                           "See https://www.ncbi.nlm.nih.gov/books/NBK179288/ "
                           "for instructions.")

    tree = lxml_etree.fromstring(xml_bytes, parser=parser)
    # Each article is in a <PubmedArticle> tag, encapsulated in a
    # <PubmedArticleSet> tag.
    if fname is not None:
        pretty_save_xml(tree, fname)
    return tree




[docs]
def get_title(pubmed_id):
    """Get the title of an article in the Pubmed database."""
    article = get_article_xml(pubmed_id)
    if article is None:
        return None
    return _get_title_from_article_element(article)



def _get_title_from_article_element(article):
    title_tag = article.find('ArticleTitle')
    title = None
    if title_tag is not None:
        title = title_tag.text
        if hasattr(title_tag, 'itertext'):
            title = ''.join(list(title_tag.itertext()))
    return title


def _abstract_from_article_element(article, prepend_title=False):
    abstract = article.findall('Abstract/AbstractText')
    if abstract is None:
        return None
    abstract_text = ' '.join(['' if not hasattr(abst, 'itertext')
                              else ' '.join(list(abst.itertext()))
                              for abst in abstract])
    if prepend_title:
        title = _get_title_from_article_element(article)
        if title is not None:
            if not title.endswith('.'):
                title += '.'
            abstract_text = title + ' ' + abstract_text

    return abstract_text



[docs]
def get_abstract(pubmed_id, prepend_title=True):
    """Get the abstract of an article in the Pubmed database."""
    article = get_article_xml(pubmed_id)
    if article is None:
        return None
    return _abstract_from_article_element(article, prepend_title)



# A function to get the text for the element, or None if not found
def _find_elem_text(root, xpath_string):
    elem = root.find(xpath_string)
    return None if elem is None else elem.text


def _get_issue_info(journal: ET.Element):
    # Issue info
    issue = journal.find('JournalIssue')
    issue_volume = _find_elem_text(issue, 'Volume')
    issue_issue = _find_elem_text(issue, 'Issue')

    issue_pub_date = issue.find('PubDate')
    if issue_pub_date is not None:
        # Get issue year
        issue_year = _find_elem_text(issue_pub_date, 'Year')
        issue_year = int(issue_year) if issue_year else None

    else:
        issue_year = None

    return {
        "volume": issue_volume,
        "issue": issue_issue,
        "year": issue_year
    }



[docs]
def get_issn_info(
    medline_citation: ET.Element,
    get_issns_from_nlm: str = "never"
):
    """Given a medline citation, get the issn info from the article

    Parameters
    ----------
    medline_citation : xml.etree.ElementTree.Element
        The MedlineCitation element of the PubMed XML tree.
    get_issns_from_nlm : Literal['never', 'missing', 'always']
        Whether to recover ISSN values from the NLM catalog. Options are
        'never', 'missing', and 'always'. If 'missing', then the ISSN
        values will be recovered from the NLM catalog if they are not found
        in the XML. If 'always', then the ISSN values will be recovered from
        the NLM catalog regardless of whether they are found in the XML.
        Default is 'never' (i.e., never recover from NLM catalog regardless
        of whether they are found in the XML).

    Returns
    -------
    dict
        A dictionary journal, issue, and ISSN info. The structure is as
        follows:
        {
            "journal_title": str,
            "journal_abbrev": str,
            "journal_nlm_id": str,
            "issn_dict": {
                "issn": str,
                "issn_l": str,
                "type": "print"|"electronic"|"other",
                "alternate_issns": List[Tuple[str, str]]  # Optional
            },
            "issue_dict": {
                "volume": str,
                "issue": str,
                "year": int
            }
        }
    """
    if get_issns_from_nlm not in ['never', 'missing', 'always']:
        raise ValueError("get_issns_from_nlm must be one of 'never', "
                         "'missing', or 'always'")
    # Journal info
    journal = medline_citation.find('Article/Journal')
    journal_title = _find_elem_text(journal, 'Title')
    journal_abbrev = _find_elem_text(journal, 'ISOAbbreviation')

    # Issue info
    issue_info = _get_issue_info(journal)

    # Get the ISSN from the article record
    issn_dict = {}
    issn_element = journal.find("ISSN")
    if issn_element is not None:
        issn_type = issn_element.attrib.get("IssnType", "other").lower()
        issn = issn_element.text
        issn_dict["issn"] = issn
        issn_dict["type"] = issn_type

    # Get the linking ISSN from the article record
    issn_linking = _find_elem_text(medline_citation,
                                   "MedlineJournalInfo/ISSNLinking")
    if issn_linking:
        issn_dict["issn_l"] = issn_linking

    nlm_id = _find_elem_text(medline_citation,
                             'MedlineJournalInfo/NlmUniqueID')

    # Get ISSN values from the NLM catalog
    if nlm_id and (
            get_issns_from_nlm == 'always' or
            get_issns_from_nlm == 'missing' and not any(issn_dict.values())
    ):
        nlm_issn_list = get_issns_for_journal(nlm_id)
        if nlm_issn_list:
            issn_dict['alternate_issns'] = nlm_issn_list

    return {
        "journal_title": journal_title,
        "journal_abbrev": journal_abbrev,
        "journal_nlm_id": nlm_id,
        "issn_dict": issn_dict,
        "issue_dict": issue_info,
    }



def _get_journal_info(medline_citation, get_issns_from_nlm: bool):
    # Journal info
    journal = medline_citation.find('Article/Journal')
    journal_title = _find_elem_text(journal, 'Title')
    journal_abbrev = _find_elem_text(journal, 'ISOAbbreviation')

    # Issue info
    issue_info = _get_issue_info(journal)

    # Add the ISSN from the article record
    issn_set = set()
    issn = _find_elem_text(journal, 'ISSN')
    if issn:
        issn_set.add(issn)

    # Add the Linking ISSN from the article record
    issn_linking = _find_elem_text(medline_citation,
                                   'MedlineJournalInfo/ISSNLinking')
    if issn_linking:
        issn_set.add(issn_linking)

    # Now get the list of ISSNs from the NLM Catalog
    nlm_id = _find_elem_text(medline_citation,
                             'MedlineJournalInfo/NlmUniqueID')
    if nlm_id and get_issns_from_nlm:
        nlm_issn_list = get_issns_for_journal(nlm_id)
        if nlm_issn_list:
            issn_set.update(v for _, v in nlm_issn_list)

    # Remove any duplicate issns
    issn_list = list(issn_set)

    return {
        'journal_title': journal_title,
        'journal_abbrev': journal_abbrev,
        'issn_list': issn_list,
        'issn_l': issn_linking,
        'journal_nlm_id': nlm_id,
        'issue': issue_info['issue'],
        'volume': issue_info['volume'],
        'year': issue_info['year'],
    }


def _parse_date_elem(date_elem, with_time=False):
    # Parse a date XML element with <Year>, <Month>, <Day> children (and
    # optionally <Hour>, <Minute>) into a dict of integers.
    res = {}
    year = _find_elem_text(date_elem, "Year")
    if year is not None and year.isdigit():
        res["year"] = int(year)
    month = _find_elem_text(date_elem, "Month")
    if month is not None:
        # Month may be spelled out as "Jul" or "Aug" etc, or may be zero
        # padded, e.g. 03 for March. Convert to integer in either case.
        if month.isdigit():
            res["month"] = int(month)
        else:
            res["month"] = datetime.strptime(month, "%b").month
    day = _find_elem_text(date_elem, "Day")
    if day is not None and day.isdigit():
        res["day"] = int(day)
    if with_time:
        hour = _find_elem_text(date_elem, "Hour")
        if hour is not None and hour.isdigit():
            res["hour"] = int(hour)
        minute = _find_elem_text(date_elem, "Minute")
        if minute is not None and minute.isdigit():
            res["minute"] = int(minute)
    return res


def _get_article_dates(pubmed_article_data: ET.Element) -> dict:
    # Get the article dates from an XML <PubmedArticle> element
    # In MedlineCitation, get, if available:
    #  - DateRevised (contains XML elements <Year>, <Month>, <Day>)
    #  - DateCompleted (contains XML elements <Year>, <Month>, <Day>)
    #  - Article
    #    - ArticleDate
    #    - Journal -> JournalIssue -> PubDate (contains up to three elements:
    #                                          <Year>, <Month>, <Day>)
    # In PubmedData, under History, get all PubMedPubDate elements with their
    # PubStatus attribute, each of which contains XML elements <Year>, <Month>,
    # <Day> and possibly <Hour> and <Minute>

    results = {}

    # Get the dates from the MedlineCitation element
    mc = "./MedlineCitation"
    date_paths = [
        ("date_completed", mc + "/DateCompleted"),
        ("date_revised", mc + "/DateRevised"),
        ("article_date", mc + "/Article/ArticleDate"),
        ("journal_pub_date",
         mc + "/Article/Journal/JournalIssue/PubDate"),
    ]
    for date_type, date_path in date_paths:
        dt = pubmed_article_data.find(date_path)
        if dt is None:
            continue
        results[date_type] = _parse_date_elem(dt)

    # Get dates from History element
    pubmed_pub_dates = \
        pubmed_article_data.findall("./PubmedData/History/PubMedPubDate")
    results["pubmed_pubdates"] = {}
    for pmpd in pubmed_pub_dates:
        pub_status = pmpd.attrib.get("PubStatus", None)
        if pub_status is not None:
            results["pubmed_pubdates"][pub_status] = \
                _parse_date_elem(pmpd, with_time=True)

    return results


def _get_pubmed_publication_date(pubmed_data):
    date_dict = dict.fromkeys(['year', 'month', 'day'])

    # Order potential statuses in order of preferences
    status_list = ['pubmed', 'accepted', 'revised', 'received', 'entrez']

    # Look for various statuses, in order of preference as PubStatus in
    # PubmedPubDate
    for status in status_list:
        pubmed_pub_date = \
                    pubmed_data.find('./History/PubMedPubDate[@PubStatus="%s"]'
                                     % status)
        if pubmed_pub_date is not None:
            break
    else:
        logger.warning("Could not find pub date in: \n%s"
                       % ET.tostring(pubmed_data).decode('utf-8'))
        return date_dict

    def _find_date(element):
        value = _find_elem_text(pubmed_pub_date, element)
        return int(value) if value else None

    # Get date elements from extracted pubmed_pub_date element
    for date_elem in ['Year', 'Month', 'Day']:
        date_dict[date_elem.lower()] = _find_date(date_elem)

    return date_dict


def _parse_author(author_info, include_details=False):
    if not include_details:
        last_name = author_info.find("LastName")
        if last_name is None:
            return None
        return last_name.text

    parsed_info = {
        "last_name": None,
        "first_name": None,
        "initials": None,
        "suffix": None,
        "identifier": None,
        "affiliations": None,
    }
    affiliations = []
    for element in author_info.findall("*"):
        if element.tag == "AffiliationInfo":
            affiliation_name = element.find("Affiliation").text
            identifiers = [e.text for e in element.findall("Identifier")]
            affiliations.append({"name": affiliation_name, "identifiers": identifiers})
        elif element.tag == "LastName":
            parsed_info["last_name"] = element.text
        elif element.tag == "ForeName":
            parsed_info["first_name"] = element.text
        elif element.tag == "Initials":
            parsed_info["initials"] = element.text
        elif element.tag == "Suffix":
            parsed_info["suffix"] = element.text
        elif element.tag == "Identifier":
            parsed_info["identifier"] = element.text
        # This happens for some working groups credited as authors
        elif element.tag == "CollectiveName":
            parsed_info["collective_name"] = element.text
    parsed_info["affiliations"] = affiliations
    return parsed_info


def _get_references(reference_list, only_pmid=True):
    """Return a list of references for an article."""
    if reference_list is None:
        return None

    references = []
    for reference in reference_list.findall('Reference'):
        pmid = _find_elem_text(reference, '*/ArticleId[@IdType="pubmed"]')
        if only_pmid:
            references.append(pmid)
        else:
            ref_dict = {
                'pmid': pmid,
                'doi': _find_elem_text(reference, '*/ArticleId[@IdType="doi"]'),
                'pmcid': _find_elem_text(reference, '*/ArticleId[@IdType="pmcid"]'),
                'citation': _find_elem_text(reference, 'Citation'),
            }
            references.append(ref_dict)
    return references


def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):
    article = medline_citation.find('Article')
    pmid = _find_elem_text(medline_citation, './PMID')
    pii = _find_elem_text(article,
                          './ELocationID[@EIdType="pii"][@ValidYN="Y"]')

    # Look for the DOI in the ELocationID field...
    doi = _find_elem_text(article,
                          './ELocationID[@EIdType="doi"][@ValidYN="Y"]')

    # ...and if that doesn't work, look in the ArticleIdList
    if doi is None:
        doi = _find_elem_text(pubmed_data, './/ArticleId[@IdType="doi"]')

    # Try to get the PMCID
    pmcid = _find_elem_text(pubmed_data, './/ArticleId[@IdType="pmc"]')

    # Title
    title = _get_title_from_article_element(article)

    # Author list
    author_elems = article.findall('AuthorList/Author')
    author_names = None if author_elems is None \
        else [_parse_author(au, detailed_authors) for au in author_elems]

    # Get the page number entry
    page = _find_elem_text(article, 'Pagination/MedlinePgn')

    # Get publication types ('Clinical Trial', 'Review', etc.)
    # They are under the PublicationTypeList element inside the Article
    # Each PublicationType element has a mesh_id set a 'UI' attribute
    # and a type set as the text of the element
    pub_type_list = article.findall('PublicationTypeList/PublicationType')
    pub_types = None if pub_type_list is None else \
        [
            # Get the mesh_id and type for each publication type
            {'mesh_id': pt.attrib.get('UI', None), 'type': pt.text}
            for pt in pub_type_list
        ]

    return {
        'pmid': pmid,
        'pii': pii,
        'doi': doi,
        'pmcid': pmcid,
        'title': title,
        'authors': author_names,
        'page': page,
        'publication_types': pub_types,
    }



[docs]
def get_metadata_from_pubmed_article(
    pubmed_article,
    get_issns_from_nlm: bool = False,
    get_abstracts: bool = False,
    prepend_title: bool = False,
    mesh_annotations: bool = True,
    detailed_authors: bool = False,
    references_included: str = None
):
    """Get metadata for a single PubmedArticle element.

    Parameters
    ----------
    pubmed_article : xml.etree.ElementTree.Element
        A PubmedArticle element from a Pubmed XML tree.
    get_issns_from_nlm : Optional[bool]
        Look up the full list of ISSN number for the journal associated with
        the article, which helps to match articles to CrossRef search results.
        Defaults to False, since it slows down performance.
    get_abstracts : Optional[bool]
        Indicates whether to include the Pubmed abstract in the results.
        Default: False
    prepend_title : Optional[bool]
        If get_abstracts is True, specifies whether the article title should
        be prepended to the abstract text. Default: False
    mesh_annotations : Optional[bool]
        If True, extract mesh annotations from the pubmed entries and include
        in the returned data. If false, don't. Default: True
    detailed_authors : Optional[bool]
        If True, extract as many of the author details as possible, such as
        first name, identifiers, and institutions. If false, only last names
        are returned. Default: False
    references_included : Optional[str]
        If 'detailed', include detailed references in the results. If 'pmid', only include
        the PMID of the reference. If None, don't include references. Default: None

    Returns
    -------
    : Dict
        A dict containing the following fields: 'doi', 'title', 'authors',
        'journal_title', 'journal_abbrev', 'journal_nlm_id', 'issn_list',
        'page', 'volume', 'issue', 'issue_pub_date', 'mesh_annotations',
        'publication_date', 'detailed_publication_dates', 'abstract',
        'publication_types' and 'references'.
    """
    medline_citation = pubmed_article.find('./MedlineCitation')
    pubmed_data = pubmed_article.find('PubmedData')

    # Build the result
    result = {}
    article_info = _get_article_info(medline_citation, pubmed_data, detailed_authors)
    result.update(article_info)
    journal_info = _get_journal_info(medline_citation, get_issns_from_nlm)
    result.update(journal_info)
    if mesh_annotations:
        context_info = _get_annotations(medline_citation)
        result.update(context_info)
    if references_included:
        references = _get_references(pubmed_data.find('ReferenceList'),
                                     only_pmid=(references_included == 'pmid'))
        result['references'] = references

    publication_date = _get_pubmed_publication_date(pubmed_data)
    result['publication_date'] = publication_date
    result["detailed_publication_dates"] = _get_article_dates(pubmed_article)

    # Get the abstracts if requested
    if get_abstracts:
        abstract = _abstract_from_article_element(
            medline_citation.find('Article'),
            prepend_title=prepend_title
        )
        result['abstract'] = abstract
    return result




[docs]
def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
                               get_abstracts=False, prepend_title=False,
                               mesh_annotations=True, detailed_authors=False,
                               references_included=None):
    """Get metadata for an XML tree containing PubmedArticle elements.

    Documentation on the XML structure can be found at:
        - https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
        - https://www.nlm.nih.gov/bsd/licensee/elements_alphabetical.html

    Parameters
    ----------
    tree : xml.etree.ElementTree
        ElementTree containing one or more PubmedArticle elements.
    get_issns_from_nlm : Optional[bool]
        Look up the full list of ISSN number for the journal associated with
        the article, which helps to match articles to CrossRef search results.
        Defaults to False, since it slows down performance.
    get_abstracts : Optional[bool]
        Indicates whether to include the Pubmed abstract in the results.
        Default: False
    prepend_title : Optional[bool]
        If get_abstracts is True, specifies whether the article title should
        be prepended to the abstract text. Default: False
    mesh_annotations : Optional[bool]
        If True, extract mesh annotations from the pubmed entries and include
        in the returned data. If false, don't. Default: True
    detailed_authors : Optional[bool]
        If True, extract as many of the author details as possible, such as
        first name, identifiers, and institutions. If false, only last names
        are returned. Default: False
    references_included : Optional[str]
        If 'detailed', include detailed references in the results. If 'pmid', only include
        the PMID of the reference. If None, don't include references. Default: None

    Returns
    -------
    dict[str, dict]
        Dictionary indexed by PMID. Each value is a dict containing the
        following fields: 'doi', 'title', 'authors', 'journal_title',
        'journal_abbrev', 'journal_nlm_id', 'issn_list', 'page',
        'volume', 'issue', 'issue_pub_date', 'mesh_annotations',
        'publication_date', 'abstract', 'publication_types' and 'references'.
    """
    # Iterate over the articles and build the results dict
    results = {}
    pm_articles = tree.findall('./PubmedArticle')
    for pm_article in pm_articles:
        result = get_metadata_from_pubmed_article(
            pm_article,
            get_issns_from_nlm=get_issns_from_nlm,
            get_abstracts=get_abstracts,
            prepend_title=prepend_title,
            mesh_annotations=mesh_annotations,
            detailed_authors=detailed_authors,
            references_included=references_included,
        )
        # Add to dict
        results[result["pmid"]] = result

    return results




[docs]
def get_mesh_annotations(pmid):
    """Return a list of MeSH annotations for a given PubMed ID.

    Parameters
    ----------
    pmid : str
        A PubMed ID.

    Returns
    -------
    list of dict
        A list of dicts that represent MeSH annotations with the following keys:
        "mesh" representing the MeSH ID, "text" the standrd name associated with
        the MeSH ID, "major_topic" a boolean flag set depending on whether
        the given MeSH ID is assigned as a major topic to the article, and
        "qualifier" which is a MeSH qualifier ID associated with the annotation,
        if available, otherwise None.
    """
    full_xml_tree = get_full_xml(pmid)
    if full_xml_tree is None:
        return None
    medline_citation = full_xml_tree.find('PubmedArticle/MedlineCitation')
    if medline_citation is None:
        return None
    annotations = _get_annotations(medline_citation)
    return annotations.get('mesh_annotations')



def _get_annotations(medline_citation):

    def _major_topic(e):
        if e is not None and e.get('MajorTopicYN').upper() == 'Y':
            return True
        return False

    info = []
    for elem in medline_citation.findall('.//MeshHeading'):
        dname = elem.find('DescriptorName')
        qualifier_elems = elem.findall('QualifierName')

        mid = dname.attrib['UI']
        major = _major_topic(dname) or any(_major_topic(qual) for qual
                                           in qualifier_elems)
        qualifiers = [{'text': qual.text, 'mesh': qual.attrib['UI']}
                      for qual in qualifier_elems]
        qual = qualifiers[0] if qualifiers else None

        info.append({'type': 'main', 'mesh': mid, 'text': dname.text,
                     'major_topic': major,
                     # This is only here for backwards compatibility with
                     # INDRA DB which expects a single qualifier or None and
                     # turns the single qualifier into an int internally, so
                     # we can't easily put a joined string of multiple
                     # qualifiers here.
                     'qualifier': qual,
                     # This is the proper full list of qualifiers
                     'qualifiers': qualifiers})
    for elem in medline_citation.findall('.//SupplMeshList/SupplMeshName'):
        info.append({'type': 'supplementary', 'mesh': elem.attrib['UI'], 'text': elem.text,
                     'qualifier': None, 'qualifiers': [],
                     'major_topic': False})
    return {'mesh_annotations': info}



[docs]
def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
                         get_abstracts=False, prepend_title=False,
                         detailed_authors=False, references_included=None):
    """Get article metadata for up to 200 PMIDs from the Pubmed database.

    Parameters
    ----------
    pmid_list : list of str
        Can contain 1-200 PMIDs.
    get_issns_from_nlm : bool
        Look up the full list of ISSN number for the journal associated with
        the article, which helps to match articles to CrossRef search results.
        Defaults to False, since it slows down performance.
    get_abstracts : bool
        Indicates whether to include the Pubmed abstract in the results.
    prepend_title : bool
        If get_abstracts is True, specifies whether the article title should
        be prepended to the abstract text.
    detailed_authors : bool
        If True, extract as many of the author details as possible, such as
        first name, identifiers, and institutions. If false, only last names
        are returned. Default: False
    references_included : Optional[str]
        If 'detailed', include detailed references in the results. If 'pmid', only include
        the PMID of the reference. If None, don't include references. Default: None

    Returns
    -------
    dict[str, dict]
        Dictionary indexed by PMID. Each value is a dict containing the
        following fields: 'doi', 'title', 'authors', 'journal_title',
        'journal_abbrev', 'journal_nlm_id', 'issn_list', 'page'.
    """
    if len(pmid_list) > 200:
        raise ValueError("Metadata query is limited to 200 PMIDs at a time.")
    params = {'db': 'pubmed',
              'retmode': 'xml',
              'id': pmid_list}
    tree = send_request(pubmed_fetch, params)
    if tree is None:
        return None
    return get_metadata_from_xml_tree(tree, get_issns_from_nlm, get_abstracts,
                                      prepend_title,
                                      detailed_authors=detailed_authors,
                                      references_included=references_included)




[docs]
def get_metadata_for_all_ids(pmid_list, get_issns_from_nlm=False,
                             get_abstracts=False, prepend_title=False,
                             detailed_authors=False, references_included=None):
    """Get article metadata for any number of PMIDs from the Pubmed database.

    This differs from get_metadata_for_ids in that it can handle any number of
    PMIDs, and implements batch iteration to avoid the 200 PMID limit of the
    Pubmed API.

    Parameters
    ----------
    pmid_list : list of str
        Can contain any number of PMIDs.
    get_issns_from_nlm : bool
        Look up the full list of ISSN number for the journal associated with
        the article, which helps to match articles to CrossRef search results.
        Defaults to False, since it slows down performance.
    get_abstracts : bool
        Indicates whether to include the Pubmed abstract in the results.
    prepend_title : bool
        If get_abstracts is True, specifies whether the article title should
        be prepended to the abstract text.
    detailed_authors : bool
        If True, extract as many of the author details as possible, such as
        first name, identifiers, and institutions. If false, only last names
        are returned. Default: False
    references_included : Optional[str]
        If 'detailed', include detailed references in the results. If 'pmid', only include
        the PMID of the reference. If None, don't include references. Default: None

    Returns
    -------
    dict of dicts
        Dictionary indexed by PMID. Each value is a dict containing the
        following fields: 'doi', 'title', 'authors', 'journal_title',
        'journal_abbrev', 'journal_nlm_id', 'issn_list', 'page'.
    """
    all_metadata = {}
    for ids in tqdm.tqdm(batch_iter(pmid_list, 200), desc='Retrieving metadata',
                         total=len(pmid_list)//200+1):
        time.sleep(0.1)
        metadata = get_metadata_for_ids(list(ids),
                                        get_issns_from_nlm=get_issns_from_nlm,
                                        get_abstracts=get_abstracts,
                                        prepend_title=prepend_title,
                                        detailed_authors=detailed_authors,
                                        references_included=references_included)
        if metadata is not None:
            all_metadata.update(metadata)
    return all_metadata




[docs]
@lru_cache(maxsize=1000)
def get_issns_for_journal(nlm_id):
    """Get a dict of the ISSN numbers for a journal given its NLM ID.

    Information on NLM XML DTDs is available at
    https://www.nlm.nih.gov/databases/dtd/
    """
    params = {'db': 'nlmcatalog',
              'retmode': 'xml',
              'id': nlm_id}
    tree = send_request(pubmed_fetch, params)
    if tree is None:
        return None
    issn_list = [(e.attrib.get("IssnType", "other").lower(), e.text)
                 for e in tree.findall('.//ISSN')]
    issn_linking = tree.find('.//ISSNLinking')
    if issn_linking:
        issn_list.append(("linking", issn_linking.text))

    # No ISSNs found!
    if not any(v for k, v in issn_list):
        return None
    return issn_list




[docs]
def get_nct_ids_from_article_xml(article) -> List[str]:
    """Extract NCT IDs from a PubMed article XML

    Parameters
    ----------
    article :
        An XML Element representing a PubMed article.

    Returns
    -------
    :
        The NCT IDs associated with the given PubMed article.
    """
    # Find all DataBank elements in the article and check if they are from
    # ClinicalTrials.gov. If so, extract the AccessionNumberList and append
    # the AccessionNumbers to the list
    nct_ids = []
    for databank in article.findall(".//DataBank"):
        name = databank.find("DataBankName")
        if name is not None and name.text == "ClinicalTrials.gov":
            accession_list = databank.find("AccessionNumberList")
            if accession_list is not None:
                for acc in accession_list.findall("AccessionNumber"):
                    nct_ids.append(acc.text)
    return nct_ids




[docs]
def get_nct_ids_from_full_xml(tree) -> Dict[str, List[str]]:
    """Get the NCT IDs for a given PubMed ID from the full XML.

    Parameters
    ----------
    tree :
        An XML Element representing the full PubMed XML tree.

    Returns
    -------
    dict[str, list[str]]
        A list of NCT IDs associated with the given PubMed ID.
    """
    # Find all AccessionNumbers under ClinicalTrials.gov
    nct_ids_by_pmid = {}
    for article in tqdm.tqdm(
        tree.findall(".//PubmedArticle"),
        desc="Extracting NCT IDs",
        unit_scale=True,
        unit='article'
    ):
        pmid_element = article.find(".//PMID")
        if pmid_element is None or not pmid_element.text:
            continue
        pmid = pmid_element.text
        nct_ids = get_nct_ids_from_article_xml(article)
        if nct_ids:
            if pmid not in nct_ids_by_pmid:
                nct_ids_by_pmid[pmid] = []
            nct_ids_by_pmid[pmid] = nct_ids

    return nct_ids_by_pmid




[docs]
def get_nct_ids_for_pmid(pmid: str) -> List[str]:
    """Get the NCT IDs for a given PubMed ID.

    Parameters
    ----------
    pmid : str
        A PubMed ID.

    Returns
    -------
    list[str]
        A list of NCT IDs associated with the given PubMed ID.
    """
    full_xml_tree = get_full_xml(pmid)
    if full_xml_tree is None:
        return []
    nct_ids_by_pmid = get_nct_ids_from_full_xml(full_xml_tree)
    return nct_ids_by_pmid.get(pmid, [])




[docs]
def get_nct_ids_for_pmids(
    pmid_list: List[str],
    rest_api_fallback: bool = True
) -> Dict[str,
List[str]]:
    """Get the NCT IDs for a list of PubMed IDs.

    Parameters
    ----------
    pmid_list : list[str]
        A list of PubMed IDs.
    rest_api_fallback : bool
        If True, fall back to the REST API if the full XML fetch using the
        edirect CLI fails.

    Returns
    -------
    dict
        A dictionary mapping each PubMed ID to a list of NCT IDs associated with it.
    """
    try:
        full_xml_tree = get_full_xml_by_pmids(pmid_list)
        if full_xml_tree is None:
            return {}
        return get_nct_ids_from_full_xml(full_xml_tree)
    except RuntimeError as e:
        if rest_api_fallback:
            logger.warning(f"Failed to fetch full XML for PMIDs {pmid_list}, "
                            "falling back to REST API.")
            nct_ids_by_pmid = {}
            for pmid in tqdm.tqdm(
                pmid_list,
                desc='Looking up NCT IDs',
                unit_scale=True,
                unit='PMID'
            ):
                nct_ids = get_nct_ids_for_pmid(pmid)
                if nct_ids:
                    nct_ids_by_pmid[pmid] = nct_ids
        else:
            logger.error(f"Failed to fetch full XML for PMIDs {pmid_list}")
            raise e




[docs]
def expand_pagination(pages):
    """Convert a page number to long form, e.g., from 456-7 to 456-457."""
    # If there is no hyphen, it's a single page, and we're good to go
    parts = pages.split('-')
    if len(parts) == 1: # No hyphen, so no split
        return pages
    elif len(parts) == 2:
        start = parts[0]
        end = parts[1]
        # If the end is the same number of digits as the start, then we
        # don't change anything!
        if len(start) == len(end):
            return pages
        # Otherwise, replace the last digits of start with the digits of end
        num_end_digits = len(end)
        new_end = start[:-num_end_digits] + end
        return '%s-%s' % (start, new_end)
    else: # More than one hyphen, something weird happened
        logger.warning("Multiple hyphens in page number: %s" % pages)
        return pages




[docs]
def get_substance_annotations(pubmed_id: str) -> List[str]:
    """Return substance MeSH ID for a given PubMedID.

    Note that substance annotations often overlap with MeSH annotations,
    however, there are cases where a substance annotation is not available
    under MeSH annotations.

    Parameters
    ----------
    pubmed_id :
        PubMedID ID whose substance MeSH ID will be returned.

    Returns
    -------
    :
        Substance MeSH IDs corresponding to the given PubMed paper or
        if None present or a failed query, an empty list will be returned.
    """
    root = get_full_xml(pubmed_id)
    nodes = root.findall('.//MedlineCitation/ChemicalList')
    if len(nodes) == 0:
        logger.error('Could not retrieve substance MeSH IDs for %s' % pubmed_id)
        return []

    uid = [b.attrib.get('UI') for node in nodes
           for c in list(node) for b in c.iter('*')
           if 'UI' in b.attrib]
    return uid




[docs]
def get_all_ids(search_term):
    """Return all PMIDs for a search term using the edirect CLI.

    This function complements the `get_id` function which uses the PubMed
    REST API but is limited to 10k results and is very difficult to
    generalize to systematically fetch all IDs if there are more than 10k
    results. This function uses the edirect CLI which implements logic
    for paging over results.

    This function only works if edirect is installed and is on your PATH.
    See https://www.ncbi.nlm.nih.gov/books/NBK179288/ for instructions.

    Parameters
    ----------
    search_term : str
        A term for which the PubMed search should be performed.

    Returns
    -------
    list[str]
        A list of PMIDs for the given search term.
    """
    cmd = f'esearch -db pubmed -query "{search_term}" | efetch -format uid'
    res = subprocess.getoutput(cmd)
    if not isinstance(res, str) or "not found" in res:
        raise RuntimeError("The esearch utility could not be found. "
                           "This function only works if edirect is "
                           "installed and is visible on your PATH. "
                           "See https://www.ncbi.nlm.nih.gov/books/NBK179288/ "
                           "for instructions.")
    # Output is divided by new lines
    elements = res.split('\n')
    # The CLI automatically retries on errors, subprocess.getoutput unfortunately
    # adds the error message associated with the retry to the output, so we need
    # to filter out non-numeric elements
    pmids = [e for e in elements if e.isdigit()]
    return pmids




[docs]
def get_publication_types(article: ET.Element):
    """Return the set of PublicationType for the article

    Parameters
    ----------
    article :
        The XML element for the article. Typically, this is a PubmedArticle
        node.

    Returns
    -------
    : set[str]
        A set of publication type
    """
    return {pt.text for pt in article.find('.//PublicationTypeList')}




[docs]
def is_retracted(pubmed_id: str) -> bool:
    """Return True if the article with the given PMID has been retracted.

    Parameters
    ----------
    pubmed_id :
        The PMID of the paper to check.

    Returns
    -------
    :
        True if the paper has been retracted, False otherwise.
    """
    return retractions.is_retracted(pubmed_id)




[docs]
def generate_retractions_file(
    xml_path: str, download_missing: bool = False, max_workers: int = 1
):
    """Generate a CSV file of retracted papers from the PubMed XML.

    Parameters
    ----------
    xml_path :
        Path to the directory holding the PubMed XML files. The files will
        be globbed from this directory using the pattern 'pubmed*.xml.gz'.
    download_missing :
        If True, download any missing XML files from the PubMed FTP server.
        Default: False. Note: A full download of the PubMed XML files takes up
        to 5 hours.
    max_workers :
        Number of parallel download threads. Default: 1 (serial). Maximum: 4.
    """
    if download_missing:
        ensure_xml_files(xml_path, max_workers=max_workers)
    retractions = set()

    files = glob.glob(os.path.join(xml_path, 'pubmed*.xml.gz'))
    if not files:
        raise FileNotFoundError(f"No PubMed XML files found in {xml_path}")

    for xml_file in tqdm.tqdm(files, desc="Processing PubMed XML files"):
        xml_str = gzip.open(xml_file).read()
        tree = ET.XML(xml_str, parser=UTB())
        for article in tree.findall('.//PubmedArticle'):
            pub_types = get_publication_types(article)
            if "Retracted Publication" in pub_types:
                pmid = article.find('.//PMID').text
                retractions.add(pmid)

    if not retractions:
        logger.warning(f"No retractions found from {len(files)} XML files")
        return

    logger.info(f"Writing {len(retractions)} retractions to {RETRACTIONS_FILE}")
    with open(RETRACTIONS_FILE, 'w') as fh:
        fh.write('\n'.join(sorted(retractions)))




[docs]
def ensure_xml_files(
    xml_path: str,
    retries: int = 3,
    raise_http_error: bool = True,
    raise_checksum_error: bool = False,
    force: bool = False,
    max_workers: int = 1,
) -> None:
    """Ensure that the XML files are downloaded and up to date.

    This function downloads the full archive published by PubMed at
    https://ftp.ncbi.nlm.nih.gov/pubmed/baseline and
    https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles which contains citation
    records holding metadata and abstracts in XML format. The baseline archive
    is updated yearly, while the baseline archive is updated daily and includes
    new, revised, and deleted citations. After downloading this archive, it can
    be used to extract e.g. mesh annotation of articles, publication year,
    retractions, author information. The files in the archive constsist of a
    set of gzipped XML files, with each XML file containing multiple records for
    a set of publications. See
    https://dtd.nlm.nih.gov/ncbi/pubmed/doc/out/250101/index.html for more
    information about this archive.

    Use this function to create a complete data set from all available citation
    records. If only a subset of records is needed, use e.g.
    `get_metadata_for_all_ids` in this module to get metadata from a list of
    pmids.

    Parameters
    ----------
    xml_path :
        Path to the directory holding the PubMed XML files. The files will
        be globbed from this directory using the pattern 'pubmed*.xml.gz'.
    retries :
        Number of times to retry downloading an individual XML file if there
        is an HTTP error. Default: 3.
    raise_http_error :
        If True, raise an HTTPError if an XML file cannot be downloaded after
        the specified number of retries. If False, log a warning and skip the
        file.
    raise_checksum_error :
        If True, raise a ValueError if the checksum of a downloaded XML file
        does not match the expected checksum. If False, log a warning and skip
        the file. Default: False.
    force :
        If True, force re-download of all XML files, even if they already exist.
    max_workers :
        Number of parallel download threads. Default: 1 (serial). Maximum: 4.
    """
    max_workers = max(1, min(4, max_workers))
    xml_path = Path(xml_path)
    xml_path.mkdir(parents=True, exist_ok=True)

    basefiles = [u for u in _get_urls(pubmed_archive_baseline)]
    updatefiles = [u for u in _get_urls(pubmed_archive_update)]

    total_files = len(basefiles) + len(updatefiles)

    def _ensure_one(xml_url: str) -> bool:
        xml_file_path = xml_path.joinpath(xml_url.split("/")[-1])
        if force or not xml_file_path.exists():
            success = _download_xml_gz(
                xml_url,
                xml_file_path,
                raise_http_error=raise_http_error,
                md5_check=raise_checksum_error,
                retries=retries,
            )
            if not success:
                tqdm.tqdm.write(f"Error downloading {xml_url}, skipping")
                return False
        return True

    urls = basefiles + updatefiles
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm.tqdm(
            executor.map(_ensure_one, urls),
            total=total_files,
            desc="Downloading PubMed XML files",
            unit="file",
        ))
    skips = sum(1 for success in results if not success)

    if skips:
        logger.warning(f"Skipped {skips} out of {total_files}")
        missing_files = [
            xml_url for xml_url in basefiles + updatefiles
            if not xml_path.joinpath(xml_url.split("/")[-1]).exists()
        ]
        missing_str = "\n".join(missing_files)

        logger.warning(f"Missing {len(missing_files)} files:\n{missing_str}")



def _get_urls(index_url: str):
    """Get the paths to all XML files on the PubMed FTP server."""
    from bs4 import BeautifulSoup

    logger.info("Getting URL paths from %s" % index_url)

    # Get root page with the XML files
    response = requests.get(index_url)
    response.raise_for_status()

    # Make soup
    # Todo: see if it's possible to get the lists of files directly from the
    #  FTP server, rather than scraping the HTML
    soup = BeautifulSoup(response.text, "html.parser")

    # Append trailing slash if not present
    index_url = index_url if index_url.endswith("/") else index_url + "/"

    # Loop over all links
    for link in soup.find_all("a"):
        href = link.get("href")
        # yield if href matches
        # 'pubmed<2 digit year>n<4 digit file index>.xml.gz'
        # but skip the md5 files
        if href and href.startswith("pubmed") and href.endswith(".xml.gz"):
            yield index_url + href


def _download_xml_gz(
    xml_url: str,
    xml_file: Path,
    raise_http_error: bool = False,
    md5_check: bool = True,
    retries: int = 3,
) -> bool:
    try:
        resp = requests.get(xml_url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        if retries > 0:
            tqdm.tqdm.write(f"Error downloading {xml_url}, retrying." + str(e))
            sleep(1)
            return _download_xml_gz(
                xml_url, xml_file, raise_http_error, md5_check, retries - 1
            )
        else:
            if raise_http_error:
                logger.error(f"Error downloading {xml_url}: {e}")
                raise e
            tqdm.tqdm.write(f"Error downloading {xml_url}, skipping")
            return False

    if md5_check:
        from hashlib import md5
        md5_resp = requests.get(xml_url + ".md5")
        checksum = md5(resp.content).hexdigest()
        expected_checksum = re.search(
            r"[0-9a-z]+(?=\n)", md5_resp.content.decode("utf-8")
        ).group()
        if checksum != expected_checksum:
            logger.warning(
                f"Checksum mismatch for {xml_url}, skipping download"
            )
            raise ValueError("Checksum mismatch")

    # Write the file xml.gz file
    with xml_file.open("wb") as fh:
        fh.write(resp.content)

    return True



[docs]
def get_pmid_to_package_url_mapping(fname=None) -> Dict[str, str]:
    """Return a mapping from PMID to a PMC .tar.gz package URL.

    The assignment of PMIDs to specific PMC downloadable files
    in which extended article elements are available does not follow
    a specific pattern and therefore explicit mappings from PMIDs
    to PMC package URLs are required.

    Parameters
    ----------
    fname : Optional[str]
        Optional path to a CSV file containing the mappings data file
        serving as a cache. It can be obtained from
        https://ftp.ncbi.nlm.nih.gov/pub/pmc/deprecated/oa_file_list.csv.
        If not provided, it is downloaded from this URL.

    Returns
    -------
    :
        A dictionary mapping PMIDs to PMC package URLs.
    """
    if fname:
        reader = csv.DictReader(open(fname, 'r'))
    else:
        logger.info("Downloading PMC file list")
        res = requests.get(pmid_to_pmc_download_url)
        res.raise_for_status()
        reader = csv.DictReader(StringIO(res.text))
    mapping = {
        row["PMID"]: f"{pmc_ftp_base_url}/{row['File']}"
        for row in tqdm.tqdm(reader, desc="Generating PMID to PMC URL mapping")
    }
    return mapping




[docs]
def download_package_for_pmid(pmid: str, out_dir: str,
                              mapping: Optional[Dict[str, str]] = None):
    """Return path to the PMC package downloaded for a given PMID.

    Parameters
    ----------
    pmid : str
        The PubMed ID for which the package should be downloaded.
    out_dir : str
        The directory where the package should be downloaded.
    mapping : Optional[Dict[str, str]]
        A mapping from PMIDs to PMC package URLs. If None, the mapping
        is fetched from the NCBI FTP server (slow). The mapping can be
        obtained from
        https://ftp.ncbi.nlm.nih.gov/pub/pmc/deprecated/oa_file_list.csv
        and loaded using `get_pmid_to_package_url_mapping`.

    Returns
    -------
    : str
        The path to the downloaded package file.
    """
    if mapping is None:
        mapping = get_pmid_to_package_url_mapping()

    if pmid not in mapping:
        raise ValueError(f"PMID {pmid} not found in the PMC OA mapping.")

    url = mapping[pmid]
    filename = os.path.basename(url)
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    tar_path = out_path / filename

    logger.info(f"Downloading {url}")
    r = requests.get(url, stream=True)
    r.raise_for_status()
    with open(tar_path, 'wb') as f:
        for chunk in r.iter_content():
            f.write(chunk)
    return tar_path.as_posix()




[docs]
def download_package_for_pmids(pmid_list: List[str], out_dir: str,
                               mapping: Optional[Dict[str, str]] = None):
    """Return paths of PMC packages downloaded for a given list of PMIDs.

    Parameters
    ----------
    pmid_list : List[str]
        A list of PubMed IDs for which the packages should be downloaded.
    out_dir : str
        The directory where the packages should be downloaded.
    mapping : Optional[Dict[str, str]]
        A mapping from PMIDs to PMC package URLs. If None, the mapping
        is fetched from the NCBI FTP server (slow). The mapping can be
        obtained from
        https://ftp.ncbi.nlm.nih.gov/pub/pmc/deprecated/oa_file_list.csv
        and loaded using `get_pmid_to_package_url_mapping`.

    Returns
    -------
    : dict
        A dictionary mapping PMIDs to the paths of the downloaded package files.
        If a package could not be downloaded, the PMID key will not be present.
    """
    if mapping is None:
        mapping = get_pmid_to_package_url_mapping()
    fnames = {}
    for pmid in pmid_list:
        try:
            fname = download_package_for_pmid(pmid, out_dir, mapping)
            fnames[pmid] = fname
        except Exception as e:
            logger.info(f"Error downloading {pmid}: {e}")
    return fnames



class Retractions:
    def __init__(self):
        self.retractions = None

    def is_retracted(self, pmid):
        if self.retractions is None:
            with open(RETRACTIONS_FILE, 'r') as fh:
                self.retractions = set(fh.read().splitlines())
        return pmid in self.retractions


retractions = Retractions()