# Source code for indra.literature.biorxiv_client

"""A client to obtain metadata and text content from bioRxiv
(and to some extent medRxiv) preprints."""
import re
import logging
import requests
import datetime


logger = logging.getLogger(__name__)


# Browser link at https://connect.biorxiv.org/relate/content/181
# Prefix of the JSON endpoint for a collection; the collection identifier
# is appended directly to this string to form the request URL.
collection_url = 'https://connect.biorxiv.org/relate/collection_json.php?grp='
# Identifier of the COVID-19 collection (the one shown at the browser link
# above).
covid19_collection_id = '181'
# Content URL prefixes for bioRxiv and medRxiv.
# NOTE(review): these are not referenced by the functions visible in this
# module -- presumably kept for external callers; confirm before removing.
bio_content_url = 'https://www.biorxiv.org/content/'
med_content_url = 'https://www.medrxiv.org/content/'


def get_collection_pubs(collection_id, min_date=None):
    """Get list of publication entries from a biorxiv/medrxiv collection.

    Parameters
    ----------
    collection_id : str
        The identifier of the collection to fetch.
    min_date : Optional[datetime.datetime]
        A datetime object representing a cutoff. If given, only
        publications that were released on or after the given date
        are returned. By default, no date constraint is applied.

    Returns
    -------
    list of dict
        A list of the publication entries which include the abstract and
        other metadata.

    Raises
    ------
    requests.HTTPError
        If the collection endpoint responds with an error status.
    """
    res = requests.get(collection_url + collection_id)
    res.raise_for_status()
    pubs = res.json()['rels']
    if min_date is None:
        return pubs

    recent_pubs = []
    for pub in pubs:
        try:
            date = datetime.datetime.strptime(pub.get('rel_date'),
                                              '%Y-%m-%d')
        except (TypeError, ValueError):
            # A missing date (None -> TypeError) or a malformed date string
            # (ValueError) means we cannot apply the cutoff, so the entry
            # is skipped rather than failing the whole request.
            continue
        if date >= min_date:
            recent_pubs.append(pub)
    return recent_pubs
def get_collection_dois(collection_id, min_date=None):
    """Get list of DOIs from a biorxiv/medrxiv collection.

    Parameters
    ----------
    collection_id : str
        The identifier of the collection to fetch.
    min_date : Optional[datetime.datetime]
        A datetime object representing a cutoff. If given, only
        publications that were released on or after the given date
        are considered. By default, no date constraint is applied.

    Returns
    -------
    list of str
        The DOIs of the publications in the collection. Entries without
        a (non-empty) `rel_doi` value are left out.
    """
    dois = []
    for entry in get_collection_pubs(collection_id, min_date=min_date):
        doi = entry.get('rel_doi')
        if doi:
            dois.append(doi)
    return dois
def get_pdf_xml_url_base(content):
    """Return base URL to PDF/XML based on the content of the landing page.

    The base is the value of the ``citation_pdf_url`` meta tag with its
    trailing ``.full.pdf`` suffix removed.

    Parameters
    ----------
    content : str
        The content of the landing page for an rxiv paper.

    Returns
    -------
    str or None
        The base URL if available, otherwise None.
    """
    # The leading greedy '.*' together with re.S lets re.match scan the whole
    # (multi-line) page and pick up the last occurrence of the tag. The dots
    # of the '.full.pdf' suffix are escaped so that only a literal suffix is
    # accepted.
    match = re.match(
        r'.*"citation_pdf_url" content="([^"]+)\.full\.pdf"',
        content, re.S)
    if match:
        return match.group(1)
    return None
def get_text_url_base(content):
    """Return base URL to full text based on the content of the landing page.

    The base is the value of the ``citation_html_url`` meta tag with its
    trailing ``.full`` suffix removed.

    Parameters
    ----------
    content : str
        The content of the landing page for an rxiv paper.

    Returns
    -------
    str or None
        The base URL if available, otherwise None.
    """
    # The leading greedy '.*' together with re.S lets re.match scan the whole
    # (multi-line) page and pick up the last occurrence of the tag. The dot
    # of the '.full' suffix is escaped so that only a literal suffix is
    # accepted.
    match = re.match(
        r'.*"citation_html_url" content="([^"]+)\.full"',
        content, re.S)
    if match:
        return match.group(1)
    return None
def get_formats(pub):
    """Return formats available for a publication JSON.

    Parameters
    ----------
    pub : dict
        The JSON dict description a publication. It must contain a
        `rel_link` entry pointing to the landing page of the article.

    Returns
    -------
    dict
        A dict with available formats as its keys (abstract, pdf, xml, txt)
        and either the content (in case of abstract) or the URL
        (in case of pdf, xml, txt) as the value.
    """
    formats = {}
    if 'rel_abs' in pub:
        formats['abstract'] = pub['rel_abs']
    # The publication JSON does not contain enough information generally
    # to identify the URL for the various formats. Therefore we have to
    # load the landing page for the article and parse out various URLs
    # to reliably get to the desired content.
    landing_page = requests.get(pub['rel_link']).text

    # The URL for the full PDF and XML is often different in format than
    # the landing page URL so we need to get the link to it from the content
    # of the landing page. The XML URL doesn't explicitly appear in the
    # page content therefore we work with the citation_pdf_url and get
    # URLs for both the PDF and the XML.
    pdf_xml_url_base = get_pdf_xml_url_base(landing_page)
    if pdf_xml_url_base:
        formats['pdf'] = pdf_xml_url_base + '.full.pdf'
        formats['xml'] = pdf_xml_url_base + '.source.xml'
    text_url_base = get_text_url_base(landing_page)
    if text_url_base:
        # The base has its '.full' suffix (including the dot) stripped, so
        # the extension needs its own dot separator, like the PDF and XML
        # suffixes above.
        formats['txt'] = text_url_base + '.txt'
    return formats
def get_content_from_pub_json(pub, format):
    """Get text content based on a given format from a publication JSON.

    In the case of abstract, the content is returned from the JSON directly.
    For pdf, the content is returned as bytes that can be dumped into a file.
    For txt and xml, the text is processed out of either the raw XML or text
    content that rxiv provides.

    Parameters
    ----------
    pub : dict
        The JSON dict description a publication.
    format : str
        The format, if available, via which the content should be obtained.
        One of abstract, pdf, xml or txt.

    Returns
    -------
    bytes, str or None
        The content in the requested format (bytes for pdf, str otherwise),
        or None if the content is not available in that format.
    """
    # The abstract is directly accessible in the pub JSON, under the same
    # key that get_formats reads, so we can return it without loading the
    # landing page.
    if format == 'abstract':
        return pub.get('rel_abs')

    formats = get_formats(pub)
    if format not in formats:
        logger.warning('Content not available in format %s', format)
        return None

    # For PDFs we return the result in bytes that can then be dumped
    # into a file.
    if format == 'pdf':
        return requests.get(formats[format]).content
    # For xml and text, we return the result as str
    if format == 'xml':
        return get_text_from_rxiv_xml(requests.get(formats[format]).text)
    if format == 'txt':
        return get_text_from_rxiv_text(requests.get(formats[format]).text)
    return None
def get_text_from_rxiv_xml(rxiv_xml):
    """Return clean text from the raw rxiv xml content.

    Parameters
    ----------
    rxiv_xml : str
        The content of the rxiv full xml as obtained from the web.

    Returns
    -------
    str
        The text content stripped out from the raw full xml.
    """
    # FIXME: this is a very naive initial solution, we should instead
    # traverse the XML structure properly to get the content.
    # '[^>]*' (unlike '.*?' without re.S) also matches line breaks, so tags
    # whose attributes are wrapped over several lines are removed as well.
    text = re.sub(r'<[^>]*>', '', rxiv_xml)
    return text
def get_text_from_rxiv_text(rxiv_text):
    """Return clean text from the raw rxiv text content.

    This function parses out the title, subheadings, and the content of
    sections under headings/subheadings. The first non-empty line is taken
    as the title; everything between the title and the first ``## `` heading
    is dropped. Heading lines themselves are not included in the output.
    It filters out some irrelevant content e.g., references and footnotes.

    Parameters
    ----------
    rxiv_text : str
        The content of the rxiv full text as obtained from the web.

    Returns
    -------
    str
        The text content stripped out from the raw full text, one line per
        retained input line. An empty string is returned if the input
        contains no non-blank lines.
    """
    skip_section = {'References', 'Footnotes', 'Acknowledgements',
                    'Supplementary Figures', 'Declaration of Interests',
                    'Author Contributions', 'Code and data availability'}
    heading_pattern = re.compile(r'## (.+)')
    subheading_pattern = re.compile(r'### (.+)')

    lines = [line.strip() for line in rxiv_text.split('\n') if line.strip()]
    if not lines:
        return ''

    kept = [lines[0]]
    # None until the first heading is seen; content preceding the first
    # heading is not part of any section and is left out.
    current_section = None
    for line in lines[1:]:
        match_heading = heading_pattern.match(line)
        if match_heading:
            current_section = match_heading.group(1)
            continue
        if current_section is None or current_section in skip_section:
            continue
        match_subheading = subheading_pattern.match(line)
        kept.append(match_subheading.group(1) if match_subheading else line)
    return '\n'.join(kept) + '\n'
if __name__ == '__main__':
    import os
    import json

    # The collection metadata is cached in a local JSON file; it is only
    # fetched from the web (and then written to the cache) if the file does
    # not exist yet.
    fname = 'covid19_pubs.json'
    if not os.path.exists(fname):
        covid19_pubs = get_collection_pubs(covid19_collection_id)
        with open(fname, 'w') as fh:
            json.dump(covid19_pubs, fh)
    else:
        with open(fname, 'r') as fh:
            covid19_pubs = json.load(fh)

    # Progress messages, keyed by the format that is fetched.
    messages = {
        'txt': 'Getting text for %s',
        'xml': 'Getting xml for %s',
        'abstract': 'Getting abstract for %s',
    }

    # For each publication get the richest content available: plain text
    # is preferred over XML, and the abstract is the fallback.
    contents = {}
    for pub in covid19_pubs:
        doi = pub['rel_doi']
        available = get_formats(pub)
        chosen = next((fmt for fmt in ('txt', 'xml') if fmt in available),
                      'abstract')
        print(messages[chosen] % doi)
        contents[doi] = get_content_from_pub_json(pub, chosen)

    with open('covid19_contents', 'w') as fh:
        json.dump(contents, fh)