Module pubmedpy.eutilities

import time
import collections
import logging
from typing import IO

import requests
import lxml.etree
import tqdm


def esearch_query(
    payload: dict, retmax: int = 10000, sleep: float = 0.34, tqdm=tqdm.tqdm
):
    """
    Return identifiers using the ESearch E-utility.

    Set `tqdm=tqdm.notebook` to use the tqdm notebook interface.
    Set `tqdm=None` to disable the progress bar.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    payload["rettype"] = "xml"
    payload["retmax"] = retmax
    payload["retstart"] = 0
    ids = list()
    # Placeholder count so the loop runs once; the true total comes from the first response
    count = 1
    progress_bar = None
    # Page through results by advancing retstart until all identifiers are retrieved
    while payload["retstart"] < count:
        response = requests.get(url, params=payload)
        tree = lxml.etree.fromstring(response.content)
        count = int(tree.findtext("Count"))
        if tqdm and not progress_bar:
            progress_bar = tqdm(total=count, unit="ids")
        add_ids = [id_.text for id_ in tree.findall("IdList/Id")]
        ids += add_ids
        payload["retstart"] += retmax
        if tqdm:
            progress_bar.update(len(add_ids))
        time.sleep(sleep)
    if tqdm:
        progress_bar.close()
    return ids


def download_pubmed_ids(
    ids: list,
    write_file: IO,
    endpoint: str = "esummary",
    retmax: int = 100,
    retmin: int = 20,
    sleep: float = 0.34,
    error_sleep: float = 10,
    tqdm=tqdm.tqdm,
):
    """
    Submit an ESummary or EFetch query for PubMed records and write results as xml
    to write_file.

    Set `tqdm=tqdm.notebook` to use the tqdm notebook interface.
    """
    # Base URL for the requested E-utility endpoint (esummary or efetch)
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{endpoint}.fcgi"

    # Set up progress stats
    n_total = len(ids)
    successive_errors = 0
    progress_bar = tqdm(total=n_total, unit="articles")
    initialize_xml = True

    # Set up queue
    idq = collections.deque()
    for i in range(0, len(ids), retmax):
        idq.append(ids[i : i + retmax])

    # Query until the queue is empty
    while idq:
        time.sleep(sleep)
        id_subset = idq.popleft()
        id_subset_len = len(id_subset)

        # Perform eutilities API request
        id_string = ",".join(map(str, id_subset))
        payload = {"db": "pubmed", "id": id_string, "rettype": "xml"}
        try:
            response = requests.get(url, params=payload)
            response.raise_for_status()
            tree = lxml.etree.fromstring(response.content)
            successive_errors = 0
        except Exception as e:
            successive_errors += 1
            logging.warning(
                f"{successive_errors} successive error(s): {id_subset_len} IDs "
                f"[{id_subset[0]} … {id_subset[-1]}] threw {e}"
            )
            # Split large batches in half and requeue them; batches smaller
            # than 2 * retmin are requeued whole for another attempt.
            if id_subset_len >= retmin * 2:
                mid = len(id_subset) // 2
                idq.appendleft(id_subset[:mid])
                idq.appendleft(id_subset[mid:])
            else:
                idq.appendleft(id_subset)
            # Back off longer after each successive error
            time.sleep(error_sleep * successive_errors)
            continue

        # Write XML to file
        if initialize_xml:
            initialize_xml = False
            write_file.write(f"<{tree.tag}>\n")
        for elem in tree:
            xml_str = lxml.etree.tostring(elem, encoding="unicode")
            write_file.write(xml_str.rstrip() + "\n")

        # Report progress
        progress_bar.update(id_subset_len)

    progress_bar.close()
    # Write final line of XML
    write_file.write(f"</{tree.tag}>\n")
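
The two functions are designed to be used together: esearch_query returns the PubMed identifiers matching a query, and download_pubmed_ids fetches and writes the corresponding records. A minimal end-to-end sketch, assuming network access to NCBI; the search term and output filename are illustrative:

from pubmedpy.eutilities import esearch_query, download_pubmed_ids

# Illustrative query: PubMed records matching a MeSH term
payload = {"db": "pubmed", "term": "bioinformatics[MeSH Terms]"}
pubmed_ids = esearch_query(payload, tqdm=None)

# Fetch the full records with efetch and write them to a single XML file
with open("records.xml", "w", encoding="utf-8") as write_file:
    download_pubmed_ids(pubmed_ids, write_file, endpoint="efetch")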

Functions

def download_pubmed_ids(ids: list, write_file: IO, endpoint: str = 'esummary', retmax: int = 100, retmin: int = 20, sleep: float = 0.34, error_sleep: float = 10, tqdm=tqdm.std.tqdm)

Submit an ESummary or EFetch query for PubMed records and write results as xml to write_file.

Set tqdm=tqdm.notebook to use the tqdm notebook interface.

def download_pubmed_ids(
    ids: list,
    write_file: IO,
    endpoint: str = "esummary",
    retmax: int = 100,
    retmin: int = 20,
    sleep: float = 0.34,
    error_sleep: float = 10,
    tqdm=tqdm.tqdm,
):
    """
    Submit an ESummary or EFetch query for PubMed records and write results as xml
    to write_file.

    Set `tqdm=tqdm.notebook` to use the tqdm notebook interface.
    """
    # Base URL for the requested E-utility endpoint (esummary or efetch)
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{endpoint}.fcgi"

    # Set up progress stats
    n_total = len(ids)
    successive_errors = 0
    progress_bar = tqdm(total=n_total, unit="articles")
    initialize_xml = True

    # Set up queue
    idq = collections.deque()
    for i in range(0, len(ids), retmax):
        idq.append(ids[i : i + retmax])

    # Query until the queue is empty
    while idq:
        time.sleep(sleep)
        id_subset = idq.popleft()
        id_subset_len = len(id_subset)

        # Perform eutilities API request
        id_string = ",".join(map(str, id_subset))
        payload = {"db": "pubmed", "id": id_string, "rettype": "xml"}
        try:
            response = requests.get(url, params=payload)
            response.raise_for_status()
            tree = lxml.etree.fromstring(response.content)
            successive_errors = 0
        except Exception as e:
            successive_errors += 1
            logging.warning(
                f"{successive_errors} successive error(s): {id_subset_len} IDs "
                f"[{id_subset[0]} … {id_subset[-1]}] threw {e}"
            )
            # Split large batches in half and requeue them; batches smaller
            # than 2 * retmin are requeued whole for another attempt.
            if id_subset_len >= retmin * 2:
                mid = len(id_subset) // 2
                idq.appendleft(id_subset[:mid])
                idq.appendleft(id_subset[mid:])
            else:
                idq.appendleft(id_subset)
            # Back off longer after each successive error
            time.sleep(error_sleep * successive_errors)
            continue

        # Write XML to file
        if initialize_xml:
            initialize_xml = False
            write_file.write(f"<{tree.tag}>\n")
        for elem in tree:
            xml_str = lxml.etree.tostring(elem, encoding="unicode")
            write_file.write(xml_str.rstrip() + "\n")

        # Report progress
        progress_bar.update(id_subset_len)

    progress_bar.close()
    # Write final line of XML
    write_file.write(f"</{tree.tag}>\n")
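
A minimal usage sketch, assuming a small list of identifiers; the PMIDs and output filename are hypothetical:

from pubmedpy.eutilities import download_pubmed_ids

pmids = ["12345678", "23456789"]  # hypothetical PubMed identifiers
with open("summaries.xml", "w", encoding="utf-8") as write_file:
    download_pubmed_ids(pmids, write_file)  # esummary by default

Failed batches are split in half (down to batches of roughly retmin IDs) and requeued, and the function sleeps error_sleep seconds multiplied by the number of successive errors before retrying.
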
def esearch_query(payload: dict, retmax: int = 10000, sleep: float = 0.34, tqdm=tqdm.std.tqdm)

Return identifiers using the ESearch E-utility.

Set tqdm=tqdm.notebook to use the tqdm notebook interface. Set tqdm=None to disable the progress bar.

def esearch_query(
    payload: dict, retmax: int = 10000, sleep: float = 0.34, tqdm=tqdm.tqdm
):
    """
    Return identifiers using the ESearch E-utility.

    Set `tqdm=tqdm.notebook` to use the tqdm notebook interface.
    Set `tqdm=None` to disable the progress bar.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    payload["rettype"] = "xml"
    payload["retmax"] = retmax
    payload["retstart"] = 0
    ids = list()
    # Placeholder count so the loop runs once; the true total comes from the first response
    count = 1
    progress_bar = None
    # Page through results by advancing retstart until all identifiers are retrieved
    while payload["retstart"] < count:
        response = requests.get(url, params=payload)
        tree = lxml.etree.fromstring(response.content)
        count = int(tree.findtext("Count"))
        if tqdm and not progress_bar:
            progress_bar = tqdm(total=count, unit="ids")
        add_ids = [id_.text for id_ in tree.findall("IdList/Id")]
        ids += add_ids
        payload["retstart"] += retmax
        if tqdm:
            progress_bar.update(len(add_ids))
        time.sleep(sleep)
    if tqdm:
        progress_bar.close()
    return ids
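
A minimal sketch with the progress bar disabled; the search term and date range are illustrative. Additional ESearch parameters can be supplied in payload, while retmax, retstart, and rettype are set by the function:

from pubmedpy.eutilities import esearch_query

payload = {
    "db": "pubmed",
    "term": "scientometrics",  # illustrative search term
    "datetype": "pdat",        # filter by publication date
    "mindate": "2018/01/01",
    "maxdate": "2018/12/31",
}
pubmed_ids = esearch_query(payload, tqdm=None)
print(f"Found {len(pubmed_ids)} identifiers")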