Module pubmedpy.esummary
import collections
import contextlib
import datetime
import itertools
import locale
import logging
import re
import threading
from typing import List, Optional
import pandas
import tqdm
import lxml.etree
from .xml import iter_extract_elems
from .utils import PathType
locale_lock = threading.Lock()
@contextlib.contextmanager
def setlocale(name: str):
"""
Context manager to temporarily set locale for datetime.datetime.strptime
https://stackoverflow.com/a/24070673/4651668
"""
with locale_lock:
saved = locale.setlocale(locale.LC_ALL)
try:
yield locale.setlocale(locale.LC_ALL, name)
finally:
locale.setlocale(locale.LC_ALL, saved)
def parse_date_text(text: str) -> datetime.date:
"""
Parse an `eSummaryResult/DocSum/Item[@Name='History']/Item[@Type='Date']`
element.
The time on the date is discarded. A `datetime.date` object is returned.
"""
with setlocale("C"):
return datetime.datetime.strptime(text, "%Y/%m/%d %H:%M").date()
def parse_pubdate_text(text: str) -> datetime.date:
"""
Parse the text contained by the following elements:
`eSummaryResult/DocSum/Item[@Name='PubDate' @Type='Date']`
`eSummaryResult/DocSum/Item[@Name='EPubDate' @Type='Date']`
See https://www.nlm.nih.gov/bsd/licensee/elements_article_source.html
A `datetime.date` object is returned.
"""
return datetime.datetime.strptime(text, "%Y %b %d").date()
def parse_esummary_history(docsum: lxml.etree._Element) -> dict:
"""
Extract history dates from an eSummaryResult/DocSum XML element.
Duplicate (name, date) pairs are removed; keys take the form
`{name}_{index}`, e.g. `received_0`.
"""
# Extract all historical dates
date_pairs = list()
seen = set()
for item in docsum.findall("Item[@Name='History']/Item[@Type='Date']"):
name = item.get("Name")
try:
date_ = parse_date_text(item.text)
except ValueError as e:
id_ = int(docsum.findtext("Id"))
msg = f"article {id_}; name: {name}; " f"date: {item.text}; error: {e}"
logging.warning(msg)
continue
date_pair = name, date_
if date_pair in seen:
continue
seen.add(date_pair)
date_pairs.append(date_pair)
date_pairs.sort(key=lambda x: x[0])
history = collections.OrderedDict()
for name, group in itertools.groupby(date_pairs, key=lambda x: x[0]):
for i, (name, date_) in enumerate(group):
history[f"{name}_{i}"] = date_
return history
def parse_esummary_pubdates(docsum: lxml.etree._Element) -> dict:
"""
Parse PubDate and EPubDate. Infer first published date.
"""
pubdates = collections.OrderedDict()
for key, name in ("pub", "PubDate"), ("epub", "EPubDate"):
xpath = f"Item[@Name='{name}'][@Type='Date']"
text = docsum.findtext(xpath)
try:
pubdates[key] = parse_pubdate_text(text)
except ValueError as e:
id_ = int(docsum.findtext("Id"))
msg = f"article {id_}; name: {key}; " f"date: {text}; error: {e}"
logging.info(msg)
continue
dates = set(pubdates.values())
dates.discard(None)
if dates:
pubdates["published"] = min(dates)
return pubdates
def parse_esummary_article_info(elem: lxml.etree._Element) -> dict:
"""
Extract general article information
"""
article = collections.OrderedDict()
article["pubmed_id"] = int(elem.findtext("Id"))
article["journal_nlm_id"] = elem.findtext("Item[@Name='NlmUniqueID']")
article["journal"] = elem.findtext("Item[@Name='Source']")
article["title"] = elem.findtext("Item[@Name='Title']")
article["doi"] = elem.findtext("Item[@Name='DOI']")
# https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/
article["publication_types"] = " | ".join(
x.text for x in elem.findall("Item[@Name='PubTypeList']/Item[@Name='PubType']")
)
# get incoming citation count. https://github.com/dhimmel/pubmedpy/issues/2
pmc_cited_by_count = elem.findtext("Item[@Name='PmcRefCount']")
try:
pmc_cited_by_count = int(pmc_cited_by_count)
except (TypeError, ValueError):
pmc_cited_by_count = None
article["pmc_cited_by_count"] = pmc_cited_by_count
return article
def parse_esummary(elem: lxml.etree._Element) -> dict:
"""
Extract pubmed, journal, and date information from an eSummaryResult/DocSum
"""
article = parse_esummary_article_info(elem)
article.update(parse_esummary_pubdates(elem))
article.update(parse_esummary_history(elem))
return article
def extract_articles_from_esummaries(
path: PathType, n_articles: Optional[int] = None, tqdm=tqdm.tqdm
) -> List[dict]:
"""
Extract a list of articles (dictionaries with date information) from
an eSummaryResult XML file. Specify `n_articles` to enable a progress bar.
"""
if n_articles is not None:
progress_bar = tqdm(total=n_articles, unit="articles")
articles = list()
for elem in iter_extract_elems(path, tag="DocSum"):
article = parse_esummary(elem)
articles.append(article)
if n_articles is not None:
progress_bar.update(1)
if n_articles is not None:
progress_bar.close()
return articles
def articles_to_dataframe(articles: List[dict]) -> pandas.DataFrame:
"""
Convert a list of articles created by `extract_articles_from_esummaries`
into a pandas.DataFrame.
"""
article_df = pandas.DataFrame(articles)
article_df = article_df.sort_values(by="pubmed_id")
# Enforce a consistent column ordering
columns = article_df.columns[2:].tolist()
columns = [
"pubmed_id",
"journal_nlm_id",
"journal",
"doi",
*sorted(x for x in columns if re.search("pub(?!med)", x)),
*sorted(x for x in columns if re.search("_[0-9]+$", x)),
"title",
"pmc_cited_by_count",
]
article_df = article_df[columns]
return article_df
Functions
def articles_to_dataframe(articles: List[dict]) ‑> pandas.core.frame.DataFrame
-
Convert a list of articles created by extract_articles_from_esummaries() into a pandas.DataFrame.
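A minimal usage sketch, assuming an eSummaryResult XML file is available; the input and output paths below are placeholders:

from pubmedpy.esummary import articles_to_dataframe, extract_articles_from_esummaries

# Parse every DocSum in the file, then build the article table.
articles = extract_articles_from_esummaries("esummary.xml")  # placeholder path
article_df = articles_to_dataframe(articles)
# Write a tab-separated file; any pandas output method works here.
article_df.to_csv("articles.tsv", sep="\t", index=False)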
def extract_articles_from_esummaries(path: Union[os.PathLike, str], n_articles: Union[int, NoneType] = None, tqdm=tqdm.std.tqdm) ‑> List[dict]
-
Extract a list of articles (dictionaries with date information) from an eSummaryResult XML file. Specify n_articles to enable a progress bar.
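A sketch of the progress-bar option; the path and count are placeholders. Note that n_articles only sizes the tqdm bar, it does not limit how many DocSum elements are parsed:

from pubmedpy.esummary import extract_articles_from_esummaries

# Without n_articles, no progress bar is created.
articles = extract_articles_from_esummaries("esummary.xml")

# With an (approximate) article count, a tqdm progress bar tracks parsing.
articles = extract_articles_from_esummaries("esummary.xml", n_articles=10_000)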
def parse_date_text(text: str) ‑> datetime.date
-
Parse an eSummaryResult/DocSum/Item[@Name='History']/Item[@Type='Date'] element. The time on the date is discarded. A datetime.date object is returned.
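For example, with an illustrative history timestamp (the time component is discarded):

from pubmedpy.esummary import parse_date_text

parse_date_text("2018/06/27 06:00")  # returns datetime.date(2018, 6, 27)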
def parse_esummary(elem: lxml.etree._Element) ‑> dict
-
Extract pubmed, journal, and date information from an eSummaryResult/DocSum
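A sketch using a pared-down, made-up DocSum; real eSummary records contain many more Item elements, and every identifier and date below is illustrative only:

import lxml.etree

from pubmedpy.esummary import parse_esummary

docsum = lxml.etree.fromstring("""
<DocSum>
  <Id>12345678</Id>
  <Item Name="PubDate" Type="Date">2018 Nov 1</Item>
  <Item Name="EPubDate" Type="Date">2018 Jun 22</Item>
  <Item Name="Source" Type="String">Example Journal</Item>
  <Item Name="Title" Type="String">An example title</Item>
  <Item Name="NlmUniqueID" Type="String">0000000</Item>
  <Item Name="PubTypeList" Type="List">
    <Item Name="PubType" Type="String">Journal Article</Item>
  </Item>
  <Item Name="PmcRefCount" Type="Integer">3</Item>
  <Item Name="History" Type="List">
    <Item Name="received" Type="Date">2018/02/12 00:00</Item>
    <Item Name="accepted" Type="Date">2018/06/18 00:00</Item>
  </Item>
</DocSum>
""")

article = parse_esummary(docsum)
article["pubmed_id"]   # 12345678
article["journal"]     # 'Example Journal'
article["published"]   # datetime.date(2018, 6, 22): the earlier of PubDate and EPubDate
article["received_0"]  # datetime.date(2018, 2, 12), from the History items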
def parse_esummary_article_info(elem: lxml.etree._Element) ‑> dict
-
Extract general article information
def parse_esummary_history(docsum: lxml.etree._Element) ‑> dict
-
Extract history dates from an eSummaryResult/DocSum XML element. Duplicate (name, date) pairs are removed; keys take the form {name}_{index}, e.g. received_0.
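A sketch of the de-duplication and key naming, using a made-up history fragment: identical (name, date) pairs collapse, and repeated names get numeric suffixes.

import lxml.etree

from pubmedpy.esummary import parse_esummary_history

docsum = lxml.etree.fromstring("""
<DocSum>
  <Id>12345678</Id>
  <Item Name="History" Type="List">
    <Item Name="received" Type="Date">2018/02/12 00:00</Item>
    <Item Name="received" Type="Date">2018/02/12 00:00</Item>
    <Item Name="revised" Type="Date">2018/04/01 00:00</Item>
    <Item Name="revised" Type="Date">2018/05/02 00:00</Item>
  </Item>
</DocSum>
""")

parse_esummary_history(docsum)
# OrderedDict([('received_0', datetime.date(2018, 2, 12)),
#              ('revised_0', datetime.date(2018, 4, 1)),
#              ('revised_1', datetime.date(2018, 5, 2))])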
def parse_esummary_pubdates(docsum: lxml.etree._Element) ‑> dict
-
Parse PubDate and EPubDate. Infer first published date.
def parse_pubdate_text(text: str) ‑> datetime.date
-
Parse the text contained by the following elements:
eSummaryResult/DocSum/Item[@Name='PubDate' @Type='Date']
eSummaryResult/DocSum/Item[@Name='EPubDate' @Type='Date']
See https://www.nlm.nih.gov/bsd/licensee/elements_article_source.html
A datetime.date object is returned.
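For example, with illustrative date text (unlike parse_date_text, this call relies on the current locale's month abbreviations):

from pubmedpy.esummary import parse_pubdate_text

parse_pubdate_text("2019 May 6")  # returns datetime.date(2019, 5, 6)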
def setlocale(name: str)
-
Context manager to temporarily set the locale for datetime.datetime.strptime. See https://stackoverflow.com/a/24070673/4651668
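A usage sketch; "C" is the portable choice because the requested locale must be installed on the system:

import datetime

from pubmedpy.esummary import setlocale

# Parse an English month abbreviation regardless of the ambient locale.
with setlocale("C"):
    published = datetime.datetime.strptime("2019 May 6", "%Y %b %d").date()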