Module pubmedpy.pmc_oai
Functions for querying the PubMed Central OAI-PMH service (PMC-OAI). More information is available at https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
Expand source code Browse git
"""
Functions for querying the PubMed Central OAI-PMH service (PMC-OAI).
More information is available at https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
"""
import functools
import logging
import zipfile
# URL of the PMC OAI-PMH endpoint; handed to sickle.Sickle in get_sickle().
endpoint = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
# Namespace prefix -> URI abbreviations for parsing PMC-OAI XML. Passed as
# `namespaces=` to prefixed lookups such as "oai:metadata/..." in
# download_frontmatter_set().
namespaces = {
"oai": "http://www.openarchives.org/OAI/2.0/",
"jats": "https://jats.nlm.nih.gov/ns/archiving/1.2/",
"dtd": "https://dtd.nlm.nih.gov/ns/archiving/2.3/",
}
@functools.lru_cache()
def get_sickle():
    """
    Build the sickle OAI harvester pointed at the PMC endpoint.

    The result is memoized by functools.lru_cache, so repeated calls hand
    back the same harvester object. sickle is imported lazily, on first call.
    """
    from sickle import Sickle

    harvester = Sickle(endpoint=endpoint)
    return harvester
def get_sets_for_pmcid(pmcid):
    """
    Look up the OAI sets that PMC-OAI lists for a single article.

    pmcid is converted with str() and may be given with or without a leading
    "PMC" (matched case-insensitively). Returns the setSpecs found in the
    header of the record fetched with the "pmc_fm" metadata prefix.
    """
    bare_id = str(pmcid)
    has_prefix = bare_id.upper().startswith("PMC")
    if has_prefix:
        bare_id = bare_id[len("PMC"):]
    identifier = f"oai:pubmedcentral.nih.gov:{bare_id}"
    record = get_sickle().GetRecord(identifier=identifier, metadataPrefix="pmc_fm")
    return record.header.setSpecs
def download_frontmatter_set(oai_set, path, tqdm=None, n_records=None):
    """
    Download an OAI set to a zipped file specified by path. Each file in the zip archive contains
    frontmatter XML for a single article from the set.

    Parameters
    ----------
    oai_set
        Set spec forwarded to ListRecords as ``set=``; also used as the
        progress-bar description.
    path
        Destination of the zip archive. It is opened in write mode with
        LZMA compression.
    tqdm
        Optional callable used to wrap the record iterator. It is invoked as
        ``tqdm(records, total=n_records, desc=oai_set)``.
    n_records
        Optional total forwarded to ``tqdm``; unused when ``tqdm`` is None.

    Records from which no <article> element can be extracted are logged with
    a warning and skipped. Each remaining article is written to the archive
    as ``{pmcid}.xml``.
    """
    import lxml.etree

    sickler = get_sickle()
    # The context manager closes (and finalizes) the archive even when the
    # harvest raises part-way through.
    with zipfile.ZipFile(path, mode="w", compression=zipfile.ZIP_LZMA) as zip_file:
        records = sickler.ListRecords(
            metadataPrefix="pmc_fm", set=oai_set, ignore_deleted=True
        )
        if tqdm is not None:
            records = tqdm(records, total=n_records, desc=oai_set)
        for record in records:
            article = record.xml.find("oai:metadata/{*}article", namespaces=namespaces)
            if article is None:
                logging.warning(f"failure to extract <article> from\n{record.raw}")
                # Nothing to serialize for this record; move on instead of
                # dereferencing None below.
                continue
            pmcid = article.findtext(
                "{*}front/{*}article-meta/{*}article-id[@pub-id-type='pmcid']"
            )
            xml_str = lxml.etree.tostring(article, encoding="unicode")
            zip_file.writestr(f"{pmcid}.xml", data=xml_str)
def _contrib_elem_is_corresp(contrib_elem):
    """
    Report whether a contrib element is flagged as a corresponding author.

    True when the element has a child xref whose ref-type is 'corresp',
    or when its own corresp attribute equals "yes"; False otherwise.
    """
    corresp_xref = contrib_elem.find("{*}xref[@ref-type='corresp']")
    corresp_attr = contrib_elem.get("corresp", "no")
    return corresp_xref is not None or corresp_attr == "yes"
def _get_id_to_affiliation(article) -> dict:
    """
    Map the id attribute of every aff element under front/article-meta to
    its text content.

    The text of an aff element and all of its descendants is joined with
    spaces, then runs of whitespace are collapsed to a single space.
    """
    return {
        aff.get("id"): " ".join(" ".join(aff.itertext()).split())
        for aff in article.findall("{*}front/{*}article-meta//{*}aff")
    }
def extract_authors_from_article(article):
    """
    Extract author information from frontmatter XML into a list of dictionaries.

    One dictionary is produced per contrib element with contrib-type 'author'
    found under front/article-meta/contrib-group, in document order. Keys:

    - pmcid: text of the article-id element with pub-id-type 'pmcid'
    - position: 1-based position of the author
    - fore_name / last_name: whitespace-stripped given-names / surname text,
      or None when the element is absent
    - corresponding: 1 if the author is marked as corresponding, else 0
    - reverse_position: 1-based position counted from the last author
    - affiliations: list of affiliation strings referenced by the author's
      xref elements with ref-type 'aff'

    An xref's rid attribute may list several whitespace-separated ids; each
    is looked up separately. Ids that match no aff element are skipped with
    a logged warning rather than aborting the extraction.
    """
    pmcid = article.findtext(
        "{*}front/{*}article-meta/{*}article-id[@pub-id-type='pmcid']"
    )
    contrib_elems = article.findall(
        "{*}front/{*}article-meta/{*}contrib-group/{*}contrib[@contrib-type='author']"
    )
    id_to_affiliation = _get_id_to_affiliation(article)
    n_authors = len(contrib_elems)
    authors = []
    for position, contrib_elem in enumerate(contrib_elems, start=1):
        affiliations = []
        for xref in contrib_elem.findall("{*}xref[@rid][@ref-type='aff']"):
            # rid is an IDREFS attribute: one xref can point at several affs.
            for aff_id in xref.attrib["rid"].split():
                if aff_id in id_to_affiliation:
                    affiliations.append(id_to_affiliation[aff_id])
                else:
                    logging.warning(
                        f"{pmcid}: author {position} references unknown affiliation id {aff_id!r}"
                    )
        authors.append(
            {
                "pmcid": pmcid,
                "position": position,
                "fore_name": _strip_str(contrib_elem.findtext("{*}name/{*}given-names")),
                "last_name": _strip_str(contrib_elem.findtext("{*}name/{*}surname")),
                "corresponding": int(_contrib_elem_is_corresp(contrib_elem)),
                "reverse_position": n_authors - position + 1,
                "affiliations": affiliations,
            }
        )
    return authors
def _strip_str(value):
    """Return value with surrounding whitespace removed when it is a str; otherwise return it untouched."""
    return value.strip() if isinstance(value, str) else value
Functions
def download_frontmatter_set(oai_set, path, tqdm=None, n_records=None)
-
Download an OAI set to a zipped file specified by path. Each file in the zip archive contains frontmatter XML for a single article from the set.
Expand source code Browse git
def download_frontmatter_set(oai_set, path, tqdm=None, n_records=None):
    """
    Download an OAI set to a zipped file specified by path. Each file in the zip archive contains
    frontmatter XML for a single article from the set.

    Parameters
    ----------
    oai_set
        Set spec forwarded to ListRecords as ``set=``; also used as the
        progress-bar description.
    path
        Destination of the zip archive. It is opened in write mode with
        LZMA compression.
    tqdm
        Optional callable used to wrap the record iterator. It is invoked as
        ``tqdm(records, total=n_records, desc=oai_set)``.
    n_records
        Optional total forwarded to ``tqdm``; unused when ``tqdm`` is None.

    Records from which no <article> element can be extracted are logged with
    a warning and skipped. Each remaining article is written to the archive
    as ``{pmcid}.xml``.
    """
    import lxml.etree

    sickler = get_sickle()
    # The context manager closes (and finalizes) the archive even when the
    # harvest raises part-way through.
    with zipfile.ZipFile(path, mode="w", compression=zipfile.ZIP_LZMA) as zip_file:
        records = sickler.ListRecords(
            metadataPrefix="pmc_fm", set=oai_set, ignore_deleted=True
        )
        if tqdm is not None:
            records = tqdm(records, total=n_records, desc=oai_set)
        for record in records:
            article = record.xml.find("oai:metadata/{*}article", namespaces=namespaces)
            if article is None:
                logging.warning(f"failure to extract <article> from\n{record.raw}")
                # Nothing to serialize for this record; move on instead of
                # dereferencing None below.
                continue
            pmcid = article.findtext(
                "{*}front/{*}article-meta/{*}article-id[@pub-id-type='pmcid']"
            )
            xml_str = lxml.etree.tostring(article, encoding="unicode")
            zip_file.writestr(f"{pmcid}.xml", data=xml_str)
def extract_authors_from_article(article)
-
Extract author information from frontmatter XML into a list of dictionaries.
Expand source code Browse git
def extract_authors_from_article(article):
    """
    Extract author information from frontmatter XML into a list of dictionaries.

    One dictionary is produced per contrib element with contrib-type 'author'
    found under front/article-meta/contrib-group, in document order. Keys:

    - pmcid: text of the article-id element with pub-id-type 'pmcid'
    - position: 1-based position of the author
    - fore_name / last_name: whitespace-stripped given-names / surname text,
      or None when the element is absent
    - corresponding: 1 if the author is marked as corresponding, else 0
    - reverse_position: 1-based position counted from the last author
    - affiliations: list of affiliation strings referenced by the author's
      xref elements with ref-type 'aff'

    An xref's rid attribute may list several whitespace-separated ids; each
    is looked up separately. Ids that match no aff element are skipped with
    a logged warning rather than aborting the extraction.
    """
    pmcid = article.findtext(
        "{*}front/{*}article-meta/{*}article-id[@pub-id-type='pmcid']"
    )
    contrib_elems = article.findall(
        "{*}front/{*}article-meta/{*}contrib-group/{*}contrib[@contrib-type='author']"
    )
    id_to_affiliation = _get_id_to_affiliation(article)
    n_authors = len(contrib_elems)
    authors = []
    for position, contrib_elem in enumerate(contrib_elems, start=1):
        affiliations = []
        for xref in contrib_elem.findall("{*}xref[@rid][@ref-type='aff']"):
            # rid is an IDREFS attribute: one xref can point at several affs.
            for aff_id in xref.attrib["rid"].split():
                if aff_id in id_to_affiliation:
                    affiliations.append(id_to_affiliation[aff_id])
                else:
                    logging.warning(
                        f"{pmcid}: author {position} references unknown affiliation id {aff_id!r}"
                    )
        authors.append(
            {
                "pmcid": pmcid,
                "position": position,
                "fore_name": _strip_str(contrib_elem.findtext("{*}name/{*}given-names")),
                "last_name": _strip_str(contrib_elem.findtext("{*}name/{*}surname")),
                "corresponding": int(_contrib_elem_is_corresp(contrib_elem)),
                "reverse_position": n_authors - position + 1,
                "affiliations": affiliations,
            }
        )
    return authors
def get_sets_for_pmcid(pmcid)
-
Return the OAI sets specified to include the provided PMC identifier.
Expand source code Browse git
def get_sets_for_pmcid(pmcid):
    """
    Look up the OAI sets that PMC-OAI lists for a single article.

    pmcid is converted with str() and may be given with or without a leading
    "PMC" (matched case-insensitively). Returns the setSpecs found in the
    header of the record fetched with the "pmc_fm" metadata prefix.
    """
    bare_id = str(pmcid)
    has_prefix = bare_id.upper().startswith("PMC")
    if has_prefix:
        bare_id = bare_id[len("PMC"):]
    identifier = f"oai:pubmedcentral.nih.gov:{bare_id}"
    record = get_sickle().GetRecord(identifier=identifier, metadataPrefix="pmc_fm")
    return record.header.setSpecs
def get_sickle()
-
Return a sickle OAI harvester for PMC
Expand source code Browse git
@functools.lru_cache()
def get_sickle():
    """
    Build the sickle OAI harvester pointed at the PMC endpoint.

    The result is memoized by functools.lru_cache, so repeated calls hand
    back the same harvester object. sickle is imported lazily, on first call.
    """
    from sickle import Sickle

    harvester = Sickle(endpoint=endpoint)
    return harvester