Module pubmedpy.xml
Expand source code Browse git
import contextlib
import importlib
import mimetypes
import os
import zipfile
from typing import Iterable, Tuple
from lxml import etree
from .utils import PathType
# Maps the encoding names returned by mimetypes.guess_type to the name of the
# standard-library module whose `open` function reads that compression format.
_encoding_to_module = {
"gzip": "gzip",
"bzip2": "bz2",
"xz": "lzma",
}
def iterparse_xml(path: PathType) -> Iterable[etree._Element]:
    """
    First yield the ElementTree root, then yield elements from an XML file.

    Compression is detected from the file name using mimetypes.guess_type.
    Encodings listed in `_encoding_to_module` are opened with the matching
    standard-library module; files with no detected encoding are opened directly.

    The first value yielded is the element of the first "start" event (the root).
    Every later value is an element whose "end" event has been reached.

    Raises KeyError when the file name implies a compression encoding that has
    no entry in `_encoding_to_module` (for example `compress` for `.Z` files).
    """
    # Automatically detect compression
    path = os.fspath(path)
    _, encoding = mimetypes.guess_type(path)
    if encoding is None:
        opener = open
    else:
        try:
            module = _encoding_to_module[encoding]
        except KeyError:
            # Keep raising KeyError for backward compatibility, but explain
            # which encoding was detected and which ones can be read.
            supported = ", ".join(sorted(_encoding_to_module))
            raise KeyError(
                f"Unsupported compression encoding {encoding!r} for {path!r}; "
                f"supported encodings: {supported}"
            ) from None
        opener = importlib.import_module(module).open
    # Open file and yield from the element tree
    with opener(path, "rb") as read_file:
        context = etree.iterparse(read_file, events=("start", "end"))
        # The first event is the "start" of the root element.
        yield next(context)[1]
        yield from (elem for event, elem in context if event == "end")
def iter_extract_elems(path: PathType, tag: str) -> Iterable[etree._Element]:
    """
    Yield each element whose tag equals `tag` from an XML file, such as one
    produced by pubmedpy.eutilities.download_pubmed_ids.
    For memory-efficiency, the XML element tree root is cleared after each
    matching element is consumed, before the next element is yielded.
    """
    path = os.fspath(path)
    elements = iterparse_xml(path)
    # iterparse_xml yields the tree root first, then the parsed elements.
    tree_root = next(elements)
    for element in elements:
        if element.tag == tag:
            yield element
            # The caller is done with this element; release what was parsed.
            tree_root.clear()
    tree_root.clear()
def yield_etrees_from_zip(path: PathType) -> Iterable[Tuple[str, etree.ElementTree]]:
    """
    Parse every member of a zip file whose name ends with `.xml`,
    yielding (member name, parsed element tree) pairs.
    """
    with zipfile.ZipFile(path) as archive:
        xml_members = (
            member for member in archive.namelist() if member.endswith(".xml")
        )
        for member_name in xml_members:
            with archive.open(member_name) as member_file:
                tree = etree.parse(member_file)
            yield member_name, tree
Functions
def iter_extract_elems(path: Union[os.PathLike, str], tag: str) ‑> Iterable[lxml.etree._Element]
-
Return elements of the specified tag from XML produced by pubmedpy.eutilities.download_pubmed_ids. For memory-efficiency, the XML element tree root is cleared after each matching element is consumed, before the next element is yielded.
Expand source code Browse git
def iter_extract_elems(path: PathType, tag: str) -> Iterable[etree._Element]:
    """
    Yield each element whose tag equals `tag` from an XML file, such as one
    produced by pubmedpy.eutilities.download_pubmed_ids.
    For memory-efficiency, the XML element tree root is cleared after each
    matching element is consumed, before the next element is yielded.
    """
    path = os.fspath(path)
    elements = iterparse_xml(path)
    # iterparse_xml yields the tree root first, then the parsed elements.
    tree_root = next(elements)
    for element in elements:
        if element.tag == tag:
            yield element
            # The caller is done with this element; release what was parsed.
            tree_root.clear()
    tree_root.clear()
def iterparse_xml(path: Union[os.PathLike, str]) ‑> Iterable[lxml.etree._Element]
-
First yield the ElementTree root, then yield elements from an XML file.
Expand source code Browse git
def iterparse_xml(path: PathType) -> Iterable[etree._Element]:
    """
    First yield the ElementTree root, then yield elements from an XML file.

    Compression is detected from the file name using mimetypes.guess_type.
    Encodings listed in `_encoding_to_module` are opened with the matching
    standard-library module; files with no detected encoding are opened directly.

    The first value yielded is the element of the first "start" event (the root).
    Every later value is an element whose "end" event has been reached.

    Raises KeyError when the file name implies a compression encoding that has
    no entry in `_encoding_to_module` (for example `compress` for `.Z` files).
    """
    # Automatically detect compression
    path = os.fspath(path)
    _, encoding = mimetypes.guess_type(path)
    if encoding is None:
        opener = open
    else:
        try:
            module = _encoding_to_module[encoding]
        except KeyError:
            # Keep raising KeyError for backward compatibility, but explain
            # which encoding was detected and which ones can be read.
            supported = ", ".join(sorted(_encoding_to_module))
            raise KeyError(
                f"Unsupported compression encoding {encoding!r} for {path!r}; "
                f"supported encodings: {supported}"
            ) from None
        opener = importlib.import_module(module).open
    # Open file and yield from the element tree
    with opener(path, "rb") as read_file:
        context = etree.iterparse(read_file, events=("start", "end"))
        # The first event is the "start" of the root element.
        yield next(context)[1]
        yield from (elem for event, elem in context if event == "end")
def yield_etrees_from_zip(path: Union[os.PathLike, str]) ‑> Iterable[Tuple[str, lxml.etree.ElementTree]]
-
Read members of a zip file with an `.xml` extension.
Expand source code Browse git
def yield_etrees_from_zip(path: PathType) -> Iterable[Tuple[str, etree.ElementTree]]:
    """
    Parse every member of a zip file whose name ends with `.xml`,
    yielding (member name, parsed element tree) pairs.
    """
    with zipfile.ZipFile(path) as archive:
        xml_members = (
            member for member in archive.namelist() if member.endswith(".xml")
        )
        for member_name in xml_members:
            with archive.open(member_name) as member_file:
                tree = etree.parse(member_file)
            yield member_name, tree