Module pubmedpy.efetch
Expand source code Browse git
import collections
import typing
import lxml.etree
def extract_all(elem: lxml.etree._Element) -> dict:
    """
    Extract a dictionary of all supported fields from a <PubmedArticle> XML element.

    The result is an ordered mapping holding the identifiers returned by
    ``extract_identifiers`` followed by the keys ``journal``,
    ``journal_nlm_id``, ``title``, ``publication_date`` and ``authors``.
    ``title`` is None when the article has no ArticleTitle element.
    """
    result = collections.OrderedDict()
    result.update(extract_identifiers(elem))
    result["journal"] = elem.findtext("MedlineCitation/MedlineJournalInfo/MedlineTA")
    result["journal_nlm_id"] = elem.findtext(
        "MedlineCitation/MedlineJournalInfo/NlmUniqueID"
    )
    # ArticleTitle may contain inline markup (<i>, <sub>, <sup>, ...).
    # findtext() only returns the text that precedes the first child element,
    # which truncates such titles, so collect the text of the whole subtree.
    title_elem = elem.find("MedlineCitation/Article/ArticleTitle")
    result["title"] = None if title_elem is None else "".join(title_elem.itertext())
    result["publication_date"] = extract_publication_date(elem)
    result["authors"] = extract_authors(elem)
    return result
def extract_identifiers(elem: lxml.etree._Element) -> dict:
    """
    Extract a dictionary of identifiers from a <PubmedArticle> XML element.

    The returned dict has the keys ``pmid``, ``pmcid`` and ``doi`` (in that
    order). Each value is the text of the ArticleId element carrying the
    matching IdType attribute, or None when no such element is found.
    A non-empty DOI is lowercased.
    """
    # Pairs of (IdType attribute value in the XML, key in the returned dict).
    id_types = (
        ("pubmed", "pmid"),
        ("pmc", "pmcid"),
        ("doi", "doi"),
    )
    found = {}
    for xml_type, key in id_types:
        path = f"PubmedData/ArticleIdList/ArticleId[@IdType={xml_type!r}]"
        found[key] = elem.findtext(path)
    doi = found["doi"]
    if doi:
        # convert DOIs to all lowercase for standardization
        found["doi"] = doi.lower()
    return found
def extract_publication_date(elem: lxml.etree._Element) -> typing.Optional[str]:
    """
    Select the publication date from a <PubmedArticle> XML element.

    The smallest date string among the article's ArticleDate elements is
    preferred (for zero-padded dates this is the earliest one). When no
    ArticleDate yields a date string, the journal issue's PubDate is used
    instead. Returns None when neither source yields a date.
    """
    article_dates = (
        _date_elem_to_str(x)
        for x in elem.findall("MedlineCitation/Article/ArticleDate")
    )
    # _date_elem_to_str returns None for dates it cannot parse. Drop those:
    # comparing None with str raises TypeError, and a lone None would
    # otherwise be returned instead of falling back to PubDate.
    dates = [date for date in article_dates if date is not None]
    if dates:
        return min(dates)
    pubdate = elem.find("MedlineCitation/Article/Journal/JournalIssue/PubDate")
    return _date_elem_to_str(pubdate)
# Lookup table from three-letter English month abbreviations to month
# numbers (1-12); _date_elem_to_str uses it to translate <Month> text.
_month_abbrev_to_int = {
    abbrev: number
    for number, abbrev in enumerate(
        (
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
        ),
        start=1,
    )
}
def _date_elem_to_str(elem: lxml.etree._Element) -> typing.Optional[str]:
    """
    Convert an XML date element to a string like '2002', '2002-01', or '2002-01-05'.

    The element's Year, Month and Day children are read in that order.
    Month may be an integer or a three-letter abbreviation found in
    ``_month_abbrev_to_int``. The result stops at the first component that is
    missing or not an integer. Returns None when ``elem`` is None or when
    Year is missing or not an integer.
    """
    if elem is None:
        return None

    def as_int(value):
        # int() raises TypeError for None and ValueError for non-numeric text.
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    year_num = as_int(elem.findtext("Year"))
    if year_num is None:
        return None
    parts = [f"{year_num:04d}"]

    month_text = elem.findtext("Month")
    # Abbreviations map to their number; anything else is passed to int() as is.
    month_num = as_int(_month_abbrev_to_int.get(month_text, month_text))
    if month_num is not None:
        parts.append(f"{month_num:02d}")
        # Day is only consulted once a valid month is known.
        day_num = as_int(elem.findtext("Day"))
        if day_num is not None:
            parts.append(f"{day_num:02d}")
    return "-".join(parts)
def extract_authors(elem: lxml.etree._Element) -> list:
    """
    Extract a list of authors from a <PubmedArticle> XML element.

    One dict is produced per Author element, in document order, with keys:
    ``fore_name`` and ``last_name`` (text of the ForeName / LastName child,
    or None when the child is absent) and ``affiliations`` (list of the
    ``.text`` of each AffiliationInfo/Affiliation element).
    """
    return [
        {
            "fore_name": author.findtext("ForeName"),
            "last_name": author.findtext("LastName"),
            "affiliations": [
                affiliation.text
                for affiliation in author.findall("AffiliationInfo/Affiliation")
            ],
        }
        for author in elem.findall("MedlineCitation/Article/AuthorList/Author")
    ]
Functions
def extract_all(elem: lxml.etree._Element) ‑> dict
-
Extract a dictionary of all supported fields from a <PubmedArticle>
XML element Expand source code Browse git
def extract_all(elem: lxml.etree._Element) -> dict:
    """
    Extract a dictionary of all supported fields from a <PubmedArticle> XML element.

    The result is an ordered mapping holding the identifiers returned by
    ``extract_identifiers`` followed by the keys ``journal``,
    ``journal_nlm_id``, ``title``, ``publication_date`` and ``authors``.
    ``title`` is None when the article has no ArticleTitle element.
    """
    result = collections.OrderedDict()
    result.update(extract_identifiers(elem))
    result["journal"] = elem.findtext("MedlineCitation/MedlineJournalInfo/MedlineTA")
    result["journal_nlm_id"] = elem.findtext(
        "MedlineCitation/MedlineJournalInfo/NlmUniqueID"
    )
    # ArticleTitle may contain inline markup (<i>, <sub>, <sup>, ...).
    # findtext() only returns the text that precedes the first child element,
    # which truncates such titles, so collect the text of the whole subtree.
    title_elem = elem.find("MedlineCitation/Article/ArticleTitle")
    result["title"] = None if title_elem is None else "".join(title_elem.itertext())
    result["publication_date"] = extract_publication_date(elem)
    result["authors"] = extract_authors(elem)
    return result
-
Extract a list of authors from a <PubmedArticle>
XML element Expand source code Browse git
def extract_authors(elem: lxml.etree._Element) -> list:
    """
    Extract a list of authors from a <PubmedArticle> XML element.

    One dict is produced per Author element, in document order, with keys:
    ``fore_name`` and ``last_name`` (text of the ForeName / LastName child,
    or None when the child is absent) and ``affiliations`` (list of the
    ``.text`` of each AffiliationInfo/Affiliation element).
    """
    return [
        {
            "fore_name": author.findtext("ForeName"),
            "last_name": author.findtext("LastName"),
            "affiliations": [
                affiliation.text
                for affiliation in author.findall("AffiliationInfo/Affiliation")
            ],
        }
        for author in elem.findall("MedlineCitation/Article/AuthorList/Author")
    ]
def extract_identifiers(elem: lxml.etree._Element) ‑> dict
-
Extract a dictionary of identifiers from a <PubmedArticle>
XML element Expand source code Browse git
def extract_identifiers(elem: lxml.etree._Element) -> dict:
    """
    Extract a dictionary of identifiers from a <PubmedArticle> XML element.

    The returned dict has the keys ``pmid``, ``pmcid`` and ``doi`` (in that
    order). Each value is the text of the ArticleId element carrying the
    matching IdType attribute, or None when no such element is found.
    A non-empty DOI is lowercased.
    """
    # Pairs of (IdType attribute value in the XML, key in the returned dict).
    id_types = (
        ("pubmed", "pmid"),
        ("pmc", "pmcid"),
        ("doi", "doi"),
    )
    found = {}
    for xml_type, key in id_types:
        path = f"PubmedData/ArticleIdList/ArticleId[@IdType={xml_type!r}]"
        found[key] = elem.findtext(path)
    doi = found["doi"]
    if doi:
        # convert DOIs to all lowercase for standardization
        found["doi"] = doi.lower()
    return found
def extract_publication_date(elem: lxml.etree._Element) ‑> Union[str, NoneType]
-
Select the publication date from a <PubmedArticle>
XML element Expand source code Browse git
def extract_publication_date(elem: lxml.etree._Element) -> typing.Optional[str]:
    """
    Select the publication date from a <PubmedArticle> XML element.

    The smallest date string among the article's ArticleDate elements is
    preferred (for zero-padded dates this is the earliest one). When no
    ArticleDate yields a date string, the journal issue's PubDate is used
    instead. Returns None when neither source yields a date.
    """
    article_dates = (
        _date_elem_to_str(x)
        for x in elem.findall("MedlineCitation/Article/ArticleDate")
    )
    # _date_elem_to_str returns None for dates it cannot parse. Drop those:
    # comparing None with str raises TypeError, and a lone None would
    # otherwise be returned instead of falling back to PubDate.
    dates = [date for date in article_dates if date is not None]
    if dates:
        return min(dates)
    pubdate = elem.find("MedlineCitation/Article/Journal/JournalIssue/PubDate")
    return _date_elem_to_str(pubdate)