library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)
library(DT)
library(knitr)
# Keep strings as character vectors rather than converting them to factors
# when data frames are built.
options(stringsAsFactors = FALSE)
# Write `x` to `file` as delimited text.
#
# Thin wrapper around write.table() whose defaults produce a plain
# tab-separated file: no quoting, no row names, and missing values written
# as empty fields. Anything passed through `...` is forwarded unchanged to
# write.table().
write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
  utils::write.table(
    x,
    file,
    sep = sep,
    quote = quote,
    row.names = row.names,
    na = na,
    ...
  )
}
Here, we parse the LabeledIn resource, which can be downloaded here. The resource contains two components: an expert-curated set of indications (1) and a crowdsourced set of indications (2).
read.litab <- function(path, fieldnames) {
  # Read one pipe-delimited LabeledIn table from `path`.
  #
  # Every column is read as character and named from `fieldnames`. Quote and
  # comment processing are switched off, so quote characters and '#' are kept
  # as literal text; empty fields become NA.
  read.table(
    file = path,
    col.names = fieldnames,
    colClasses = 'character',
    sep = '|',
    na.strings = '',
    quote = '',
    comment.char = ''
  )
}
# Column names applied to the expert LabeledIn table
eli.fields <- c(
  'study_drug_label_ID', 'DailyMed_SPL_ID', 'UMLS_CUIs', 'IN_RXCUI',
  'SCDF_RXCUI', 'SCD_RXCUI', 'Other_SCDF_RXCUI', 'Other_SCD_RXCUI'
)

# Load the raw expert LabeledIn output
eli.df <- read.litab(
  file.path('download', 'LabeledIn_Structured_Results.txt'),
  fieldnames = eli.fields
)

# Load the raw crowdsourced LabeledIn output; it is read with the same
# columns plus a trailing 'majority_vote' column
cli.df <- read.litab(
  file.path('download', 'Crowdsourcing', 'Crowdsourced_Results.txt'),
  fieldnames = c(eli.fields, 'majority_vote')
)

# Tabulate the crowdsourced rows by their majority vote
knitr::kable(
  as.data.frame(table(cli.df$majority_vote)),
  col.names = c('Majority Vote', 'Count')
)
Majority Vote | Count |
---|---|
no-char_risk | 211 |
no-contra | 131 |
no-not-disease | 212 |
no-unrelated | 58 |
uncertain | 15 |
yes | 2377 |
Number of labels in:
# Expand indication records into one row per extracted disease CUI.
#
# Args:
#   df: an object whose `$disease_cuis`, `$rxnorm_id`, `$label_id` and
#       `$resource` elements are read. `disease_cuis` holds strings from
#       which every match of the pattern "C\\d+" is extracted. One record
#       or several parallel records are accepted.
#
# Returns:
#   A data.frame with columns `rxnorm_id`, `disease_cui`, `label_id` and
#   `resource`, containing one row per extracted CUI. Records in which no
#   CUI is found contribute no rows (a zero-row data.frame if nothing
#   matches at all) instead of raising a "differing number of rows" error.
expand_indications <- function(df) {
  # One character vector of matches per input record
  cui_list <- stringr::str_extract_all(df$disease_cuis, "C\\d+")

  # For every extracted CUI, the index of the record it came from; used to
  # repeat the record-level fields alongside their CUIs
  record <- rep(seq_along(cui_list), lengths(cui_list))

  data.frame(
    rxnorm_id = df$rxnorm_id[record],
    # as.character() keeps the column typed when there are no matches at all
    disease_cui = as.character(unlist(cui_list, use.names = FALSE)),
    label_id = df$label_id[record],
    resource = df$resource[record],
    # Do not depend on the global option for character-vs-factor columns
    stringsAsFactors = FALSE
  )
}
# Stack the expert rows and the crowdsourced rows into one table, tagging each
# row with the resource it came from. Only crowdsourced rows whose majority
# vote is 'yes' are kept, and the vote column is dropped so that both halves
# share the same columns.
combined.df <- dplyr::bind_rows(
  dplyr::mutate(eli.df, resource = 'expert'),
  cli.df %>%
    dplyr::filter(majority_vote == 'yes') %>%
    dplyr::select(-majority_vote) %>%
    dplyr::mutate(resource = 'crowd')
)
# Build the indication table: one row per (rxnorm_id, disease_cui) pair.
indication.df <- combined.df %>%
  # Keep and rename only the columns needed downstream
  dplyr::select(
    disease_cuis = UMLS_CUIs,
    rxnorm_id = IN_RXCUI,
    label_id = study_drug_label_ID,
    resource
  ) %>%
  # Rows without any CUI string cannot be expanded
  dplyr::filter(!is.na(disease_cuis)) %>%
  # Expand each row into one row per extracted CUI
  dplyr::rowwise() %>%
  dplyr::do(expand_indications(.)) %>%
  dplyr::ungroup() %>%
  # Collapse to one row per drug-disease pair
  dplyr::group_by(rxnorm_id, disease_cui) %>%
  dplyr::summarize(
    # Number of distinct labels supporting the pair
    n_labels = dplyr::n_distinct(label_id),
    # 1/0 flags: was the pair seen in the expert / crowdsourced resource?
    expert = as.integer('expert' %in% resource),
    crowd = as.integer('crowd' %in% resource)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rxnorm_id, dplyr::desc(n_labels), disease_cui)

# Show the result as an interactive table
DT::datatable(indication.df)