library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)
library(DT)
library(knitr)

# Keep strings as character vectors instead of factors when data frames are
# built or read (FALSE is already the default from R 4.0.0 onwards)
options(stringsAsFactors = FALSE)

write.delim <- function(x,
                        file,
                        sep = '\t',
                        quote = FALSE,
                        row.names = FALSE,
                        na = '',
                        ...) {
  # Write a table as delimited text.
  #
  # Thin wrapper around write.table whose defaults produce a tab-separated
  # file without quoting, without row names and with missing values written
  # as empty fields. Anything passed through ... is forwarded to write.table.
  utils::write.table(
    x, file,
    sep = sep,
    quote = quote,
    row.names = row.names,
    na = na,
    ...
  )
}

Here, we parse the LabeledIn resource, which can be downloaded here. The resource contains two components: an expert-curated set of indications (1) and a crowdsourced set of indications (2).

read.litab <- function(path, fieldnames) {
  # Load one LabeledIn results file into a data.frame.
  #
  # Args:
  #   path: location of the pipe-delimited text file.
  #   fieldnames: character vector of column names to assign.
  #
  # Returns:
  #   data.frame in which every column is character. Quote and comment
  #   characters are treated as ordinary text and empty fields become NA.
  litab <- utils::read.table(
    file = path,
    sep = '|',
    quote = '',
    comment.char = '',
    na.strings = '',
    colClasses = 'character',
    col.names = fieldnames
  )
  litab
}

# Column names of the expert LabeledIn results file
eli.fields <- c(
  'study_drug_label_ID', 'DailyMed_SPL_ID', 'UMLS_CUIs', 'IN_RXCUI',
  'SCDF_RXCUI', 'SCD_RXCUI', 'Other_SCDF_RXCUI', 'Other_SCD_RXCUI')

# Load the raw expert LabeledIn output
eli.df <- read.litab(
  path = file.path('download', 'LabeledIn_Structured_Results.txt'),
  fieldnames = eli.fields)

# Load the raw crowdsourced LabeledIn output, which has the expert columns
# followed by one extra majority_vote column
cli.df <- read.litab(
  path = file.path('download', 'Crowdsourcing', 'Crowdsourced_Results.txt'),
  fieldnames = c(eli.fields, 'majority_vote'))

# Tabulate how often each majority_vote value occurs in the crowdsourced set
knitr::kable(
  as.data.frame(table(cli.df$majority_vote)),
  col.names = c('Majority Vote', 'Count')
)
| Majority Vote  | Count |
|:---------------|------:|
| no-char_risk   |   211 |
| no-contra      |   131 |
| no-not-disease |   212 |
| no-unrelated   |    58 |
| uncertain      |    15 |
| yes            |  2377 |

Number of labels in:

expand_indications <- function(df) {
  # Expand indication rows into one row per (drug, disease CUI) pair.
  #
  # Args:
  #   df: data.frame (or a single row of one) with the columns rxnorm_id,
  #       disease_cuis, label_id and resource. disease_cuis is a string
  #       holding zero or more identifiers of the form C<digits>.
  #
  # Returns:
  #   data.frame with the columns rxnorm_id, disease_cui, label_id and
  #   resource, where every input row is repeated once per identifier found
  #   in its disease_cuis. A row containing no identifier contributes no
  #   output rows; a missing (NA) disease_cuis contributes a single row whose
  #   disease_cui is NA.
  cui_strings <- as.character(df$disease_cuis)

  # One character vector of identifiers per input row
  cui_matches <- regmatches(cui_strings,
                            gregexpr("C\\d+", cui_strings, perl = TRUE))
  # regmatches drops NA inputs; keep them as a single NA identifier
  cui_matches[is.na(cui_strings)] <- list(NA_character_)

  # Repeat each row's other columns once per identifier so that all columns
  # line up, including the case where a row has no identifier at all
  n_matches <- lengths(cui_matches)
  data.frame(
    rxnorm_id = rep(df$rxnorm_id, times = n_matches),
    disease_cui = as.character(unlist(cui_matches)),
    label_id = rep(df$label_id, times = n_matches),
    resource = rep(df$resource, times = n_matches),
    stringsAsFactors = FALSE)
}

# Stack the expert rows on top of the crowdsourced rows whose majority_vote is
# 'yes', tagging every row with the resource it came from
combined.df <- dplyr::bind_rows(
  eli.df %>%
    dplyr::mutate(resource = 'expert'),
  cli.df %>%
    dplyr::filter(majority_vote == 'yes') %>%
    dplyr::select(-majority_vote) %>%
    dplyr::mutate(resource = 'crowd'))

# Collapse the label-level rows into one row per drug-disease pair
indication.df <- combined.df %>%
  # Keep only the four columns used below, under shorter names
  dplyr::select(
    disease_cuis = UMLS_CUIs,
    rxnorm_id = IN_RXCUI,
    label_id = study_drug_label_ID,
    resource) %>%
  dplyr::filter(!is.na(disease_cuis)) %>%
  # Split each row into one row per disease CUI
  dplyr::rowwise() %>%
  dplyr::do(expand_indications(.)) %>%
  dplyr::ungroup() %>%
  # Count distinct labels per pair and flag which resources reported it
  dplyr::group_by(rxnorm_id, disease_cui) %>%
  dplyr::summarise(
    n_labels = dplyr::n_distinct(label_id),
    expert = as.integer('expert' %in% resource),
    crowd = as.integer('crowd' %in% resource)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rxnorm_id, dplyr::desc(n_labels), disease_cui)


# Show the indications as an interactive table
DT::datatable(indication.df)