library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)
library(DT)
library(knitr)
options(stringsAsFactors = FALSE)
write.delim <- function(x, file, sep = '\t', quote = FALSE, row.names = FALSE, na = '', ...) {
  # Write a tab-delimited text file (no quoting, no row names, NAs as empty strings by default)
  write.table(x = x, file = file, sep = sep, quote = quote, row.names = row.names, na = na, ...)
}
Here, we parse the LabeledIn resource, which can be downloaded here. The resource contains two components: an expert-curated set of indications (1) and a crowdsourced set of indications (2).
read.litab <- function(path, fieldnames) {
  # Function to read LabeledIn datatables
  read.table(path, sep = '|', quote = '', comment.char = '', na.strings = '',
             colClasses = 'character', col.names = fieldnames)
}
# Read raw expert LabeledIn output
eli.fields <- c('study_drug_label_ID', 'DailyMed_SPL_ID', 'UMLS_CUIs',
                'IN_RXCUI', 'SCDF_RXCUI', 'SCD_RXCUI', 'Other_SCDF_RXCUI', 'Other_SCD_RXCUI')
eli.df <- file.path('download', 'LabeledIn_Structured_Results.txt') %>%
  read.litab(fieldnames = eli.fields)
# Read raw crowdsourced LabeledIn output
cli.df <- file.path('download', 'Crowdsourcing', 'Crowdsourced_Results.txt') %>%
  read.litab(fieldnames = c(eli.fields, 'majority_vote'))
table(cli.df$majority_vote) %>% as.data.frame() %>% knitr::kable(col.names = c('Majority Vote', 'Count'))
| Majority Vote | Count |
|---|---|
| no-char_risk | 211 |
| no-contra | 131 |
| no-not-disease | 212 |
| no-unrelated | 58 |
| uncertain | 15 |
| yes | 2377 |
Number of labels in each resource:
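A sketch for computing these counts from the objects read above (the values depend on the downloaded files):
# Count distinct drug labels in each LabeledIn component
dplyr::n_distinct(eli.df$study_drug_label_ID)
dplyr::n_distinct(cli.df$study_drug_label_ID)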
expand_indications <- function(df) {
  # Expand a single indication row into one row per disease CUI
  data.frame(
    rxnorm_id = df$rxnorm_id,
    disease_cui = stringr::str_extract_all(df$disease_cuis, "C\\d+")[[1]],
    label_id = df$label_id,
    resource = df$resource)
}
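For illustration, a sketch with a hypothetical one-row input (not real LabeledIn data) showing how a multi-CUI string expands into one row per disease CUI:
# Hypothetical input: each "C"-prefixed CUI in disease_cuis becomes its own row
expand_indications(data.frame(
  rxnorm_id = '000000',
  disease_cuis = 'C0000001 C0000002',  # hypothetical CUIs
  label_id = 'example-label',
  resource = 'expert'))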
combined.df <- eli.df %>%
  dplyr::mutate(resource = 'expert') %>%
  dplyr::bind_rows(
    cli.df %>%
      dplyr::mutate(resource = 'crowd') %>%
      dplyr::filter(majority_vote == 'yes') %>%
      dplyr::select(-majority_vote))
indication.df <- combined.df %>%
  dplyr::transmute(disease_cuis = UMLS_CUIs, rxnorm_id = IN_RXCUI, label_id = study_drug_label_ID, resource) %>%
  dplyr::filter(! is.na(disease_cuis)) %>%
  dplyr::rowwise() %>%
  dplyr::do(expand_indications(.)) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(rxnorm_id, disease_cui) %>%
  dplyr::summarize(
    n_labels = n_distinct(label_id),
    expert = as.integer(any('expert' %in% resource)),
    crowd = as.integer(any('crowd' %in% resource))
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rxnorm_id, desc(n_labels), disease_cui)
indication.df %>% DT::datatable()
Numbers of indications per LabeledIn resource:
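These can be tallied from indication.df (a sketch; output omitted):
# Count indications supported by the expert set, the crowd set, and both
indication.df %>%
  dplyr::summarize(
    total = n(),
    n_expert = sum(expert),
    n_crowd = sum(crowd),
    n_both = sum(expert == 1 & crowd == 1))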
Download CSV-formatted RxNorm from BioPortal for the 2014AA UMLS release.
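If fetching the file programmatically, something like the following may work; the BioPortal REST endpoint, the download_format parameter, and the API-key handling are assumptions to verify against the BioPortal documentation (a specific ontology submission may be needed to match the 2014AA release):
# Sketch: fetch the RxNorm CSV export from BioPortal (endpoint and parameters are assumptions)
bioportal_api_key <- Sys.getenv('BIOPORTAL_API_KEY')  # hypothetical environment variable
download.file(
  url = paste0('https://data.bioontology.org/ontologies/RXNORM/download',
               '?download_format=csv&apikey=', bioportal_api_key),
  destfile = file.path('download', 'RXNORM.csv.gz'), mode = 'wb')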
# Create a data.frame of ingredients (drugs)
ingredient.df <- indication.df %>%
  dplyr::group_by(rxnorm_id) %>%
  dplyr::summarize(n_indications = n()) %>%
  dplyr::left_join(
    y = combined.df %>%
      dplyr::transmute(label_id = study_drug_label_ID, rxnorm_id = IN_RXCUI) %>%
      dplyr::group_by(rxnorm_id) %>%
      dplyr::summarize(n_labels = n_distinct(label_id))
  )
ingredient.df <- file.path('download', 'RXNORM.csv.gz') %>%
  read.csv(colClasses = 'character', check.names = FALSE) %>%
  dplyr::transmute(
    rxnorm_id = stringr::str_replace(`Class ID`, fixed('http://purl.bioontology.org/ontology/RXNORM/'), ''),
    rxnorm_name = `Preferred Label`,
    ingredient_cui = CUI) %>%
  dplyr::right_join(ingredient.df)
ingredient.df %>% DT::datatable()
Download CSV-formatted MeSH from BioPortal for the 2014AA UMLS release.
# Run only once
gz <- file.path('data', 'mesh-umls-map.txt.gz') %>% gzfile('w')
file.path('download', 'MESH.csv.gz') %>%
  read.csv(colClasses = 'character', check.names = FALSE) %>%
  dplyr::transmute(
    mesh_id = stringr::str_replace(`Class ID`, fixed('http://purl.bioontology.org/ontology/MESH/'), ''),
    mesh_name = `Preferred Label`,
    cuis = CUI) %>%
  dplyr::rowwise() %>%
  dplyr::do({data.frame(.,
    cui = stringr::str_split(.$cuis, fixed('|'))[[1]])
  }) %>%
  dplyr::select(-cuis) %>%
  write.delim(gz); close(gz)
Not all UMLS diseases are getting mapped through MeSH, so we will need to find a better solution.
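One way to quantify the coverage gap, using the map file written above (a sketch):
# Fraction of LabeledIn disease CUIs that appear in the MeSH-UMLS map
map.df <- file.path('data', 'mesh-umls-map.txt.gz') %>%
  read.delim(quote = '', comment.char = '')
mean(unique(indication.df$disease_cui) %in% map.df$cui)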
The LabeledIn disease vocabulary is generated by:

> 2.4. Automatic disease recognition
>
> The goal of this module was to identify all disease mentions as indication candidates from the textual descriptions of a given drug label. For this study, we prepared a disease lexicon using two seed ontologies, MeSH and SNOMED-CT, respectively useful for annotating scientific articles [30], [32] and [37] and clinical documents [31], [38] and [39]. The lexicon consists of 77464 concepts taken from: (i) the disease branch in MeSH, and (ii) the 11 disorder semantic types (UMLS disorder semantic types excluding ‘Finding’) in SNOMED-CT as recommended in a recent shared task [30].
>
> As for the automatic tool, we applied MetaMap [27], a highly configurable program used for mapping biomedical texts to the UMLS identifying the mentions, offsets, and associated CUIs. We used the 2012 MetaMap Java API release that uses the 2012AB version of the UMLS Metathesaurus. We experimented with multiple settings of MetaMap, and the optimal setting method for this study is illustrated in Fig. 4.
# Create a data.frame of diseases
disease.df <- indication.df %>%
  dplyr::group_by(disease_cui) %>%
  dplyr::summarize(n_indications = n())
disease.df <- file.path('data', 'mesh-umls-map.txt.gz') %>%
  read.delim(quote = '', comment.char = '') %>%
  dplyr::select(disease_cui = cui, disease_mesh_id = mesh_id, disease_mesh_name = mesh_name) %>%
  dplyr::right_join(disease.df)
disease.df %>% DT::datatable()
Disease and ingredient names are provided for reference only. Many IDs failed to map because we used an indirect but convenient mapping strategy. If integrating this data into another resource, we suggest mapping by identifiers rather than names.
named.df <- indication.df %>%
  dplyr::left_join(disease.df %>% dplyr::select(disease_cui, disease_mesh_name)) %>%
  dplyr::left_join(ingredient.df %>% dplyr::select(rxnorm_id, rxnorm_name))
named.df %T>%
  write.delim(file.path('data', 'indications.tsv')) %>%
  DT::datatable()
1. Khare R, Li J, Lu Z (2014) LabeledIn: Cataloging labeled indications for human drugs. Journal of Biomedical Informatics. doi:10.1016/j.jbi.2014.08.004
2. Khare R, Burger JD, Aberdeen JS, Tresner-Kirsch DW, Corrales TJ, Hirschman L, et al. (2015) Scaling drug indication curation through crowdsourcing. Database. doi:10.1093/database/bav016
To the extent possible under law, Daniel Himmelstein has waived all copyright and related or neighboring rights to Indications. This work is published from: United States.