Here, we parse the SIDER 2 database (1) of side effects and indications. Since the data has been automatically extracted from drug labels, there are quality issues which we aim to address and resolve when possible. We use the term concept to refer to either a side effect or indication.

Load packages

We use caret and kernlab for machine learning; dplyr for data manipulation; ggplot2 and scales for plotting; DT for displaying html tables using the Javascript DataTables library. For multicore parallel processing, we use the doMC package which is not available on Windows and can be commented out. We make extensive use of the pipe operator (dplyr::%>%), which passes its lefthand value to the first argument of the righthand expression.

library(caret)
library(kernlab)
library(dplyr)
library(ggplot2)
library(scales)
library(DT)
library(doMC)

options(stringsAsFactors=FALSE)
doMC::registerDoMC(cores = 6)

write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
  write.table(x = x, file = file, sep=sep, quote=quote, row.names=row.names, na=na, ...)
}

Read Unprocessed SIDER 2 Data

First, we download SIDER 2 data from their website. The three necessary files (adverse_effects_raw.tsv.gz, indications_raw.tsv.gz, label_mapping.tsv.gz) were reteived and and placed the download/ directory. See the SIDER 2 download README for file documentation.

Mode <- function(x) {
  # Returns the most common element in x. Only a single element is returned
  # in case of ties.
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

ReadRawSIDER <- function(path, type) {
  # Read adverse_effects_raw.tsv.gz or indications_raw.tsv.gz.
  # Count the occurrences of each concept in each label and
  # collapse label-concept pairs. 
  fieldnames <- c('label_id', 'concept_id', 'concept_name')
  path %>%
    read.table(sep='\t', col.names=fieldnames, comment.char='', quote='') %>%
    dplyr::group_by(label_id, concept_id) %>%
    dplyr::summarize(concept_name = Mode(concept_name), occurrences = n()) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(type = type)
}

# Read the raw drug label annotations
raw.df <- dplyr::bind_rows(
  file.path('..', 'download', 'adverse_effects_raw.tsv.gz') %>%
    ReadRawSIDER(type = 'side_effect'),
  file.path('..', 'download', 'indications_raw.tsv.gz') %>%
    ReadRawSIDER(type = 'indication'))

DataTable(raw.df, max.rows=200)

# Find the most commonly used name for each concept (indication/side effect)
concept.df <- raw.df %>%
  dplyr::group_by(concept_id) %>%
  dplyr::summarize(concept_name = Mode(concept_name))

DataTable(concept.df, max.rows=200)

# Read the compound to label mappings
fieldnames <- c('generic_names', 'brand_names', 'stitch_mapping',
                'stitch_id_flat', 'stitch_id_sterio', 'label_url', 'label_id')
label.df <- file.path('..', 'download', 'label_mapping.tsv.gz') %>%
  read.table(sep = '\t', col.names = fieldnames, na.strings = '', comment.char = '', quote = '') %>%
  dplyr::mutate(pubchem_cid = abs(stitch_id_sterio)) %>%
  dplyr::filter(! is.na(pubchem_cid))

label.df$pubchem_cid %>% unique() %>% sort() %>% data.frame() %>%
  write.table(file = file.path('..', 'data', 'sider_compounds_pubchem.txt'), row.names=FALSE, col.names=FALSE)

PubchemDataTable(label.df %>% dplyr::select(-stitch_mapping), max.rows=200)

Create a table of all indications and side effects for compounds that mapped to pubchem.

We calculated a variaty of descriptors for each compound-concept pair describing how it appeared in the raw SIDER data.

# Join label-specific side effects and indications to pubchem compounds
pair.df <- label.df %>%
  dplyr::select(pubchem_cid, label_id) %>%
  dplyr::distinct() %>%
  dplyr::inner_join(raw.df)

CollapseTypes <- function(df) {
  # Take a table of all the label-specific side effects and indications
  # for a single compound and collapse into a cross-label table,
  # so each concept only appears once for a given concept and type.
  total_labels <- dplyr::n_distinct(df$label_id)
  df %>%
    dplyr::group_by(type, concept_id) %>%
    dplyr::summarize(
      n_labels = n(),
      occurs_min = min(occurrences),
      #occurs_median = median(occurrences),
      occurs_max = max(occurrences),
      occurs_mean = mean(occurrences),
      occurs_mode = Mode(occurrences),
      occurs_sd = sd(occurrences),
      occurs_iqr = IQR(occurrences)
      ) %>%
    dplyr::mutate(total_labels = total_labels) %>%
    dplyr::mutate(percent_labels = n_labels / total_labels)
}

# Create a cross-label table of side effects and indications for
# all pubchem-mapped compounds.
collapsed.df <- pair.df %>%
  dplyr::group_by(pubchem_cid) %>%
  dplyr::do(CollapseTypes(.)) %>%
  dplyr::ungroup() %>%
  dplyr::left_join(concept.df) %>%
  dplyr::select(pubchem_cid, concept_id, concept_name, total_labels,
                type, n_labels, percent_labels, occurs_min:occurs_iqr)

Identify conflicting side effects and indications

We assume that if a concept is a side effect for a compound, it cannot also be a side effect and vice versa. In other words, if a compound cannot both cause and treat the same concept. Since the drug labels were machine-parsed, there were many examples of a concept being recognized as a side effect and also an indication.

# Find the conflicting side effects and indications:
# When a compound-concept pair has been extracted
# as both a side effect and indication.
conflict.df <- dplyr::intersect(
  dplyr::filter(collapsed.df, type == 'indication') %>%
    dplyr::select(pubchem_cid:total_labels),
  dplyr::filter(collapsed.df, type == 'side_effect') %>%
    dplyr::select(pubchem_cid:total_labels))

ColNameAppend <- function(df, s) {
  keep.df <- dplyr::select(df, pubchem_cid:total_labels)
  append.df <- dplyr::select(df, n_labels:occurs_iqr)
  colnames(append.df) <- paste0(colnames(append.df), '_', s)
  dplyr::bind_cols(keep.df, append.df)
}

conflict.df <- conflict.df %>%
  dplyr::inner_join(
    dplyr::filter(collapsed.df, type == 'indication') %>% ColNameAppend(s = 'ind')) %>%
  dplyr::inner_join(
    dplyr::filter(collapsed.df, type == 'side_effect') %>% ColNameAppend(s = 'se')) %>%
  dplyr::mutate(n_label_ratio = n_labels_ind / n_labels_se,
                percent_label_ratio = percent_labels_ind / percent_labels_se) %>%
  dplyr::left_join(
    collapsed.df %>% dplyr::group_by(concept_id) %>%
    dplyr::summarize(concept_n_ind = sum(type == 'indication'),
                     concept_n_se = sum(type == 'side_effect'))) %>% 
  dplyr::left_join(
    collapsed.df %>% dplyr::group_by(pubchem_cid) %>%
    dplyr::summarize(compound_n_ind = sum(type == 'indication'),
                     compound_n_se = sum(type == 'side_effect')))

Manually create a conflict gold standard

Overall, compound-concept pairs conflicted – extacted as both a side effect and indication. We manually classified a random subset of the conflicts as either side effects or indications. Based on our high-confidence manual classification – called a gold standard – we looked whether our descriptors could potentially be used to resolve conflicts.

conflict.gold.df <- file.path('..', 'gold', 'conflicts.txt') %>% 
  read.delim(na.strings = '', colClasses=c(category_manual='factor')) %>% 
  dplyr::select(pubchem_cid, concept_id, category_manual) %>% 
  dplyr::left_join(conflict.df)

ggplot(conflict.gold.df, aes(n_labels_se, n_labels_ind)) +
  geom_abline(intercept = 0, slope = 1, linetype='dashed', color='grey') +
  geom_point(aes(color=category_manual, size=total_labels), alpha = 0.6) +
  theme_bw() + coord_fixed() +
  scale_colour_manual(name = 'Manual Class', values = c('blue', 'red')) +
  scale_size_area(name='Total Labels', max_size = 7) +
  xlab('Side Effect Labels') + ylab('Indication Labels')

Pairs extracted as both an indication and side effect

Manual classification was performed for 130 side effect-indication conflicts. The prevalance of indications in this gold standard dataset was 70%.

Modeling conflict resolution

Using our descriptors as features, we see how accurately we can predict the actual status of a conflict. We used a support vector machine for classification.

GetX <- function(df) {
  X <- dplyr::select(.data = df, total_labels:compound_n_se)
  X[is.na(X)] <- 0
  return(X)
}

X.gold <- GetX(conflict.gold.df)
X.conflict <- GetX(conflict.df)
y.gold <- conflict.gold.df$category_manual

set.seed(0)
control <- caret::trainControl(method = 'repeatedcv', repeats=10, classProb=TRUE)
tune.grid <- expand.grid(.sigma = kernlab::sigest(x = as.matrix(X.gold), frac = 0.5, scaled = TRUE), .C = 2 ^ (-10:2))
conflict.model <- caret::train(x = X.gold, y = y.gold, preProcess = c('center', 'scale'), 
  method = 'svmRadial', metric = 'Kappa', trControl = control, tuneGrid = tune.grid)
ggplot(conflict.model) + theme_bw() + 
  scale_x_log10(breaks = scales::trans_breaks('log2', function(x) 2 ^ x),
                labels = scales::trans_format('log2', scales::math_format(2 ^ .x)))

SVM Parameter Optimization

y.conflict.df <- predict(conflict.model, newdata = X.conflict, type = 'prob')
conflict.df$prob_ind <- y.conflict.df$indication
conflict.df$prob_se <- y.conflict.df$side_effect

# Add category_manual column
conflict.df <- conflict.df %>% dplyr::left_join(
  y = conflict.gold.df %>% dplyr::select(pubchem_cid, concept_id, category_manual))

# Save the conflict.df
conflict.df %>%
  write.table(file = file.path('..', 'data', 'conflict.txt'), sep='\t', quote = FALSE, row.names=FALSE, na = '')

Using predictions to resolve conflicts

We used the probabilities generated by our model to resolve conflicts.

prob_cutoff_ind <- 0.75
prob_cutoff_se <- 0.75
stopifnot(prob_cutoff_ind + prob_cutoff_se >= 1)

indication_drop.df <- conflict.df %>%
  dplyr::filter(prob_ind < prob_cutoff_ind) %>%
  dplyr::mutate(type='indication')

side_effect_drop.df <- conflict.df %>%
  dplyr::filter(prob_se < prob_cutoff_se) %>%
  dplyr::mutate(type='side_effect')

conflict.stats <- list(
  n_ind = nrow(conflict.df) - nrow(indication_drop.df),
  n_se = nrow(conflict.df) - nrow(side_effect_drop.df),
  n_unresolved = nrow(indication_drop.df) + nrow(side_effect_drop.df) - nrow(conflict.df)
)

non_conflict.df <- collapsed.df %>%
  dplyr::anti_join(y = indication_drop.df, by = c('pubchem_cid', 'concept_id', 'type')) %>%
  dplyr::anti_join(y = side_effect_drop.df, by = c('pubchem_cid', 'concept_id', 'type'))

# set.seed(0)
# non_conflict.df %>% dplyr::filter(type == 'indication') %>% dplyr::sample_frac(1) %>%
#   write.delim(file = file.path('..', 'data', 'indications.txt'))
# non_conflict.df %>% dplyr::filter(type == 'side_effect') %>% dplyr::sample_frac(1) %>%
#   write.delim(file = file.path('..', 'data', 'side_effects.txt'))
non_conflict.df %>% 
  write.delim(file.path('..', 'data', 'sider2-processed.txt'))

Conflicting compound-concept pairs were classified as indications if \(Prob(Indication) \geq 0.75\) and as side effects if \(Prob(Side Effect) \geq 0.75\). Of the conflicts, 2143 were classified as indications, 510 were classified as side effects, and 1409 were unresolved and excluded as both indications and side effects.

After resolving conflicts, 105569 side effects remained covering 1035 compounds and 4854 side effects. Conversely, 10585 indications remained for 1006 compounds and 2168 diseases.

Side effect and Indication Gold Standards

Next, we manually classified a random subset of indications as true or false.

non_conflict.df <- non_conflict.df %>%
  dplyr::inner_join(
    non_conflict.df %>% dplyr::group_by(concept_id) %>%
    dplyr::summarize(concept_n_ind = sum(type == 'indication'),
                     concept_n_se = sum(type == 'side_effect'))) %>% 
  dplyr::inner_join(
    non_conflict.df %>% dplyr::group_by(pubchem_cid) %>%
    dplyr::summarize(compound_n_ind = sum(type == 'indication'),
                     compound_n_se = sum(type == 'side_effect')))

gold.ind.df <- file.path('..', 'gold', 'indications.txt') %>% 
  read.delim(na.strings = '', colClasses=c(category_manual='factor')) %>% 
  dplyr::left_join(non_conflict.df)
  
gold.se.df <- file.path('..', 'gold', 'side_effects.txt') %>% 
  read.delim(na.strings = '', colClasses=c(category_manual='factor')) %>% 
  dplyr::left_join(non_conflict.df)

prop.test.ind <- sum(gold.ind.df$category_manual == 1) %>% prop.test(n = nrow(gold.ind.df))
prop.test.se <- sum(gold.se.df$category_manual == 1) %>% prop.test(n = nrow(gold.se.df))
prec.stats <- list(
  'ind.prec.est' = 100 * prop.test.ind$estimate,
  'ind.prec.lower' = 100 * prop.test.ind$conf.int[1],
  'ind.prec.upper' = 100 * prop.test.ind$conf.int[2],
  'se.prec.est' = 100 * prop.test.se$estimate,
  'se.prec.lower' = 100 * prop.test.se$conf.int[1],
  'se.prec.upper' = 100 * prop.test.se$conf.int[2]
) %>% lapply(format, digits = 3)

We manually evaluated 101 random indications and 100 random side effects to assess the precision of automated side effect and indication extraction. The precision of indications was 63.4% [95% CI: 53.1–72.6%]. The precision of side effects was 92% [95% CI: 84.4–96.2%]

Can we predict which indications are false positives?

X.gold <- GetX(gold.se.df)
y.gold <- as.factor(gold.se.df$category_manual)

set.seed(0)
control <- caret::trainControl(method = 'repeatedcv', repeats=10, classProb=TRUE)
tune.grid <- expand.grid(.sigma = kernlab::sigest(x = as.matrix(X.gold), frac = 0.5, scaled = TRUE), .C = 2 ^ (-6:6))
ind.model <- caret::train(x = X.gold, y = y.gold, preProcess = c('center', 'scale'), 
  method = 'svmRadial', metric = 'Kappa', trControl = control, tuneGrid = tune.grid)

## maximum number of iterations reached 0.0006709571 -0.0006589537

ggplot(ind.model) + theme_bw() + ylim(c(-0.01, 0.1)) +
  scale_x_log10(breaks = scales::trans_breaks('log2', function(x) 2 ^ x),
                labels = scales::trans_format('log2', scales::math_format(2 ^ .x)))

Vizualizing Side Effects and Indications

# vizualizing pairs
non_conflict.df %>%
  ggplot(aes(x = as.character(pubchem_cid), y = concept_id)) + theme_bw() +
    facet_grid(type ~ ., scales = 'free_y', space = 'free_y') +
    geom_tile(fill= 'black') +
    xlab('Compound') + ylab(NULL) +
    scale_x_discrete(breaks=NULL, expand = c(0.02, 0)) + scale_y_discrete(breaks=NULL, expand = c(0.04, 0))

Sample of processed output

# Display as a javascript datatable
non_conflict.df %>%
  dplyr::select(pubchem_cid:concept_name, n_labels, total_labels, type) %>%
  PubchemDataTable(max.rows=200)

Multiple Sclerosis Indications

non_conflict.df %>%
  dplyr::select(pubchem_cid:concept_name, n_labels, total_labels, type) %>%
  dplyr::filter(concept_name == 'multiple sclerosis') %>%
  PubchemDataTable()

References

1. Kuhn M, Campillos M, Letunic I, Jensen LJ, Bork P (2010) A side effect resource to capture phenotypic effects of drugs. Mol Syst Biol. doi:10.1038/msb.2009.98

To the extent possible under law, Daniel Himmelstein has waived all copyright and related or neighboring rights to SIDER 2. This work is published from: United States.

Parsing SIDER 2

Daniel Himmelstein, Leo Brueggeman, Sergio Baranzini

February 3, 2015