Here, we parse the SIDER 2 database (1) of side effects and indications. Since the data has been automatically extracted from drug labels, there are quality issues which we aim to address and resolve when possible. We use the term concept to refer to either a side effect or indication.

Load packages

We use caret and kernlab for machine learning; dplyr for data manipulation; ggplot2 and scales for plotting; DT for displaying html tables using the Javascript DataTables library. For multicore parallel processing, we use the doMC package which is not available on Windows and can be commented out. We make extensive use of the pipe operator (dplyr::%>%), which passes its lefthand value to the first argument of the righthand expression.

library(caret)
library(kernlab)
library(dplyr)
library(ggplot2)
library(scales)
library(DT)
library(doMC)

options(stringsAsFactors=FALSE)
doMC::registerDoMC(cores = 6)

write.delim <- function(x, file, sep='\t', quote = FALSE, row.names=FALSE, na = '', ...) {
  write.table(x = x, file = file, sep=sep, quote=quote, row.names=row.names, na=na, ...)
}

Read Unprocessed SIDER 2 Data

First, we download SIDER 2 data from their website. The three necessary files (adverse_effects_raw.tsv.gz, indications_raw.tsv.gz, label_mapping.tsv.gz) were reteived and and placed the download/ directory. See the SIDER 2 download README for file documentation.

Mode <- function(x) {
  # Returns the most common element in x. Only a single element is returned
  # in case of ties.
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

ReadRawSIDER <- function(path, type) {
  # Read adverse_effects_raw.tsv.gz or indications_raw.tsv.gz.
  # Count the occurrences of each concept in each label and
  # collapse label-concept pairs. 
  fieldnames <- c('label_id', 'concept_id', 'concept_name')
  path %>%
    read.table(sep='\t', col.names=fieldnames, comment.char='', quote='') %>%
    dplyr::group_by(label_id, concept_id) %>%
    dplyr::summarize(concept_name = Mode(concept_name), occurrences = n()) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(type = type)
}

# Read the raw drug label annotations
raw.df <- dplyr::bind_rows(
  file.path('..', 'download', 'adverse_effects_raw.tsv.gz') %>%
    ReadRawSIDER(type = 'side_effect'),
  file.path('..', 'download', 'indications_raw.tsv.gz') %>%
    ReadRawSIDER(type = 'indication'))

DataTable(raw.df, max.rows=200)

# Find the most commonly used name for each concept (indication/side effect)
concept.df <- raw.df %>%
  dplyr::group_by(concept_id) %>%
  dplyr::summarize(concept_name = Mode(concept_name))

DataTable(concept.df, max.rows=200)

# Read the compound to label mappings
fieldnames <- c('generic_names', 'brand_names', 'stitch_mapping',
                'stitch_id_flat', 'stitch_id_sterio', 'label_url', 'label_id')
label.df <- file.path('..', 'download', 'label_mapping.tsv.gz') %>%
  read.table(sep = '\t', col.names = fieldnames, na.strings = '', comment.char = '', quote = '') %>%
  dplyr::mutate(pubchem_cid = abs(stitch_id_sterio)) %>%
  dplyr::filter(! is.na(pubchem_cid))

label.df$pubchem_cid %>% unique() %>% sort() %>% data.frame() %>%
  write.table(file = file.path('..', 'data', 'sider_compounds_pubchem.txt'), row.names=FALSE, col.names=FALSE)

PubchemDataTable(label.df %>% dplyr::select(-stitch_mapping), max.rows=200)

Create a table of all indications and side effects for compounds that mapped to pubchem.

We calculated a variaty of descriptors for each compound-concept pair describing how it appeared in the raw SIDER data.

# Join label-specific side effects and indications to pubchem compounds
pair.df <- label.df %>%
  dplyr::select(pubchem_cid, label_id) %>%
  dplyr::distinct() %>%
  dplyr::inner_join(raw.df)

CollapseTypes <- function(df) {
  # Take a table of all the label-specific side effects and indications
  # for a single compound and collapse into a cross-label table,
  # so each concept only appears once for a given concept and type.
  total_labels <- dplyr::n_distinct(df$label_id)
  df %>%
    dplyr::group_by(type, concept_id) %>%
    dplyr::summarize(
      n_labels = n(),
      occurs_min = min(occurrences),
      #occurs_median = median(occurrences),
      occurs_max = max(occurrences),
      occurs_mean = mean(occurrences),
      occurs_mode = Mode(occurrences),
      occurs_sd = sd(occurrences),
      occurs_iqr = IQR(occurrences)
      ) %>%
    dplyr::mutate(total_labels = total_labels) %>%
    dplyr::mutate(percent_labels = n_labels / total_labels)
}

# Create a cross-label table of side effects and indications for
# all pubchem-mapped compounds.
collapsed.df <- pair.df %>%
  dplyr::group_by(pubchem_cid) %>%
  dplyr::do(CollapseTypes(.)) %>%
  dplyr::ungroup() %>%
  dplyr::left_join(concept.df) %>%
  dplyr::select(pubchem_cid, concept_id, concept_name, total_labels,
                type, n_labels, percent_labels, occurs_min:occurs_iqr)

Identify conflicting side effects and indications

We assume that if a concept is a side effect for a compound, it cannot also be a side effect and vice versa. In other words, if a compound cannot both cause and treat the same concept. Since the drug labels were machine-parsed, there were many examples of a concept being recognized as a side effect and also an indication.

# Find the conflicting side effects and indications:
# When a compound-concept pair has been extracted
# as both a side effect and indication.
conflict.df <- dplyr::intersect(
  dplyr::filter(collapsed.df, type == 'indication') %>%
    dplyr::select(pubchem_cid:total_labels),
  dplyr::filter(collapsed.df, type == 'side_effect') %>%
    dplyr::select(pubchem_cid:total_labels))

ColNameAppend <- function(df, s) {
  keep.df <- dplyr::select(df, pubchem_cid:total_labels)
  append.df <- dplyr::select(df, n_labels:occurs_iqr)
  colnames(append.df) <- paste0(colnames(append.df), '_', s)
  dplyr::bind_cols(keep.df, append.df)
}

conflict.df <- conflict.df %>%
  dplyr::inner_join(
    dplyr::filter(collapsed.df, type == 'indication') %>% ColNameAppend(s = 'ind')) %>%
  dplyr::inner_join(
    dplyr::filter(collapsed.df, type == 'side_effect') %>% ColNameAppend(s = 'se')) %>%
  dplyr::mutate(n_label_ratio = n_labels_ind / n_labels_se,
                percent_label_ratio = percent_labels_ind / percent_labels_se) %>%
  dplyr::left_join(
    collapsed.df %>% dplyr::group_by(concept_id) %>%
    dplyr::summarize(concept_n_ind = sum(type == 'indication'),
                     concept_n_se = sum(type == 'side_effect'))) %>% 
  dplyr::left_join(
    collapsed.df %>% dplyr::group_by(pubchem_cid) %>%
    dplyr::summarize(compound_n_ind = sum(type == 'indication'),
                     compound_n_se = sum(type == 'side_effect')))

Manually create a conflict gold standard

Overall, compound-concept pairs conflicted – extacted as both a side effect and indication. We manually classified a random subset of the conflicts as either side effects or indications. Based on our high-confidence manual classification – called a gold standard – we looked whether our descriptors could potentially be used to resolve conflicts.

conflict.gold.df <- file.path('..', 'gold', 'conflicts.txt') %>% 
  read.delim(na.strings = '', colClasses=c(category_manual='factor')) %>% 
  dplyr::select(pubchem_cid, concept_id, category_manual) %>% 
  dplyr::left_join(conflict.df)

ggplot(conflict.gold.df, aes(n_labels_se, n_labels_ind)) +
  geom_abline(intercept = 0, slope = 1, linetype='dashed', color='grey') +
  geom_point(aes(color=category_manual, size=total_labels), alpha = 0.6) +
  theme_bw() + coord_fixed() +
  scale_colour_manual(name = 'Manual Class', values = c('blue', 'red')) +
  scale_size_area(name='Total Labels', max_size = 7) +
  xlab('Side Effect Labels') + ylab('Indication Labels')