library(dplyr)
library(DT)
library(ggplot2)
# Read text columns as character vectors rather than factors.
options(stringsAsFactors = FALSE)
# Symptoms from the HSDN (Human symptoms–disease network)
# Disease Ontology cross-references, reduced to the MeSH ('MSH') entries.
# The MeSH identifier column is exposed as `disease_id`.
# Alternative source, currently unused:
#   'http://git.dhimmel.com/disease-ontology/data/xrefs-prop-slim.tsv'
mesh.df <-
  read.delim('http://git.dhimmel.com/disease-ontology/data/xrefs-slim.tsv') %>%
  dplyr::filter(resource == 'MSH') %>%
  dplyr::rename(disease_id = resource_id) %>%
  # every remaining row has resource == 'MSH', so the column carries no information
  dplyr::select(-resource)
# HSDN supplement (already mapped to MeSH), with its columns given
# snake_case names. The first column of the file is used as row names.
hsdn.df <-
  read.delim(
    'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Combined-Output.tsv',
    # leave headers containing spaces untouched so they can be renamed below
    check.names = FALSE,
    row.names = 1
  ) %>%
  dplyr::rename(
    symptom_id   = `MeSH Symptom ID`,
    symptom_name = `MeSH Symptom Term`,
    disease_id   = `MeSH Disease ID`,
    disease_name = `MeSH Disease Term`,
    tfidf_score  = `TFIDF score`,
    cooccurs     = `PubMed occurrence`
  )
# Attach the columns of mesh.df to each symptom-disease row of hsdn.df.
# Being an inner join, rows whose disease_id has no match in mesh.df are
# dropped.
# The key is named explicitly so the join cannot silently widen to any other
# column name the two tables might share, and no "Joining by" message is
# emitted.
hsdn.df <- hsdn.df %>%
  dplyr::inner_join(mesh.df, by = 'disease_id')
# Save the joined table as a tab-separated file.
path <- file.path('data', 'symptoms-DO.tsv')
# write.table() does not create missing directories, so make sure the target
# directory exists first (a no-op when it is already there).
dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
write.table(hsdn.df, path, sep = '\t', row.names = FALSE, quote = FALSE)
# Interactive table limited to rows with tfidf_score >= 25.
DT::datatable(hsdn.df %>% dplyr::filter(tfidf_score >= 25))
# Histogram of tfidf_score on a log10 x axis.
# No binwidth is supplied, so ggplot2 uses its default binning and prints a
# stat_bin notice suggesting that `binwidth` be set.
ggplot(hsdn.df, aes(x = tfidf_score)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10() +
  theme_bw()
# Histogram of tfidf_score on a linear axis, restricted to 0-50, with
# unit-wide bins.
ggplot(hsdn.df, aes(x = tfidf_score)) +
  geom_histogram(binwidth = 1, alpha = 0.6) +
  xlim(0, 50) +
  theme_bw()
# Violin plot of tfidf_score (log10 scale) for each DO slim disease, drawn
# horizontally so the disease names sit on the vertical axis.
ggplot(hsdn.df, aes(x = doid_name, y = tfidf_score)) +
  geom_violin(fill = 'darkgreen', color = NA) +
  # geom_jitter(alpha = 0.4) +
  scale_y_log10() +
  coord_flip() +
  theme_bw()
# Per DO slim disease: the number of symptom rows, plus the count and the
# percentage (rounded to 3 decimals) of rows with tfidf_score >= 25.
# The result is shown as an interactive table.
hsdn.df %>%
  dplyr::group_by(doid_code, doid_name) %>%
  dplyr::summarize(
    n_symptoms = dplyr::n(),
    number_above_25 = sum(tfidf_score >= 25),
    percent_above_25 = round(100 * mean(tfidf_score >= 25), 3)
  ) %>%
  # summarize() only peels off the last grouping variable, so the result is
  # still grouped by doid_code; drop that before handing the table on.
  dplyr::ungroup() %>%
  DT::datatable()