library(magrittr)
# Load the transformed feature table and drop every column whose name matches
# 'dwpc.+[CD][pt][CD]'. The pattern is unanchored, so it removes the dwpc_,
# pdwpc_ and rdwpc_ variants of a matching metapath alike.
# NOTE(review): presumably these are metapaths that pass through a
# Compound/Disease palliates/treats edge -- confirm against the abbreviations.
tran_df = dplyr::select(
  readr::read_tsv('data/matrix/rephetio-v2.0/transformed-features.tsv.bz2'),
  -matches('dwpc.+[CD][pt][CD]')
)
# Show the last two rows as a quick sanity check.
tail(tran_df, 2)
compound_id | disease_id | status | disease_name | compound_name | prior_logit | degree_CbG | degree_CcSE | degree_CdG | degree_CiPC | ⋯ | rdwpc_CuGuDrDrD | dwpc_CuGuDuGaD | pdwpc_CuGuDuGaD | rdwpc_CuGuDuGaD | dwpc_CuGuDuGdD | pdwpc_CuGuDuGdD | rdwpc_CuGuDuGdD | dwpc_CuGuDuGuD | pdwpc_CuGuDuGuD | rdwpc_CuGuDuGuD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | DB00550 | DOID:1793 | 0 | pancreatic cancer | Propylthiouracil | -4.38 | 2.0947 | 4.6822 | 3.0931 | 0.88137 | ⋯ | 0.2925 | 0.45171 | 0.4277 | 0.024005 | 0.59664 | 1.1107 | -0.5141 | 1.546 | 1.1833 | 0.36269 |
2 | DB01409 | DOID:1793 | 0 | pancreatic cancer | Tiotropium | -3.6497 | 2.7765 | 5.3936 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# auroc_df = readr::read_tsv('data/feature-performance/auroc.tsv') %>%
# dplyr::filter(! grepl('[CD][pt][CD]', feature))
# Design matrices, one per predictor set. Each one begins with prior_logit and
# the degree_* columns; they differ only in which path-count columns follow.

# dwpc_* columns only
X_dwpc = as.matrix(dplyr::select(
  tran_df, prior_logit, starts_with('degree'), starts_with('dwpc')
))

# pdwpc_* columns only
X_pdwpc = as.matrix(dplyr::select(
  tran_df, prior_logit, starts_with('degree'), starts_with('pdwpc')
))

# rdwpc_* columns only
X_rdwpc = as.matrix(dplyr::select(
  tran_df, prior_logit, starts_with('degree'), starts_with('rdwpc')
))

# pdwpc_* followed by rdwpc_* columns
X_split = as.matrix(dplyr::select(
  tran_df, prior_logit, starts_with('degree'), starts_with('pdwpc'), starts_with('rdwpc')
))

# every column whose name contains 'dwpc' (dwpc_, pdwpc_ and rdwpc_)
X_all = as.matrix(dplyr::select(
  tran_df, prior_logit, starts_with('degree'), contains('dwpc')
))

# Report rows x columns for three of the matrices.
dim(X_rdwpc)
dim(X_split)
dim(X_all)
# Fit one model per single-type predictor set via hetior::glmnet_train with
# alpha = 0 (in glmnet, alpha = 0 selects the ridge penalty; presumably the
# argument is forwarded unchanged -- confirm in hetior).
# penalty.factor is 0 for prior_logit and 1 for every other column. The vector
# is rebuilt from each matrix's own column names, matching how the split/all
# fits are set up, instead of reusing the one derived from X_dwpc; that way it
# cannot drift out of step with the matrix it is applied to.
penalty = ifelse(colnames(X_dwpc) == 'prior_logit', 0, 1)
fit_dwpc = hetior::glmnet_train(X_dwpc, tran_df$status, alpha = 0, penalty.factor=penalty, cores=10)
penalty = ifelse(colnames(X_pdwpc) == 'prior_logit', 0, 1)
fit_pdwpc = hetior::glmnet_train(X_pdwpc, tran_df$status, alpha = 0, penalty.factor=penalty, cores=10)
penalty = ifelse(colnames(X_rdwpc) == 'prior_logit', 0, 1)
fit_rdwpc = hetior::glmnet_train(X_rdwpc, tran_df$status, alpha = 0, penalty.factor=penalty, cores=10)
Loading required package: Matrix Loading required package: foreach Loaded glmnet 2.0-5
# Split model (pdwpc + rdwpc): every column except prior_logit gets penalty
# factor 1; prior_logit gets 0.
penalty = ifelse(colnames(X_split) != 'prior_logit', 1, 0)
fit_split = hetior::glmnet_train(
  X_split, tran_df$status,
  alpha = 0, penalty.factor = penalty, cores = 10
)

# Combined model (all *dwpc* columns), with the same penalty scheme.
penalty = ifelse(colnames(X_all) != 'prior_logit', 1, 0)
fit_all = hetior::glmnet_train(
  X_all, tran_df$status,
  alpha = 0, penalty.factor = penalty, cores = 10
)

# Display the auroc / auprc / tjur entries of each fit's vtm element.
fit_dwpc$vtm[c('auroc', 'auprc', 'tjur')]
fit_pdwpc$vtm[c('auroc', 'auprc', 'tjur')]
fit_rdwpc$vtm[c('auroc', 'auprc', 'tjur')]
fit_split$vtm[c('auroc', 'auprc', 'tjur')]
fit_all$vtm[c('auroc', 'auprc', 'tjur')]
# Swap every observation's prior for one constant: the logit of the mean of
# status. Note that this overwrites the prior_logit column of the design
# matrices in place, so from here on they no longer hold the training values.
uniform_prior = boot::logit(mean(tran_df$status))
X_all[, 'prior_logit'] = uniform_prior
X_split[, 'prior_logit'] = uniform_prior
X_rdwpc[, 'prior_logit'] = uniform_prior
X_pdwpc[, 'prior_logit'] = uniform_prior
X_dwpc[, 'prior_logit'] = uniform_prior

# One row per observation: the original prior (back-transformed from the logit
# scale) next to each model's prediction on the uniform-prior matrices.
pred_df = dplyr::mutate(
  dplyr::select(
    dplyr::mutate(tran_df, prior = boot::inv.logit(prior_logit)),
    disease_name, compound_name, status, prior
  ),
  dwpc = hetior::glmnet_predict(fit_dwpc$cv_model, X = X_dwpc),
  pdwpc = hetior::glmnet_predict(fit_pdwpc$cv_model, X = X_pdwpc),
  rdwpc = hetior::glmnet_predict(fit_rdwpc$cv_model, X = X_rdwpc),
  split = hetior::glmnet_predict(fit_split$cv_model, X = X_split),
  all = hetior::glmnet_predict(fit_all$cv_model, X = X_all)
)

# Disabled alternative: take the y_pred stored on each fit instead of
# re-predicting on the uniform-prior matrices.
# pred_df = tran_df %>%
# dplyr::mutate(prior = boot::inv.logit(prior_logit)) %>%
# dplyr::select(disease_name, compound_name, status, prior) %>%
# dplyr::mutate(dwpc = fit_dwpc$y_pred) %>%
# dplyr::mutate(pdwpc = fit_pdwpc$y_pred) %>%
# dplyr::mutate(rdwpc = fit_rdwpc$y_pred) %>%
# dplyr::mutate(split = fit_split$y_pred) %>%
# dplyr::mutate(all = fit_all$y_pred)

# Preview the first two rows.
head(pred_df, 2)
disease_name | compound_name | status | prior | dwpc | pdwpc | rdwpc | split | all | |
---|---|---|---|---|---|---|---|---|---|
1 | lymphatic system cancer | Cyclosporine | 0 | 0.04741684 | 0.7308761 | 0.7337899 | 0.7956562 | 0.7747835 | 0.7524858 |
2 | lymphatic system cancer | Reserpine | 0 | 0.01106874 | 0.683073 | 0.7337899 | 0.7239236 | 0.7374077 | 0.7174384 |
# Pairwise correlations between the prediction columns (prior through all),
# rounded to three decimals and then expressed as percentages.
# cor() returns NA, with a warning, for any column that has zero variance.
100 * round(cor(dplyr::select(pred_df, prior:all)), 3)
Warning message: In cor(.): the standard deviation is zero
prior | dwpc | pdwpc | rdwpc | split | all | |
---|---|---|---|---|---|---|
prior | 100.0 | 16.1 | NA | 14.8 | 10.4 | 11.5 |
dwpc | 16.1 | 100.0 | NA | 88.5 | 87.0 | 91.0 |
pdwpc | NA | NA | 100 | NA | NA | NA |
rdwpc | 14.8 | 88.5 | NA | 100.0 | 96.5 | 96.5 |
split | 10.4 | 87.0 | NA | 96.5 | 100.0 | 99.4 |
all | 11.5 | 91.0 | NA | 96.5 | 99.4 | 100.0 |
# Long format: the prior:all columns are stacked into predictor_set (the former
# column name) and prediction (its value); predictions are then multiplied by 100.
pred_long_df = dplyr::mutate(
  tidyr::gather(pred_df, predictor_set, prediction, prior:all),
  prediction = prediction * 100
)
# Self-join of the long predictions so that each observation appears once for
# every (predictor_set_a, predictor_set_b) combination.
# The join keys are spelled out instead of relying on the implicit natural join,
# which avoids the "Joining by" message and keeps the keys fixed if columns are
# later added to pred_long_df.
# NOTE(review): the keys are names rather than identifiers; this assumes
# (disease_name, compound_name) identifies an observation uniquely -- confirm.
pair_df = dplyr::inner_join(
  pred_long_df %>%
    dplyr::rename(predictor_set_a = predictor_set, prediction_a = prediction),
  pred_long_df %>%
    dplyr::rename(predictor_set_b = predictor_set, prediction_b = prediction),
  by = c('disease_name', 'compound_name', 'status')
)
# Hexbin scatter-plot matrix: predictor set A on the x axis against predictor
# set B on the y axis, one facet per pair, with bin counts on a log10 fill scale.
ggplot2::ggplot(pair_df, ggplot2::aes(x = prediction_a, y = prediction_b)) +
  ggplot2::geom_hex() +
  ggplot2::facet_grid(predictor_set_b ~ predictor_set_a) +
  ggplot2::coord_equal() +
  viridis::scale_fill_viridis(trans = 'log10') +
  ggplot2::theme_bw()
Joining by: c("disease_name", "compound_name", "status")
# Per-predictor-set performance, recomputed from the long prediction table.
# The logical flag is written as TRUE rather than T, since T is an ordinary
# variable that can be reassigned.
# NOTE(review): prediction was multiplied by 100 when pred_long_df was built,
# so any metric that depends on the prediction scale (tjur appears to) is
# reported in percentage points here -- confirm this is intended.
pred_long_df %>%
  dplyr::group_by(predictor_set) %>%
  dplyr::summarize(
    auroc = hetior::calc_vtms(status, prediction, TRUE)$auroc,
    auprc = hetior::calc_vtms(status, prediction, TRUE)$auprc,
    tjur = hetior::calc_vtms(status, prediction, TRUE)$tjur
  )
predictor_set | auroc | auprc | tjur | |
---|---|---|---|---|
1 | all | 0.8084097 | 0.5267688 | 13.02163 |
2 | dwpc | 0.8184448 | 0.5423078 | 14.78216 |
3 | pdwpc | 0.5 | NA | 0 |
4 | prior | 0.8481643 | 0.6083106 | 8.698848 |
5 | rdwpc | 0.811467 | 0.5281131 | 12.40107 |
6 | split | 0.8042283 | 0.5195282 | 12.41278 |
# Preview the first six rows of the long prediction table.
pred_long_df %>% head(6)
disease_name | compound_name | status | predictor_set | prediction | |
---|---|---|---|---|---|
1 | lymphatic system cancer | Cyclosporine | 0 | prior | 4.741684 |
2 | lymphatic system cancer | Reserpine | 0 | prior | 1.106874 |
3 | lymphatic system cancer | Citalopram | 0 | prior | 1.106874 |
4 | lymphatic system cancer | Pregabalin | 0 | prior | 1.106874 |
5 | lymphatic system cancer | Carmustine | 1 | prior | 6.043208 |
6 | lymphatic system cancer | Bleomycin | 1 | prior | 7.389106 |
# Coefficients of the dwpc model, sorted by ascending zcoef and restricted to
# those whose absolute zcoef exceeds 0.1.
dplyr::filter(
  dplyr::arrange(fit_dwpc$coef_df, zcoef),
  abs(zcoef) > 0.1
)
feature | coef | zcoef | |
---|---|---|---|
1 | intercept | 2.612172 | -2.121635 |
2 | dwpc_CbGpPWpGaD | 0.1561111 | 0.1018458 |
3 | dwpc_CbGaDrD | 0.1644525 | 0.1255166 |
4 | dwpc_CrCbGaD | 0.1705254 | 0.1405208 |
5 | dwpc_CbGaD | 0.2958752 | 0.2476545 |
6 | prior_logit | 1.043714 | 1.582035 |
# Plot the cv_model object stored on the pdwpc fit, then the one on the
# combined (all) fit.
plot(x = fit_pdwpc$cv_model)
plot(x = fit_all$cv_model)