# File size: 15,333 Bytes
# 7718235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# ---- Configuration and shared inputs ----
# Gene identifiers (they look like UniProt accessions); they are pasted into
# the PreMode/ and ../scripts/PreMode/ paths further down in this script.
genes <- c(
  "Q09428", "P15056", "O00555",
  "P21802", "Q14654", "P07949",
  "Q99250", "Q14524.clean", "P04637"
)
# Display names, matched to `genes` by position.
gene.names <- c(
  "ABCC8", "BRAF", "CACNA1A",
  "FGFR2", "KCNJ11", "RET",
  "SCN2A", "SCN5A", "TP53"
)
# Python interpreter used for the external random-forest script.
py.path <- "/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python"
# Project helpers; plot.AUC() is presumably defined here -- confirm.
source("./AUROC.R")
# Accumulators that the sections below fill in.
summary.df <- data.frame()
plots <- list()
# Project helpers; prepare.unique.id() / prepare.biochemical() are presumably
# defined here -- confirm.
source("./prepare.biochem.R")
# Reference table; "." and "NA" are both read as missing values.
ALL <- prepare.unique.id(
  read.csv("figs/ALL.csv", row.names = 1, na.strings = c(".", "NA"))
)
# Criterion for choosing between the default and large-window models:
# "auc", "loss" or "auc+loss" (anything else always keeps the default model).
pick.cond <- "auc"
# ---- Benchmark: PreMode vs. random forest on sub-sampled training sets ----
# For every gene x training-subset x seed ("fold") this loop
#   1. reads the PreMode predictions (default and large-window variant),
#   2. picks one variant according to `pick.cond`, judged on training data,
#   3. scores the chosen variant on the test set (mean logit over 4 models),
#   4. trains/evaluates a random-forest baseline through an external Python
#      script, once per inner fold, and averages the reported AUCs,
#   5. stores one row per model in `summary.df`.
# The result is written to figs/fig.5e.prepare.csv.
logit.cols <- paste0('logits.FOLD.', 0:3)
loss.cols <- paste0('min_loss.FOLD.', 0:3)
summary.rows <- list()
for (i in seq_along(genes)) {
  gene <- genes[i]
  for (subset in c(1, 2, 4, 6, 8)) {
    # Files for subset 8 carry no '.subset.N' infix in their names.
    subset.tag <- if (subset == 8) '' else paste0('.subset.', subset)
    run.name <- paste0(gene, subset.tag)
    for (fold in 0:4) {
      res.suffix <- paste0(subset.tag, '.fold.', fold, '.4fold.csv')
      gene.test.res <- read.csv(paste0('PreMode/', gene, '/testing', res.suffix))
      log.yaml <- yaml::read_yaml(paste0('../scripts/PreMode/',
                                         run.name, '.5fold/', run.name, '.fold.', fold, '.yaml'))
      # compare with large window
      gene.test.res.lw <- read.csv(paste0('PreMode/', gene, '.large.window/testing', res.suffix))
      gene.train.res <- read.csv(paste0('PreMode/', gene, '/training', res.suffix))
      gene.train.res.lw <- read.csv(paste0('PreMode/', gene, '.large.window/training', res.suffix))
      # Training-set AUC and loss of both variants, used only for model picking.
      tr.auc <- plot.AUC(gene.train.res$score, rowMeans(gene.train.res[, logit.cols]))$auc
      tr.auc.lw <- plot.AUC(gene.train.res.lw$score, rowMeans(gene.train.res.lw[, logit.cols]))$auc
      tr.loss <- rowMeans(gene.train.res[, loss.cols])[1]
      tr.loss.lw <- rowMeans(gene.train.res.lw[, loss.cols])[1]
      # cond == TRUE means "use the large-window model".
      if (pick.cond == 'auc') {
        cond <- tr.auc.lw > tr.auc
      } else if (pick.cond == 'loss') {
        cond <- tr.loss > tr.loss.lw
      } else if (pick.cond == 'auc+loss') {
        cond <- tr.auc.lw/tr.loss.lw > tr.auc/tr.loss
      } else {
        cond <- FALSE
      }
      # do 4 fold auc
      if (cond) {
        auc <- plot.AUC(gene.test.res.lw$score, rowMeans(gene.test.res.lw[, logit.cols]))
      } else {
        auc <- plot.AUC(gene.test.res$score, rowMeans(gene.test.res[, logit.cols]))
      }
      # do random forest
      gene.train <- read.csv(paste0('../', log.yaml$data_file_train))
      gene.test <- read.csv(paste0('../', log.yaml$data_file_test))
      # get the same training/val split
      # NOTE(review): py_load_object unpickles the file; only load trusted data.
      fold.splits <- reticulate::py_load_object(paste0('../', log.yaml$log_dir, '/fold_split.pkl'))
      # prepare unique id
      gene.train <- prepare.unique.id(gene.train)
      gene.test <- prepare.unique.id(gene.test)
      train.biochem <- prepare.biochemical(ALL[match(gene.train$unique.id, ALL$unique.id),])
      test.biochem <- prepare.biochemical(ALL[match(gene.test$unique.id, ALL$unique.id),])
      rownames(train.biochem) <- gene.train[,1]
      rownames(test.biochem) <- gene.test[,1]
      # The test inputs do not depend on the inner fold: write them once.
      test.biochem.file <- tempfile()
      test.label.file <- tempfile()
      write.csv(test.biochem, file = test.biochem.file)
      write.csv(gene.test, file = test.label.file)
      rf.aucs <- numeric(4)
      for (f in 1:4) {
        # get split info
        # presumably [[1]] holds GoF and [[2]] LoF validation ids -- confirm
        val.gof.idx <- fold.splits[[1]][[f]]
        val.lof.idx <- fold.splits[[2]][[f]]
        # keep the rows whose first-column id is not in this fold's validation set
        train.idx <- !gene.train[,1] %in% c(val.gof.idx, val.lof.idx)
        train.biochem.file <- tempfile()
        train.label.file <- tempfile()
        write.csv(train.biochem[train.idx,],
                  file = train.biochem.file)
        write.csv(gene.train[train.idx,], file = train.label.file)
        # call python on random forest
        res <- system(paste0(py.path, ' ',
                             'random.forest.glof.py ',
                             train.biochem.file, ' ',
                             train.label.file, ' ',
                             test.biochem.file, ' ',
                             test.label.file), intern = TRUE)
        # the AUC is taken from the text after '=' on the first output line
        rf.aucs[f] <- as.numeric(strsplit(res, split = '=')[[1]][2])
        unlink(c(train.biochem.file, train.label.file))
      }
      unlink(c(test.biochem.file, test.label.file))
      # One AUC per model, in the same order as the `model` column below.
      # (The original also referenced an undefined `el.aucs` here, which
      # stopped the script; no elastic-net model is fitted in this loop.)
      aucs <- c(auc$auc, mean(rf.aucs))
      summary.rows[[length(summary.rows) + 1]] <-
        data.frame(auc=aucs,
                   use.lw=c(cond, NA),
                   model=c('PreMode.transfer', 'random.forest'),
                   seed=fold,
                   gene=gene.names[i],
                   subset=subset,
                   ngof.train=sum(gene.train$score==1),
                   nlof.train=sum(gene.train$score==-1),
                   ngof.test=sum(gene.test$score==1),
                   nlof.test=sum(gene.test$score==-1))
    }
  }
}
# Bind once at the end instead of growing the data frame inside the loop.
summary.df <- rbind(summary.df, do.call(rbind, summary.rows))
write.csv(summary.df, file = 'figs/fig.5e.prepare.csv')
library(ggplot2)
library(patchwork)

# ---- Overview: one AUC-vs-training-size panel per gene, all models ----
summary.df <- read.csv('figs/fig.5e.prepare.csv', row.names = 1)
# `model` is read back as character; the dodge offset below needs integer
# codes, and as.numeric() on a character column yields NA (the summary
# layers would then be dropped), so convert it to a factor first.
summary.df$model <- factor(summary.df$model)
num.models <- length(unique(summary.df$model))
plots <- list()
for (i in seq_along(genes)) {
  task <- gene.names[i]
  task.res <- summary.df[startsWith(summary.df$gene, task),]
  # drop the columns that are NA in the first row of this gene
  task.res <- task.res[,!is.na(task.res[1,])]
  # x tick labels: "<n GoF> | <n LoF>" of the training set, one per subset
  data.points <- paste0(task.res$ngof.train[task.res$seed==0 & task.res$model=="PreMode.transfer"],
                        " | ",
                        task.res$nlof.train[task.res$seed==0 & task.res$model=="PreMode.transfer"])
  p <- ggplot(task.res, aes(x=subset, y=auc, col=model)) +
    geom_point(alpha=0.2) +
    # geom_line(aes(y=zero.shot), linetype="dotted") +
    stat_smooth(geom='line', span=0.3, se = FALSE, alpha=0.5) + scale_y_continuous(breaks=seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
    scale_x_continuous(breaks=c(1, 2, 4, 6, 8),
                       labels=paste0(data.points,
                                     c(" (10%)", " (20%)", " (40%)", " (60%)", " (80%)"))) +
    # mean +/- SE per model, shifted sideways so the models do not overlap
    stat_summary(data = task.res,
                 aes(x=as.numeric((subset))+0.4*(as.numeric((model)))/num.models-0.2*(num.models+1)/num.models,
                     y = auc, col=model),
                 fun.data = mean_se, geom = "errorbar", width = 0.2) +
    stat_summary(data = task.res,
                 aes(x=as.numeric((subset))+0.4*(as.numeric((model)))/num.models-0.2*(num.models+1)/num.models,
                     y = auc, col=model),
                 fun.data = mean_se, geom = "point") +
    theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle(paste0(task)) + ggeasy::easy_center_title() + xlab("training data size, format: GoF | LoF (%)")
  plots[[i]] <- p
}
# Combine all panels in a three-column grid.
p <- wrap_plots(plots, ncol = 3)

# ---- Per-gene panels: supervised PreMode vs. random forest ----
summary.df <- read.csv('figs/fig.5e.prepare.csv', row.names = 1)
summary.df <- summary.df[summary.df$model %in% c('PreMode.transfer', 'random.forest'),]
# Map the internal model ids to the labels shown in the legend.
model.dic <- c("PreMode.transfer"="Supervised: PreMode",
               "random.forest"="Supervised: Random Forest")
summary.df$model <- model.dic[summary.df$model]
summary.df$model <- factor(summary.df$model, levels = c("Supervised: PreMode",
                                                        "Supervised: Random Forest"))
# From here on `gene.names` is the gene order found in the results file.
gene.names <- unique(summary.df$gene)

plots <- list()
library(patchwork)
num.models <- length(unique(summary.df$model))
for (i in seq_along(genes)) {
  task <- gene.names[i]
  task.res <- summary.df[startsWith(summary.df$gene, task),]
  # drop the columns that are NA in the first row of this gene
  task.res <- task.res[,!is.na(task.res[1,])]
  # x tick labels: "<n GoF> | <n LoF>" of the training set, one per subset
  data.points <- paste0(task.res$ngof.train[task.res$seed==0 & task.res$model=="Supervised: PreMode"],
                        " | ",
                        task.res$nlof.train[task.res$seed==0 & task.res$model=="Supervised: PreMode"])
  p <- ggplot(task.res, aes(x=subset, y=auc, col=model)) +
    # invisible points: they only feed the scales, the summaries are drawn below
    geom_point(alpha=0) +
    # geom_line(aes(y=zero.shot), linetype="dotted") +
    stat_smooth(geom='line', span=0.3, se = FALSE, alpha=0.5) +
    scale_y_continuous(breaks=seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
    scale_x_continuous(breaks=c(1, 2, 4, 6, 8),
                       labels=paste0(data.points,
                                     c(" (10%)", " (20%)", " (40%)", " (60%)", " (80%)"))) +
    # mean +/- SE per model, shifted sideways so the models do not overlap
    stat_summary(data = task.res,
                 aes(x=as.numeric((subset))+0.4*(as.numeric((model)))/num.models-0.2*(num.models+1)/num.models,
                     y = auc, col=model),
                 fun.data = mean_se, geom = "errorbar", width = 0.2) +
    stat_summary(data = task.res,
                 aes(x=as.numeric((subset))+0.4*(as.numeric((model)))/num.models-0.2*(num.models+1)/num.models,
                     y = auc, col=model),
                 fun.data = mean_se, geom = "point") +
    theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1),
                       legend.position="bottom",
                       legend.direction="horizontal") +
    ggtitle(paste0(task)) + ggeasy::easy_center_title() + xlab("training data size, format: GoF | LoF (%)")
  # Only panel 5 keeps its colour legend. guides(color = FALSE) is deprecated
  # in ggplot2; "none" is the supported way to switch a guide off.
  if (i != 5) {
    p <- p + guides(color = "none")
  }
  plots[[i]] <- p
}
library(ggpubr)
# 3 x 3 grid in a fixed display order, sharing one legend at the bottom.
p <- ggarrange(plots[[6]], plots[[5]], plots[[3]],
               plots[[2]], plots[[8]], plots[[7]],
               plots[[9]], plots[[1]], plots[[4]],
          ncol=3, nrow=3, common.legend = TRUE, legend="bottom")

# plot weighted average
# ---- Per-gene mean AUC +/- SE over seeds, one panel per gene ----
summary.df <- read.csv('figs/fig.5e.prepare.csv', row.names = 1)
summary.df <- summary.df[summary.df$model %in% c('PreMode.transfer', 'random.forest'),]
model.dic <- c("PreMode.transfer"="PreMode",
               "random.forest"="Random Forest")
summary.df$model <- model.dic[summary.df$model]
summary.df$model <- factor(summary.df$model, levels = c("PreMode",  "Random Forest"))
# plot the task weighted averages as well as task size weighted error bars
# One row per (model, gene, subset): start from the seed-0 rows and overwrite
# `auc` with the mean over all seeds; `auc.se` holds its standard error.
uniq.result.plot <- summary.df[summary.df$seed==0,]
uniq.result.plot$auc.se <- NA_real_
for (i in seq_len(nrow(uniq.result.plot))) {
  aucs <- summary.df$auc[summary.df$model==uniq.result.plot$model[i] &
                           summary.df$gene==uniq.result.plot$gene[i] &
                           summary.df$subset==uniq.result.plot$subset[i]]
  # aucs <- aucs[aucs > 0]
  uniq.result.plot$auc[i] <- mean(aucs, na.rm=TRUE)
  # divide by the number of non-missing values so that the SE is consistent
  # with the NA-removing mean and sd
  uniq.result.plot$auc.se[i] <- sd(aucs, na.rm=TRUE) / sqrt(sum(!is.na(aucs)))
}
task.dic <- unique(uniq.result.plot$gene)
plots <- list()
# NOTE(review): despite its name this holds the model labels, not a count,
# and it is not used again in the visible part of the script.
num.models <- unique(uniq.result.plot$model)
library(patchwork)
for (i in seq_along(task.dic)) {
  # NOTE(review): the panel title uses the identifier from `genes`, while the
  # rows are selected by `gene.names` -- confirm which title is intended.
  task <- (genes)[i]
  task.res <- uniq.result.plot[uniq.result.plot$gene == gene.names[i],]
  # drop the columns that are NA in the first row of this gene
  task.res <- task.res[,!is.na(task.res[1,])]
  # x tick labels: "<n GoF> | <n LoF>" of the training set, one per subset
  data.points <- paste0(task.res$ngof.train[task.res$seed==0 & task.res$model=="PreMode"],
                        " | ",
                        task.res$nlof.train[task.res$seed==0 & task.res$model=="PreMode"])
  p <- ggplot(task.res, aes(x=subset, y=auc, col=model)) +
    geom_point() +
    geom_errorbar(aes(ymin=auc-auc.se, ymax=auc+auc.se), width=.4) +
    # geom_line(aes(y=zero.shot), linetype="dotted") +
    geom_line() +
    scale_y_continuous(breaks=seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
    scale_x_continuous(breaks=c(1, 2, 4, 6, 8),
                       labels=paste0(data.points,
                                     c(" (10%)", " (20%)", " (40%)", " (60%)", " (80%)"))) +
    # the plotted quantity is the AUC (the label used to say 'Spearman rho')
    ylab('AUC') +
    theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle(paste0(task)) + ggeasy::easy_center_title() + xlab("training data size (%)")
  plots[[i]] <- p
}
# 3 x 3 grid in a fixed display order, sharing one legend at the bottom.
p <- ggarrange(plots[[6]], plots[[5]], plots[[3]],
               plots[[2]], plots[[8]], plots[[7]],
               plots[[9]], plots[[1]], plots[[4]],
               ncol=3, nrow=3, common.legend = TRUE, legend="bottom")

# aggregate across models
# ---- Weighted average over genes, per model and training-subset size ----
# One row per (model, subset); `auc` and `auc.se` are replaced by averages
# over genes, weighted by the training-set size of each gene.
uniq.model.result.plot <- uniq.result.plot[!duplicated(uniq.result.plot[,c('model', "subset")]),]
for (i in seq_len(nrow(uniq.model.result.plot))) {
  sel <- uniq.result.plot$model == uniq.model.result.plot$model[i] &
    uniq.result.plot$subset == uniq.model.result.plot$subset[i]
  aucs <- uniq.result.plot$auc[sel]
  auc.ses <- uniq.result.plot$auc.se[sel]
  # get data set sizes
  # Taken from the PreMode rows of seed 0; this relies on the genes appearing
  # in the same order as in `aucs` (weighted.mean stops if the lengths differ).
  size.sel <- summary.df$seed==0 &
    summary.df$model=="PreMode" &
    summary.df$subset == uniq.model.result.plot$subset[i]
  ngof <- summary.df$ngof.train[size.sel]
  nlof <- summary.df$nlof.train[size.sel]
  # use harmonic prior of data points
  data.points <- 1 / (1/ngof + 1/nlof)
  # na.rm drops a missing value together with its weight; the previous
  # sum(x * w, na.rm = T) / sum(w) kept the weight of a missing value in the
  # denominator and so pulled the average towards zero.
  uniq.model.result.plot$auc[i] <- weighted.mean(aucs, data.points, na.rm = TRUE)
  uniq.model.result.plot$auc.se[i] <- weighted.mean(auc.ses, data.points, na.rm = TRUE)
}
p <- ggplot(uniq.model.result.plot, aes(x=subset, y=auc, col=model)) +
  geom_point() +
  geom_errorbar(aes(ymin=auc-auc.se, ymax=auc+auc.se), width=.2) +
  geom_line() +
  scale_y_continuous(breaks=seq(0.4, 1, 0.2), limits = c(0.4, 1.0)) +
  scale_x_continuous(breaks=c(1, 2, 4, 6, 8),
                     labels=paste0(c(" (10%)", " (20%)", " (40%)", " (60%)", " (80%)"))) +
  ylab('AUC') +
  theme_bw() +
  theme(axis.text.x = element_text(angle=60, vjust = 1, hjust = 1),
        text = element_text(size = 16),
        plot.title = element_text(size=15),
        legend.text = element_text(size=10),
        axis.title.x = element_text(size=12),
        legend.position="bottom",
        legend.direction="horizontal") +
  ggtitle("Weighted Average of Model AUC\non subsample of training") +
  ggeasy::easy_center_title() + xlab("training data size (% of full G/LoF dataset)")
# Only this final plot is written to disk.
ggsave('figs/fig.5e.pdf', p, width = 4, height = 5)