# File size: 4,834 Bytes
library(ggplot2)
# ---- Configuration ----------------------------------------------------------
# Per-task label columns. The *names* of each inner vector ("score.1", ...)
# are the label columns read from the test-set data frames further down.
task.dic <- list(
  Stab = c(score.1 = "stability.1", score.2 = "stability.2")
)
# Python interpreter used to launch the Elastic Net baseline script.
py.path <- "/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python"
# The 20 standard amino-acid letters.
alphabet_premode <- c(
  "L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
  "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C"
)
# Tasks to evaluate; each entry must be a key of `task.dic`.
genes <- c("Stab")
# Baseline predictors, looked up as columns of the annotated test CSVs.
scores <- c(
  "AlphaMissense", "gMVP", "PrimateAI", "REVEL", "ESM1b.LLR", "FoldXddG"
)
# Model result directories (trailing slash included) and their display names.
models <- c("PreMode/", "ESM.SLP/")
models.dic <- setNames(c("PreMode", "ESM+SLP"), models)
# Project helpers. NOTE(review): presumably provides plot.R2,
# prepare.unique.id and prepare.biochemical used below -- confirm.
source("./AUROC.R")
# ESM token alphabet: special tokens, the 20 amino acids, then extra symbols.
alphabet <- c(
  "<cls>", "<pad>", "<eos>", "<unk>",
  "L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
  "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C",
  "X", "B", "U", "Z", "O", ".", "-",
  "<null_1>", "<mask>"
)
# Evaluate every predictor on every task and cross-validation fold.
#
# For each (task, fold) this produces:
#   * one row per entry of `scores` (baseline predictors followed by the model
#     directories in `models`), holding the mean over label columns of
#     abs(plot.R2(labels, predictions)$R2);
#   * one "Elastic Net" row, obtained by running `elastic.net.dms.py` on
#     biochemical features of the train/test splits.
# All rows are collected into `result.df` (columns: model, HGNC, fold,
# npoints, stab.rho).
result.df <- NULL
scores <- c(scores, models)

# Display names: model directories are mapped through `models.dic`, baseline
# predictors keep their own name. Identical for every fold, so computed once.
model.names <- scores
is.model <- model.names %in% models
model.names[is.model] <- unname(models.dic[model.names[is.model]])

# Collect per-fold frames in a list and bind once at the end instead of
# growing `result.df` with rbind() on every iteration.
fold.results <- list()

for (i in seq_along(genes)) {
  gene <- genes[i]
  label.cols <- names(task.dic[[gene]])
  n.tasks <- length(label.cols)

  for (fold in 0:4) {
    # Annotated test split: carries the labels and the baseline score columns.
    test.df <- read.csv(paste0('PreMode/', gene, '/',
                               '/test.fold.', fold, '.annotated.csv'))

    stab.r <- rep(NA_real_, length(scores))
    for (k in seq_along(scores)) {
      score <- scores[k]
      if (score %in% models) {
        # Model predictions live in their own file. They are kept in a
        # separate variable so the annotated test frame is never overwritten
        # (previously `dms.df` was clobbered here, which made the baseline
        # branch and `npoints` depend on the ordering of `scores`).
        pred.df <- read.csv(paste0(score, gene, '/',
                                   '/testing.fold.', fold, '.csv'))
        # Prediction columns are zero-indexed: logits.0, logits.1, ...
        all.r <- abs(plot.R2(pred.df[, label.cols],
                             pred.df[, paste0('logits.',
                                              seq_len(n.tasks) - 1)])$R2)
      } else {
        # A baseline has a single score column; it is compared against each
        # label column by selecting it once per task.
        all.r <- abs(plot.R2(test.df[, label.cols],
                             test.df[, rep(score, n.tasks)])$R2)
      }
      stab.r[k] <- mean(all.r)
    }

    fold.results[[length(fold.results) + 1L]] <- data.frame(
      model = model.names,
      HGNC = gene,
      fold = fold,
      npoints = nrow(test.df),
      stab.rho = stab.r
    )

    # ---- Elastic Net baseline on biochemical features ----
    dms.train.df <- read.csv(paste0('PreMode/', gene, '/',
                                    '/train.fold.', fold, '.annotated.csv'))
    dms.train.df <- prepare.unique.id(dms.train.df)
    dms.df <- prepare.unique.id(test.df)
    gene.train.biochem <- prepare.biochemical(dms.train.df)
    gene.test.biochem <- prepare.biochemical(dms.df)

    # Hand features and labels to the Python script through temporary CSVs,
    # listed in the positional order the command line uses.
    tmp.files <- c(train.biochem = tempfile(),
                   train.label = tempfile(),
                   test.biochem = tempfile(),
                   test.label = tempfile())
    res <- tryCatch({
      write.csv(gene.train.biochem, file = tmp.files[["train.biochem"]])
      write.csv(dms.train.df, file = tmp.files[["train.label"]])
      write.csv(gene.test.biochem, file = tmp.files[["test.biochem"]])
      write.csv(dms.df, file = tmp.files[["test.label"]])
      # Quote every argument so paths containing spaces survive the shell.
      system(paste(shQuote(py.path),
                   'elastic.net.dms.py',
                   paste(shQuote(tmp.files), collapse = ' ')),
             intern = TRUE)
    }, finally = unlink(tmp.files))  # always remove the temporary CSVs

    # Each output line is split on '='; the second field of every line is
    # taken as a numeric value and the values are averaged.
    baseline.auc.3 <- list(
      R2 = as.numeric(as.data.frame(strsplit(res, split = '='))[2, ])
    )
    fold.results[[length(fold.results) + 1L]] <- data.frame(
      model = c('Elastic Net'),
      HGNC = gene,  # was hard-coded to 'Stab'
      fold = fold,
      npoints = nrow(dms.df),
      stab.rho = c(mean(baseline.auc.3$R2))
    )
  }
}
# NULL when there was nothing to evaluate, matching the initial value.
result.df <- do.call(rbind, fold.results)
# Persist the per-fold results. Make sure the output directory exists first so
# the results computed above are not lost to a missing folder.
dir.create('./figs', showWarnings = FALSE, recursive = TRUE)
write.csv(result.df, './figs/fig.sup.6.csv')

# Summarise across folds: one row per (model, HGNC). The fold-0 rows serve as
# the template; `stab.rho` is replaced by the plain (unweighted) mean over all
# folds and `stab.rho.sd` holds the matching standard deviation.
uniq.result.plot <- result.df[result.df$fold == 0, ]
# Create the column explicitly rather than relying on recycling from an
# indexed assignment into a column that does not exist yet.
uniq.result.plot$stab.rho.sd <- rep(NA_real_, nrow(uniq.result.plot))
for (row in seq_len(nrow(uniq.result.plot))) {
  fold.values <- result.df$stab.rho[
    result.df$model == uniq.result.plot$model[row] &
      result.df$HGNC == uniq.result.plot$HGNC[row]
  ]
  uniq.result.plot$stab.rho[row] <- mean(fold.values, na.rm = TRUE)
  uniq.result.plot$stab.rho.sd[row] <- sd(fold.values, na.rm = TRUE)
}
# Dot plot of the across-fold summary: one point per model at its mean
# `stab.rho`, with a horizontal bar spanning mean -/+ one standard deviation.
fold.spread <- geom_errorbarh(
  aes(xmin = stab.rho - stab.rho.sd, xmax = stab.rho + stab.rho.sd),
  height = .2
)
p <- ggplot(data = uniq.result.plot, mapping = aes(x = stab.rho, y = model))
p <- p + geom_point() + fold.spread
# Disabled alternatives kept from the original script:
# geom_errorbar(aes(ymin=other.rho-other.rho.sd, ymax=other.rho+other.rho.sd))
# geom_abline(slope = 1, intercept = 0, linetype = "dashed", alpha=0.2)
p <- p + scale_shape_manual(values = 11:18)
p <- p + ggtitle("Spearman Correlation (5 Fold testing)")
p <- p + theme_bw() + ggeasy::easy_center_title()
# Write the figure next to the CSV of per-fold results.
ggsave(filename = 'figs/fig.sup.6.pdf', plot = p, height = 4, width = 5)