set <- as.numeric(commandArgs(TRUE)[1]) domice <- as.numeric(commandArgs(TRUE)[2]) Pgenes <- as.numeric(commandArgs(TRUE)[3]) unkpct <- as.numeric(commandArgs(TRUE)[4]) library(naturalsort) library(cluster) library(Rcpp) library(e1071) library(nnet) library(randomForest) library(missForest) library(class) library(CALIBERrfimpute) ## source("~/Documents/mnem/R/mnems.r"); source("~/Documents/mnem/R/mnems_low.r"); sourceCpp("~/Documents/mnem/src/mm.cpp"); source("~/Documents/nempi/R/nempi_main.r"); source("~/Documents/nempi/R/nempi_low.r"); path <- "~/Mount/Eulershare/perturbseq2/"; set <- 3; domice <- 0; Pgenes <- 10 ## uncomment for leo/euler: source("mnems.r") source("mnems_low.r") ## sourceCpp("mm.cpp") sourceCpp(code=readChar("mm.cpp", file.info("mm.cpp")$size)) source("nempi_main.r") source("nempi_low.r") path <- "/cluster/work/bewi/members/mpirkl/perturbseq2/" acc <- function(a,b) { gamsave <- a gamtmp2 <- b auc <- roc <- 0 ppv <- rec <- spec <- NULL for (cut in c(2,seq(1,0, length.out = 100),-1)) { gamtmp <- apply(gamsave, 2, function(x) { y <- x*0 y[which(x > cut)] <- 1 return(y) }) tp <- sum(gamtmp == 1 & gamtmp2 == 1) fp <- sum(gamtmp == 1 & gamtmp2 == 0) tn <- sum(gamtmp == 0 & gamtmp2 == 0) fn <- sum(gamtmp == 0 & gamtmp2 == 1) ppvtmp <- tp/(tp+fp) if (is.na(ppvtmp)) { ppvtmp <- 0.5 } rectmp <- tp/(tp+fn) spectmp <- 1-tn/(tn+fp) if (length(ppv) > 0) { auc <- auc + (rectmp-rec[length(rec)])*(ppvtmp+ppv[length(ppv)])/2 roc <- roc + (spectmp-spec[length(spec)])*(rectmp+rec[length(rec)])/2 } ppv <- c(ppv, ppvtmp) rec <- c(rec, rectmp) spec <- c(spec, spectmp) } return(list(auc=auc,roc=roc,ppv=ppv,rec=rec,spec=spec)) } aucs <- function(a,b) { auc <- roc <- 0 ppv <- rec <- spec <- NULL for (cut in c(2,seq(1,0, length.out = 100),-1)) { tmp <- a*0 tmp[which(a > cut)] <- 1 tp <- sum(tmp == 1 & b == 1) fp <- sum(tmp == 1 & b == 0) tn <- sum(tmp == 0 & b == 0) fn <- sum(tmp == 0 & b == 1) ppvtmp <- tp/(tp+fp) if (is.na(ppvtmp)) { ppvtmp <- 0.5 } rectmp <- tp/(tp+fn) spectmp <- 1-tn/(tn+fp) if (length(ppv) > 0) { auc <- auc + (rectmp-rec[length(rec)])*(ppvtmp+ppv[length(ppv)])/2 roc <- roc + (spectmp-spec[length(spec)])*(rectmp+rec[length(rec)])/2 } ppv <- c(ppv, ppvtmp) rec <- c(rec, rectmp) spec <- c(spec, spectmp) } return(list(auc=auc,roc=roc,ppv=ppv,rec=rec,spec=spec)) } lods <- readRDS(paste0(path, "L_linnorm_", set, "_sc.rds")) # lods <- lods.bckp colnames(lods) <- gsub("_[0-9].*","",colnames(lods)) if (set %in% c(4,5)) { lods <- lods[,which(!(colnames(lods) == "Unknwn"))] search <- "greedy" if (set == 4) { Sgenes <- sort(sample(getSgenes(lods),Pgenes)) Gamma0 <- getGamma(lods) Gamma0 <- Gamma0[which(rownames(Gamma0) %in% Sgenes),] colnames(lods) <- apply(Gamma0,2,function(x) { y <- paste(sort(rownames(Gamma0)[which(x==1)]),collapse="_") return(y) }) lods <- lods[,which(!(colnames(lods) == ""))] } } else { search <- "exhaustive" } fres <- nem(lods,search=search) phi <- fres$adj n <- ncol(phi) Gamma <- t(mytc(phi))%*%getGamma(lods) Gamma[which(Gamma > 1)] <- 1 prevalence <- sum(Gamma==1)/length(Gamma) phi_gtn_dens <- (sum(mytc(phi)==1)-n)/((n*(n-1))/2) if (set == 4) { if (domice) { file <- paste0("nempi_epistasis_", set, "_", domice, "_", Pgenes, "_", unkpct, ".csv") } else { file <- paste0("nempi_epistasis_", set, "_", Pgenes, "_", unkpct, ".csv") } } else { if (domice) { file <- paste0("nempi_epistasis_", set, "_", domice, "_", unkpct, ".csv") } else { file <- paste0("nempi_epistasis_", set, "_", unkpct, ".csv") } } if (!file.exists(file)) { if (domice) { write.table(matrix(c("nempi","svm","randomForest","neuralNet","knn","missForest","mice","random","phi_acc","phi_gtn_dens","nempi","svm","randomForest","neuralNet","knn","missForest","mice","random2","prevelance"),1),file=file,sep=",",col.names=FALSE,row.names=FALSE) } else { write.table(matrix(c("nempi","svm","randomForest","neuralNet","knn","missForest","random","phi_acc","phi_gtn_dens","nempi","svm","randomForest","neuralNet","knn","missForest","random2","prevelance"),1),file=file,sep=",",col.names=FALSE,row.names=FALSE) } } lods.sub <- lods unknowns <- sample(1:ncol(lods),floor(unkpct*ncol(lods))) colnames(lods.sub)[unknowns] <- "" ## nempi nres <- nempi(lods.sub,search=search) n <- nrow(phi) phiacc <- 1-sum(abs(mytc(phi)-mytc(nres$res$adj)))/(n*(n-1)) Gamma2 <- t(mytc(nres$res$adj))%*%nres$Gamma nacc <- acc(Gamma2,Gamma) nacc2 <- acc(nres$Gamma,getGamma(lods)) ## svm svmres <- classpi(lods.sub) svmacc <- acc(svmres$Gamma,Gamma) svmacc2 <- acc(svmres$Gamma,getGamma(lods)) ## random forest rfres <- classpi(lods.sub,method="randomForest") rfacc <- acc(rfres$Gamma,Gamma) rfacc2 <- acc(rfres$Gamma,getGamma(lods)) ## neural net nnres <- classpi(lods.sub,method="nnet",size=2) nnacc <- acc(nnres$Gamma,Gamma) nnacc2 <- acc(nnres$Gamma,getGamma(lods)) ## knn train <- t(lods.sub[, which(colnames(lods.sub) != "")]) test <- t(lods.sub[, which(colnames(lods.sub) == "")]) cl <- colnames(lods.sub)[which(colnames(lods.sub) != "")] tmp <- knn(train, test, cl, prob=TRUE) lods.sub2 <- lods.sub colnames(lods.sub2)[which(colnames(lods.sub2) %in% "")] <- as.character(tmp) knnres <- list() knnres$Gamma <- getGamma(lods.sub2) knnres$Gamma <- apply(knnres$Gamma, 2, function(x) return(x/sum(x))) knnacc <- acc(knnres$Gamma,Gamma) knnacc2 <- acc(knnres$Gamma,getGamma(lods)) ## missForest mfdata <- cbind(as.data.frame(t(lods.sub)), factor(colnames(lods.sub))) mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA sink("NUL") mfimp <- missForest(mfdata) sink() lods.sub2 <- lods.sub colnames(lods.sub2) <- mfimp$ximp[, ncol(mfimp$ximp)] mfres <- list() mfres$Gamma <- getGamma(lods.sub2) mfres$Gamma <- apply(mfres$Gamma, 2, function(x) return(x/sum(x))) mfacc <- acc(mfres$Gamma,Gamma) mfacc2 <- acc(mfres$Gamma,getGamma(lods)) ## mice: if (domice) { micedata <- mfdata colnames(micedata) <- paste0(LETTERS[1:ncol(micedata)], 1:ncol(micedata)) sink("NUL") miceres <- mice(micedata, method = c(rep('', ncol(micedata)-1), 'rfcat'), m = 1, maxit = 1) sink() lods.sub2 <- lods.sub colnames(lods.sub2)[which(colnames(lods.sub2) %in% "")] <- as.character(miceres$imp[[length(miceres$imp)]][, 1]) miceres <- list() miceres$Gamma <- getGamma(lods.sub2) miceres$Gamma <- apply(miceres$Gamma, 2, function(x) return(x/sum(x))) miceacc <- acc(miceres$Gamma,Gamma) miceacc2 <- acc(miceres$Gamma,getGamma(lods)) } ## random: rand <- getGamma(lods.sub) rand[,unknowns] <- runif(nrow(rand)*length(unknowns),0,1) # sample(c(0,1),nrow(rand)*length(unknowns),replace=TRUE) random <- acc(rand,Gamma) random2 <- acc(rand,getGamma(lods)) if (domice) { write.table(matrix(c(nacc$auc,svmacc$auc,rfacc$auc,nnacc$auc,knnacc$auc,mfacc$auc,miceres$auc,random$auc,phiacc,phi_gtn_dens,nacc2$auc,svmacc2$auc,rfacc2$auc,nnacc2$auc,knnacc2$auc,mfacc2$auc,miceres2$auc,random2$auc,prevalence),1),file=file,append=TRUE,sep=",",col.names=FALSE,row.names=FALSE) } else { write.table(matrix(c(nacc$auc,svmacc$auc,rfacc$auc,nnacc$auc,knnacc$auc,mfacc$auc,random$auc,phiacc,phi_gtn_dens,nacc2$auc,svmacc2$auc,rfacc2$auc,nnacc2$auc,knnacc2$auc,mfacc2$auc,random2$auc,prevalence),1),file=file,append=TRUE,sep=",",col.names=FALSE,row.names=FALSE) } stop("success") ## cluster: system("scp nempi/R/nempi_main.r euler.ethz.ch:") system("scp nempi/R/nempi_low.r euler.ethz.ch:") system("scp mnem/R/mnems_low.r euler.ethz.ch:") system("scp mnem/R/mnems.r euler.ethz.ch:") system("scp testing/nempi/other/perturbseq.r euler.ethz.ch:") rm error.txt rm output.txt rm .RData queue=4 ram=8000 set=4 domice=0 Pgenes=10 unkpct=0.1 if [ ${set} == '4' ] && [ ${Pgenes} == '15' ] then queue=24 fi if [ ${set} == '4' ] && [ ${Pgenes} == '10' ] && [ ${unkpct} == '0.1' ] then queue=24 fi bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "R/bin/R --vanilla --silent --no-save --args '${set}' '${domice}' '${Pgenes}' '${unkpct}' < perturbseq.r" for i in {2..71}; do bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "R/bin/R --vanilla --silent --no-save --args '${set}' '${domice}' '${Pgenes}' '${unkpct}' < perturbseq.r" done for i in {145596019..145596204}; do bkill ${i} done ## figures: supp <- 1 if (supp) { layout.mat <- matrix(c(rep(rep(1:3,each=1),2),rep(rep(4:6,each=1),2), rep(rep(7:9,each=1),2),rep(rep(10:12,each=1),2),13:15),9,byrow=1) height <- 12 width <- 8 } else { layout.mat <- matrix(c(rep(rep(1:3,each=1),2),rep(rep(4:6,each=1),2),7:9),5,byrow=1) height <- 7 width <- 8 } setEPS() postscript("temp.eps", height = height, width = width) par(mar=c(3.85,4,4,1),oma=c(0,0,0,0)) layout(layout.mat) path <- "~/Mount/Euler/" cols <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise", "grey"); data.names <- c("Epistasis 1","Epistasis 2","Epistasis 3","Main (10 P-genes)","Pilot","Main (15 P-genes)") phiacc <- array(NA,c(100,6,3),list(rows=1:100,data=data.names,unkpct=c(0.1,0.5,0.9))) ylab <- "area under the precision-recall curve" for (i in c(5,4,6,1,2,3)) { count <- 0 for (unkpct in c(0.1,0.5,0.9)) { count <- count + 1 if (i == 4) { ylim.min <- 0.2 res <- read.csv(paste0(path, "nempi_epistasis_4_10_", unkpct, ".csv")) } else if (i == 6) { res <- read.csv(paste0(path, "nempi_epistasis_4_15_", unkpct, ".csv")) } else { if (i == 5) { ylim.min <- 0.4 } else { ylim.min <- 0.75 } res <- read.csv(paste0(path, "nempi_epistasis_", i, "_", unkpct, ".csv")) } if (length(res$phi_acc)>0) { phiacc[1:100,i,count] <- res$phi_acc[1:100] } print(dim(res)) res2 <- res[,c(1:7)] if ((supp & unkpct != 0.5) | (!supp & unkpct == 0.5)) { mnem:::myboxplot(res2,col = cols,ylim=c(ylim.min,1),main=paste0(data.names[i], "\nunobserved: ", unkpct),ylab=ylab,box=1,scatter=1,dens=0,xaxt = "n",border = cols,medcol="black",cex.main=1.5,cex.lab=1.25) } } } if (supp) { cex.leg <- 2 } else { cex.leg <- 1.5 } par(mar=rep(0, 4)) plot.new() legend("topleft",legend=c(expression(NEM~pi), "svm", "neural net"),col=c("red", "blue", "darkgreen"),fill=c("red", "blue", "darkgreen"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent") plot.new() legend("topleft",legend=c("random forest", "missForest","knn"),col=c("brown", "orange", "pink"),fill=c("brown", "orange", "turquoise"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent") plot.new() legend("topleft",legend=c("random"),col=c("grey"),fill=c("grey"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent") dev.off() layout.mat <- matrix(c(rep(rep(1:3,each=1),2),rep(rep(4:6,each=1),2), rep(rep(7:9,each=1),2),rep(rep(10:12,each=1),2),13:15),9,byrow=1) height <- 8 width <- 8 setEPS() postscript("temp.eps", height = height, width = width) par(mar=c(3.85,4,4,1),oma=c(0,0,0,0)) layout(layout.mat) path <- "~/Mount/Euler/" cols <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise", "grey"); data.names <- c("Epistasis 1","Epistasis 2","Epistasis 3","Main (10 P-genes)","Pilot","Main (15 P-genes)") ylab <- "area under the precision-recall curve" for (i in c(4,6)) { count <- 0 for (unkpct in c(0.1,0.5,0.9)) { count <- count + 1 if (i == 4) { res <- read.csv(paste0(path, "nempi_epistasis_4_10_", unkpct, ".csv")) } else if (i == 6) { res <- read.csv(paste0(path, "nempi_epistasis_4_15_", unkpct, ".csv")) } print(dim(res)) res2 <- res[,c(1:7)] res3 <- res2[which(res[,9]>=median(res[,9])),] mnem:::myboxplot(res3,col = cols,ylim=c(0.2,1),main=paste0(data.names[i], "\nunobserved: ", unkpct),ylab=ylab,box=1,scatter=1,dens=0,xaxt = "n",border = cols,medcol="black",xlab=expression(dense ~ phi),cex.main=1.5,cex.lab=1.25) res3 <- res2[which(res[,9]<median(res[,9])),] mnem:::myboxplot(res3,col = cols,ylim=c(0.2,1),main=paste0(data.names[i], "\nunobserved: ", unkpct),ylab=ylab,box=1,scatter=1,dens=0,xaxt = "n",border = cols,medcol="black",xlab=expression(sparse ~ phi),cex.main=1.5,cex.lab=1.25) } } cex.leg <- 2 par(mar=rep(0, 4)) plot.new() legend("topleft",legend=c(expression(NEM~pi), "svm", "neural net"),col=c("red", "blue", "darkgreen"),fill=c("red", "blue", "darkgreen"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent") plot.new() legend("topleft",legend=c("random forest", "missForest","knn"),col=c("brown", "orange", "pink"),fill=c("brown", "orange", "turquoise"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent") plot.new() legend("topleft",legend=c("random"),col=c("grey"),fill=c("grey"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent") dev.off() pdf("FigS_crispr_phi.pdf",width=9,height=3) unkpcts <- c(0.1,0.5,0.9) par(mfrow=c(1,3)) for (i in 1:3) { #mnem:::myboxplot(phiacc[,c(1:3,5,4,6),i],col = cols[1],ylim=c(0,1),main=expression(accuracy ~ of ~ phi),ylab="normalised hamming distance",box=1,scatter=1,dens=0,xaxt = "n",border = cols[1],medcol="black") mnem:::myboxplot(phiacc[,c(1:3,5,4,6),i],col = cols[1],ylim=c(0,1),main=bquote(accuracy ~ of ~ phi ~ (unknowns: ~ .(unkpcts[i]))),ylab="normalised hamming distance",box=1,scatter=1,dens=0,xaxt = "n",border = cols[1],medcol="black",cex.main=1.5,cex.lab=1.25) axis(1,1:6,c(expression(Epistasis ~ 1),"Epistasis 2\n","Epistasis 3","Pilot\n","Main (10)","Main (15)\n"),padj=1,las=1) } dev.off() ## old: pdf("temp.pdf",width=10,height=18) par(mfrow=c(6,5)) phiacc <- list() count <- 0 for (unkpct in c(0.1,0.5,0.9)) { count <- count + 1 path <- "~/Mount/Euler/" cols <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise", "grey"); data.names <- c("Epistasis 1","Epistasis 2","Epistasis 3","Main (10 P-genes)","Pilot","Main (15 P-genes)") ylab <- "area under the precision-recall curve" for (i in c(1,2,4,3,5,6)) { if (i == 4) { res <- read.csv(paste0(path, "nempi_epistasis_4_10_", unkpct, ".csv")) } else if (i == 6) { res <- read.csv(paste0(path, "nempi_epistasis_4_15_", unkpct, ".csv")) } else { res <- read.csv(paste0(path, "nempi_epistasis_", i, "_", unkpct, ".csv")) } if (i == 1) { phiacc[[count]] <- res$phi_acc } else { phiacc[[count]] <- cbind(phiacc[[count]],res$phi_acc) } print(dim(res)) res2 <- res[,c(1:7)] mnem:::myboxplot(res2,col = cols,ylim=c(0,1),main=data.names[i],ylab=ylab,box=1,scatter=1,dens=0,xaxt = "n",border = cols,medcol="black") if (i %in% c(4,6)) { res3 <- res2[which(res[,9]>=median(res[,9])),] mnem:::myboxplot(res3,col = cols,ylim=c(0,1),main=data.names[i],ylab=ylab,box=1,scatter=1,dens=0,xaxt = "n",border = cols,medcol="black",xlab=expression(dense ~ phi)) res3 <- res2[which(res[,9]<median(res[,9])),] mnem:::myboxplot(res3,col = cols,ylim=c(0,1),main=data.names[i],ylab=ylab,box=1,scatter=1,dens=0,xaxt = "n",border = cols,medcol="black",xlab=expression(sparse ~ phi)) } } } dev.off() ## mnem: library(Rgraphviz) path <- "~/Mount/Eulershare/perturbseq2/" lods <- readRDS(paste0(path,"L_linnorm_1_sc.rds")) colnames(lods) <- gsub("_[0-9].*","",colnames(lods)) mres <- mnem(lods,search="exhaustive",k=3,complete=TRUE) par(mfrow=c(2,3)) mnem::plotConvergence(mres) plot(mres) mkres <- mnemk(lods,search="exhaustive",complete=TRUE,start=10) par(mfrow=c(1,2)) plot(mkres$lls,type="b") plot(mkres$ics,type="b") par(mfrow=c(2,2)) mnem::plotConvergence(mkres$best) plot(mkres$best,legend=TRUE,showdata=TRUE) pca <- prcomp(lods) cols <- numeric(ncol(lods)) for (i in 1:length(unique(colnames(lods)))) { cols[which(colnames(lods) %in% unique(colnames(lods))[i])] <- i } plot(pca$rotation[,1:2], col = cols)