load("~/Documents/nempi_backup/backup3/TCGA-BRCA_nempi.rda") M <- Pmut patients.mut <- unlist(lapply(colnames(M), function(x) { y <- paste(unlist(strsplit(x, "\\-"))[1:3], collapse = "-") return(y) })) snv_matrix <- M D <- readRDS("~/Documents/mutclust/TCGA-BRCA_counts.rds") DT <- D$T DN <- D$N patients.exprs <- unlist(lapply(colnames(DT), function(x) { y <- paste(unlist(strsplit(x, "\\-"))[1:3], collapse = "-") return(y) })) # colnames(DN) <- unlist(lapply(colnames(DN), function(x) { # y <- paste(unlist(strsplit(x, "\\-"))[1:3], collapse = "-") # return(y) # })) DTN <- cbind(DT, DN) samples <- colnames(DT)[which(patients.exprs %in% patients.mut)] expression_matrix <- DTN sample_origins <- c(rep("tumor",ncol(DT)),rep("normal",ncol(DN))) colnames(expression_matrix) <- unlist(lapply(colnames(expression_matrix), function(x) { y <- paste(unlist(strsplit(x, "\\-"))[1:4], collapse = "-") y <- unlist(strsplit(y, "")) y <- paste(y[-length(y)], collapse = "") return(y) })) library('biomaRt') mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) genesymbols <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"), values=rownames(expression_matrix), mart= mart) rownames(expression_matrix) <- genesymbols[match(rownames(expression_matrix), genesymbols[, 1]), 2] library(PRODIGY) # Load STRING network data data(STRING_network) network = STRING_network # Get differentially expressed genes (DEGs) for all samples expression_matrix = expression_matrix[which(rownames(expression_matrix) %in% unique(c(network[,1],network[,2]))),] library(DESeq2) DEGs = get_DEGs(expression_matrix,samples,sample_origins=sample_origins,beta=2,gamma=0.05) names(DEGs) <- unlist(lapply(names(DEGs), function(x) { y <- paste(unlist(strsplit(x, "\\-"))[1:4], collapse = "-") y <- unlist(strsplit(y, "")) y <- paste(y[-length(y)], collapse = "") return(y) })) # Run PRODIGY samples <- colnames(snv_matrix) all_patients_scores = PRODIGY_cohort(snv_matrix,expression_matrix,network=network,samples=samples,DEGs=DEGs,alpha=0.05,pathwayDB="reactome",num_of_cores=1,sample_origins=sample_origins,write_results = TRUE, results_folder = "prodigy/",beta=2,gamma=0.05,delta=0.05) tmp <- PRODIGY(snv_matrix,expression_matrix,network, sample=names(DEGs)[1],DEGs[[1]], sample_origins=sample_origins, write_results = TRUE, results_folder = "prodigy/") # Get driver gene rankings for all samples results = analyze_PRODIGY_results(all_patients_scores) ## try dawnrank: library(DawnRank) DT2 <- DT colnames(DT2) <- unlist(lapply(colnames(DT2), function(x) { y <- paste(unlist(strsplit(x, "\\-"))[1:4], collapse = "-") y <- unlist(strsplit(y, "")) y <- paste(y[-length(y)], collapse = "") return(y) })) DT2 <- DT2[,which(colnames(DT2) %in% colnames(M))] M <- M[,which(colnames(M) %in% colnames(DT2))] DT2 <- DT2[,colnames(M)] DT2.log2 <- log2(DT2 + 1) DN.log2 <- log2(DN + 1) file <- "~/Documents/testing/nempi/other/nempi_dawnrank_data.rds" if (file.exists(file)) { normalizedDawn <- readRDS(file) } else { normalizedDawn <- DawnNormalize(tumorMat = DT2.log2, normalMat = DN.log2) saveRDS(normalizedDawn, file = file) } n <- length(unique(as.vector(network[,1:2]))) pathway <- matrix(0, n, n) rownames(pathway) <- colnames(pathway) <- sort(unique(as.vector(network[,1:2]))) a <- match(network[,1],rownames(pathway)) b <- match(network[,2],rownames(pathway)) pathway[cbind(a,b)] <- 1 library('biomaRt') mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) genesymbols <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"), values=rownames(normalizedDawn), mart= mart) rownames(normalizedDawn) <- genesymbols[match(rownames(normalizedDawn), genesymbols[, 1]), 2] tmp <- which(rownames(M) %in% rownames(normalizedDawn)) M <- M[tmp,] tmp <- which(rownames(normalizedDawn) %in% rownames(M)) normalizedDawn <- normalizedDawn[tmp,] normalizedDawn <- rowsum(normalizedDawn, rownames(normalizedDawn))/as.numeric(table(rownames(normalizedDawn))) tmp <- rownames(M)[which(!(rownames(M) %in% rownames(pathway)))] pathway2 <- matrix(0, length(tmp)+nrow(pathway),length(tmp)+ncol(pathway)) pathway2[1:nrow(pathway),1:ncol(pathway)] <- pathway rownames(pathway2) <- colnames(pathway2) <- c(rownames(pathway),tmp) tmp <- which(colnames(M) %in% colnames(normalizedDawn)) M <- M[,tmp] M <- M[rownames(M),colnames(M)] normalizedDawn <- normalizedDawn[rownames(M),colnames(M)] pathway2 <- pathway2[rownames(M),rownames(M)] my.genes <- c("TBX3","MAP2K4","GATA3","CBFB","NCOR1","PTPRD","GPS2","CDKN1B") ## load("~/Documents/testing/nempi/other/Dawnrank_temp.rda") file <- "~/Documents/testing/nempi/other/nempi_dawnrank.rds" if (!file.exists(file)) { # get the DawnRank Score Get some coffee, this might take a while! dawnRankScore <- DawnRank(adjMatrix = pathway2, mutationMatrix = M, expressionMatrix = normalizedDawn, mu = 3, parallel = NULL) saveRDS(dawnRankScore,file=file) } else { dawnRankScore <- readRDS(file) } rank.mat <- matrix(NA, length(my.genes), ncol(M)) rownames(rank.mat) <- my.genes colnames(rank.mat) <- colnames(M) for (gene in 1:length(my.genes)) { idx <- which(rownames(dawnRankScore[[2]]) == my.genes[gene]) if (length(idx) > 0) { rank.mat[gene,] <- dawnRankScore[[2]][idx,] } } Msub <- M[which(rownames(M) %in% my.genes),] Msub <- Msub[rownames(rank.mat),] rank.mat.sub <- rank.mat[,which(apply(Msub, 2, sum) != 0)] Msub <- Msub[,which(apply(Msub, 2, sum) != 0)] cut <- 90 TP <- sum(rank.mat.sub > cut & Msub == 1) FP <- sum(rank.mat.sub > cut & Msub == 0) TN <- sum(rank.mat.sub <= cut & Msub == 0) FN <- sum(rank.mat.sub <= cut & Msub == 1) TP/(TP+FP) TP/(TP+FN) gamsave <- rank.mat.sub/100 Gamma <- Msub auc <- roc <- 0 ppv <- rec <- spec <- NULL for (cut in c(2,seq(1,0, length.out = 100),-1)) { gamtmp <- apply(gamsave, 2, function(x) { y <- x*0 y[which(x > cut)] <- 1 return(y) }) gamtmp2 <- Gamma tp <- sum(gamtmp == 1 & gamtmp2 == 1) fp <- sum(gamtmp == 1 & gamtmp2 == 0) tn <- sum(gamtmp == 0 & gamtmp2 == 0) fn <- sum(gamtmp == 0 & gamtmp2 == 1) ppvtmp <- tp/(tp+fp) if (is.na(ppvtmp)) { ppvtmp <- 0.5 } rectmp <- tp/(tp+fn) spectmp <- 1-tn/(tn+fp) if (length(ppv) > 0) { auc <- auc + (rectmp-rec[length(rec)])*(ppvtmp+ppv[length(ppv)])/2 roc <- roc + (spectmp-spec[length(spec)])*(rectmp+rec[length(rec)])/2 } ppv <- c(ppv, ppvtmp) rec <- c(rec, rectmp) spec <- c(spec, spectmp) } par(mfrow=c(1,3)) plot(ppv) plot(rec) plot(spec) pdf("~/Documents/temp.pdf", width = 16, height = 8) tmp <- rank.mat.sub colnames(tmp) <- NULL csc <- numeric(ncol(tmp)) csc[which(apply(Msub, 2, sum) != 0)] <- 4 epiNEM::HeatmapOP(tmp/100, breaks=seq(0,1,length.out=100), col="RdBu", bordercol="transparent", colSideColors=csc) dev.off() ## check with mutation matrix: if (file.exists("nempi_dawnrank_agg.rds")) { aggregateDawnRankScore <- condorcetRanking(scoreMatrix =dawnRankScore[[2]],mutationMatrix = M,parallel = NULL) saveRDS(aggregateDawnRankScore,file="nempi_dawnrank_agg.rds") } else { aggregateDawnRankScore <- readRDS("nempi_dawnrank_agg.rds") } tmp <- aggregateDawnRankScore[[2]] tmp[which(names(tmp) %in% my.genes)] ## process mutations for 2020 plus (makes no sense?): mutSave <- mut for (i in 1:4) { mut[[i]] <- as.data.frame(mut[[i]]) } m2020 <- do.call("rbind",mut) write.table(m2020, file = "~/Documents/2020plus/data/brca.txt") blad <- read.delim("~/Documents/2020plus/data/bladder.txt") brca <- read.table("~/Documents/2020plus/data/brca.txt") colnames(brca)[which(colnames(brca) == "Gene")] <- "EnsembleId" colnames(brca)[which(colnames(brca) == "SYMBOL")] <- "Gene" colnames(brca)[which(colnames(brca) == "Tumor_Sample_Barcode")] <- "Tumor_Sample" colnames(brca)[which(colnames(brca) == "Tumor_Seq_Allele1")] <- "Tumor_Allele" colnames(brca)[which(colnames(brca) == "HGVSp")] <- "Protein_Change" colnames(brca)[which(colnames(brca) == "HGVSc")] <- "DNA_Change" brca <- data.frame(brca, Tumor_Type = rep("Breast Cancer", nrow(brca))) # "Symbol" "Tumor_Sample_Barcode" "?" "Chromosome" "Start_Position" "End_Position" "Variant_Classification" "Reference_Allele" "Allele/Tumor_Seq_Allele1/2!!!" "HGVSp/HGVSp_Short" "HGVSc" brca <- brca[,which(colnames(brca) %in% colnames(blad))] brca <- brca[,colnames(blad)] brca[,"Tumor_Sample"] <- unlist(lapply(brca[,"Tumor_Sample"], function(x) { y <- paste(unlist(strsplit(x,"-"))[1:3],collapse="-") return(y) })) brca$Start_Position <- as.numeric(brca$Start_Position) brca$End_Position <- as.numeric(brca$End_Position) ## brca_save <- brca tmp <- which(table(brca$Start_Position) >= 3) brca <- brca[which(brca$Start_Position %in% names(tmp)),] brca <- brca[!duplicated(brca$Start_Position),] write.table(brca, file = "~/Documents/2020plus/data/brca_reduced.txt", quote = FALSE, sep = "\t") ## 2020+ rm error.txt rm output.txt ram=32000 # data: 64gb, wilcox sc: 8gb for 32 nodes, edger: 64gb, voom: 64gb queue=24 cores=1 bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" 'snakemake -s Snakefile pretrained_predict -p --cores 1 \ --config mutations="data/brca.txt" output_dir="brca/" trained_classifier="data/2020plus_100k.Rdata"' ## read results: res2020 <- read.delim("~/Mount/Eulershare/2020plus/2020plus/brca/pretrained_output/results/r_random_forest_prediction.txt")