library(snowfall) library(TCGAbiolinks) if (is.null(types)) { types <- TCGAbiolinks:::getGDCprojects()$project_id ## special "FM-AD" dont <- paste0(unique(gsub("-.*", "", types)), "-") dont <- dont[-grep("TCGA", dont)] samplenr <- matrix(NA, length(types), 2) rownames(samplenr) <- types donr <- TRUE snrcount <- 0 } else { dont <- "AGCT" donr <- FALSE } sizemat <- matrix(0, 1, 2) colnames(sizemat) <- c("Tumor", "Normal") rownames(sizemat) <- "" path <- "mutclust/" for (type in types) { if (donr) { snrcount <- snrcount + 1 } if (length(grep(paste(dont, collapse = "|"), type)) > 0) { next() } print(type) if (file.exists(paste0(path, type, "_final.rda")) & !newmut & !newllr & !newsave) { load(paste0(path, type, "_final.rda")) } else { summ <- TCGAbiolinks:::getProjectSummary(type) library(SummarizedExperiment) ## get methylation: if (file.exists(paste0(path, type, "_meth.rda"))) { load(paste0(path, type, "_meth.rda")) } else { data <- GDCquery(project = paste(type, sep = ""), sample.type = "Primary Tumor", data.category = "DNA methylation", data.type = "Methylation beta value", legacy = TRUE ) GDCdownload(data) data <- GDCprepare(data, summarizedExperiment = 0) ## map methy sites to genes: library(methyAnalysis) if (is.data.frame(data)) { meth2 <- as.matrix(data[, -(1:3)]) rownames(meth2) <- gsub(";.*", "", data[, 1]) } else { meth <- data@rowRanges library(TxDb.Hsapiens.UCSC.hg19.knownGene) probe2gene <- annotateDMRInfo(meth, 'TxDb.Hsapiens.UCSC.hg19.knownGene') meth2 <- assay(data) rownames(meth2) <- probe2gene$sigDMRInfo@elementMetadata@listData$GeneSymbol } meth <- meth2 meth <- meth[which(apply(meth, 1, function(x) return(any(is.na(x)))) == FALSE), ] methm <- meth[which(duplicated(rownames(meth)) == FALSE), ] count <- 0 for (i in which(duplicated(rownames(meth)) == FALSE)) { count <- count + 1 methm[count, ] <- apply(meth[which(rownames(meth) %in% rownames(methm)[i]), , drop = FALSE], 2, median) } meth <- methm meth <- meth[order(rownames(meth)), order(colnames(meth))] colnames(meth) <- gsub("A$", "", lapply(strsplit(colnames(meth), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) })) methm <- meth[, which(duplicated(colnames(meth)) == FALSE)] for (i in which(duplicated(colnames(meth)) == TRUE)) { j <- which(colnames(methm) == colnames(meth)[i]) methm[, j] <- apply(meth[, which(colnames(meth) %in% colnames(methm)[i]), drop = FALSE], 2, median) } meth <- methm meth <- meth[order(rownames(meth)), order(colnames(meth))] save(meth, file = paste0(path, type, "_meth.rda")) } print("meth done") ## get copy number variation: if (file.exists(paste0(path, type, "_cnv.rda"))) { load(paste0(path, type, "_cnv.rda")) } else { data <- getGistic(gsub("TCGA-", "", type), type = "thresholded") ## data <- GDCquery(project = paste(type, sep = ""), ## sample.type = "Primary solid Tumor", ## data.category = "Copy Number Variation", ## data.type = "Copy Number Segment", ## ) ## GDCdownload(data) ## data <- GDCprepare(data) cnv <- data[, -(1:3)] cnv <- apply(cnv, c(1,2), as.numeric) rownames(cnv) <- data[, 1] colnames(cnv) <- gsub("A$", "", lapply(strsplit(colnames(cnv), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) })) cnv <- cnv[order(rownames(cnv)), order(colnames(cnv))] save(cnv, file = paste0(path, type, "_cnv.rda")) } print("cnv done") ## get expression if (file.exists(paste0(path, type, "_query.rda"))) { load(paste0(path, type, "_query.rda")) } else { if (length(grep("-AML$|-LAML$", type)) > 0) { sampletype <- c("Primary Blood Derived Cancer - Peripheral Blood", "Solid Tissue Normal") } else { sampletype <- c("Primary solid Tumor", "Solid Tissue Normal") } data <- GDCquery(project = paste(type, sep = ""), sample.type = sampletype, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts" ) GDCdownload(data) if (is.null(data)) { print("is null") next() } data <- GDCprepare(data) save(data, file = paste0(path, type, "_query.rda")) } ## process expression data: D <- assay(data) class <- data@colData@listData$definition print("gene expression done") ## get mutation if (file.exists(paste0(path, type, "_mut.rda"))) { load(paste0(path, type, "_mut.rda")) } else { type2 <- gsub(paste(paste0(unique(gsub("-.*", "", types)), "-"), collapse = "|"), "", type) library(data.table) GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "varscan2") # mutation file GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "muse") # mutation file GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "somaticsniper") # mutation file GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "mutect2") # mutation file mut <- list() count <- 1 type3 <- gsub("-", "\\.", type) for (i in list.files("GDCdata")) { if (length(grep(type3, i)) > 0 & length(grep("csv", i)) > 0) { mut[[count]] <- fread(paste0("GDCdata/", i)) count <- count + 1 } } save(mut, file = paste0(path, type, "_mut.rda")) } ## clinical if (file.exists(paste0(path, type, "_clin.rda"))) { load(paste0(path, type, "_clin.rda")) } else { clinical <- GDCquery_clinic(project = type, type = "clinical") save(clinical, file = paste0(path, type, "_clin.rda")) } ## process mutation data: (https://cancer.sanger.ac.uk/cosmic/help/mutation/overview) if (file.exists(paste0(path, type, "_mut0.rda")) & !newmut) { load(paste0(path, type, "_mut0.rda")) } else { n <- nmut # try something to get all patients with at least one mutation library(snowfall) countSamples <- function(i, mut.org, genes, mut.mat, coln) { i <- which(rownames(mut.mat) %in% genes[i]) tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(mut.mat)[i]), coln] tmp2 <- mut.mat[i, ] tmp2[which(colnames(mut.mat) %in% tmp)] <- 1 return(tmp2) } typeSamples <- function(i, mut.org, genes, type.mat, coln, coln2) { i <- which(rownames(type.mat) %in% genes[i]) tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln] tmp3 <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln2] tmp2 <- type.mat[i, ] tmp2[which(colnames(type.mat) %in% tmp)] <- tmp3 return(tmp2) } biggl <- list() for (i in length(mut)) { mutation <- mut[[i]] biggl[[i]] <- mutation$Hugo_Symbol } freq <- sort(table(unlist(biggl)), decreasing = TRUE) if (n == 0) { allsub <- names(freq) } else { allsub <- names(freq)[1:n] } M <- Mtype <- list() for (i in 1:length(mut)) { mutation <- mut[[i]] mut.mat <- matrix(0, length(allsub), length(unique(mutation$Tumor_Sample_Barcode))) type.mat <- matrix("", length(allsub), length(unique(mutation$Tumor_Sample_Barcode))) colnames(type.mat) <- colnames(mut.mat) <- sort(unique(mutation$Tumor_Sample_Barcode)) rownames(type.mat) <- rownames(mut.mat) <- allsub coln <- which(colnames(mutation) %in% "Tumor_Sample_Barcode") coln2 <- which(colnames(mutation) %in% "Variant_Classification") mut.org <- mutation[which(mutation$Hugo_Symbol %in% allsub), ] sfInit(parallel = T, cpus = 4) sfExport("mut.mat", "coln") tmp <- sfLapply(as.list(1:length(allsub)), countSamples, mut.org, allsub, mut.mat, coln) sfStop() tmp <- do.call("rbind", tmp) rownames(tmp) <- allsub colnames(tmp) <- colnames(mut.mat) M[[i]] <- tmp sfInit(parallel = T, cpus = 4) sfExport("type.mat", "coln2") tmp <- sfLapply(as.list(1:length(allsub)), typeSamples, mut.org, allsub, type.mat, coln, coln2) sfStop() tmp <- do.call("rbind", tmp) rownames(tmp) <- allsub colnames(tmp) <- colnames(mut.mat) Mtype[[i]] <- tmp } samples <- intersect(intersect(intersect(colnames(M[[1]]), colnames(M[[2]])), colnames(M[[3]])), colnames(M[[4]])) M0 <- M[[1]][, which(colnames(M[[1]]) %in% samples)] Mtype0 <- Mtype[[1]][, which(colnames(Mtype[[1]]) %in% samples)] for (i in 2:length(M)) { M0 <- M0 + M[[i]][, which(colnames(M[[i]]) %in% samples)] Mtype0 <- matrix(paste(Mtype0, Mtype[[i]][, which(colnames(Mtype[[i]]) %in% samples)]), nrow(Mtype0)) } rownames(Mtype0) <- rownames(M0) colnames(Mtype0) <- colnames(M0) save(M0, Mtype0, file = paste0(path, type, "_mut0.rda")) } ## process expression data: D <- assay(data) class <- data@colData@listData$definition M <- M0 Mtype <- Mtype0 colnames(M) <- lapply(colnames(M), function(x) { y <- unlist(strsplit(x, "-")) y <- paste(y[1:4], collapse = "-") y <- unlist(strsplit(y, "")) y <- paste(y[1:(length(y)-1)], collapse = "") return(y) }) colnames(D) <- lapply(colnames(D), function(x) { y <- unlist(strsplit(x, "-")) y <- paste(y[1:4], collapse = "-") y <- unlist(strsplit(y, "")) y <- paste(y[1:(length(y)-1)], collapse = "") return(y) }) colnames(M) <- gsub("A$", "", lapply(strsplit(colnames(M), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) })) M <- M[order(rownames(M)), order(colnames(M))] Mtype <- Mtype[order(rownames(Mtype)), order(colnames(Mtype))] print("mutation done") ## log odds: if (file.exists(paste0(path, type, "_llr.rda")) & !newllr) { load(paste0(path, type, "_llr.rda")) } else { library(edgeR) if (sum(class %in% "Solid Tissue Normal") < 10) { distrParNC <- function(i, data) { data[i, ] <- data[i, ] - median(data[, i]) llrcol <- numeric(ncol(data)) div0 <- quantile(data[i, ], 0.25) div1 <- quantile(data[i, ], 0.75) sigma <- sd(data[i, ]) for (j in 1:ncol(data)) { if (data[i, j] <= 0) { llrcol[j] <- log2(div0/data[i, j]) } else { llrcol[j] <- log2(data[i, j]/div1) } } return(llrcol) } highcounts <- which(apply(D, 1, median) >= 10) DC <- D[highcounts, ] genenames <- rownames(D)[highcounts, ] nf <- calcNormFactors(DC) DC <- t(t(DC)/nf) # DC <- DC2 ## this is very adventurous: ## DC <- t(scale(t(DC))) ## DC <- abs(DC) ## pc <- 1-min(DC) ## tmp <- log2(DC+pc) ## hist(tmp) sfInit(parallel = T, cpus = 4) sfExport("DC") tmp <- do.call("rbind", sfLapply(1:nrow(DC), distrParNC, DC)) colnames(tmp) <- colnames(DC) rownames(tmp) <- genenames sfStop() DF <- DC } else { library(ks) mykcde <- function(x, k) { a <- which.min(abs(k$eval.points - x)) b <- k$estimate[a] b <- min(b, 1-b) return(b) } distrParKs <- function(i, data, C) { llrcol <- numeric(ncol(data)) ddistr <- list() dogenes <- unique(colnames(data))[which(!(unique(colnames(data)) %in% ""))] for (j in dogenes) { D <- which(colnames(data) %in% j) ddistr[[j]] <- kcde(data[i, D]) } cdistr <- kcde(data[i, C]) for (j in which(!(colnames(data) %in% ""))) { gene <- colnames(data)[j] llrcol[j] <- log2(mykcde(data[i, j], ddistr[[gene]])/mykcde(data[i,j], cdistr)) } llrcol <- llrcol[-C] return(llrcol) } DN <- D[, which(class %in% "Solid Tissue Normal")] DT <- D[, which(class %in% "Primary solid Tumor")] DF <- cbind(DT, DN) C <- (ncol(DT)+1):ncol(DF) highcounts <- which(apply(DF, 1, median) >= 10) DF <- DF[highcounts, ] genenames <- rownames(DF) colnames(DF)[1:ncol(DT)] <- "P" # not knock-down specific! colnames(DF)[grep(paste(unique(gsub("-.*", "", types)), collapse = "|"), colnames(DF))] <- "" nf <- calcNormFactors(DF) DF <- t(t(DF)/nf) sfInit(parallel = T, cpus = 4) sfExport("DF", "C", "mykcde") sfLibrary(ks) tmp <- do.call("rbind", sfLapply(1:nrow(DF), distrParKs, DF, C)) sfStop() colnames(tmp) <- colnames(DT) } save(tmp, DF, file = paste0(path, type, "_llr.rda")) } rownames(tmp) <- rownames(DF) par(mfrow=c(1,3)) hist(tmp) tmp[which(is.na(tmp) | is.infinite(tmp))] <- 0 hist(tmp) D <- tmp print("expression done") ## sd.glob <- sd(tmp) ## tmp <- tmp[-which(apply(tmp, 1, sd) < sd.glob), ] ## hist(tmp) ## prep clinical data: clinical[which(clinical$vital_status%in% "dead"), which(colnames(clinical) %in% "vital_status")] <- 1 clinical[which(clinical$vital_status%in% "alive"), which(colnames(clinical) %in% "vital_status")] <- 0 count <- 0 for (stage in sort(unique(clinical$tumor_stage))) { clinical[which(clinical$tumor_stage%in% stage), which(colnames(clinical) %in% "tumor_stage")] <- count count <- count + 1 } clinical$tumor_stage <- as.numeric(clinical$tumor_stage) clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_death")] <- clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_last_follow_up")] clinical$vital_status<- as.numeric(clinical$vital_status) print("clinical done") save(clinical, D, M, Mtype, DF, class, meth, cnv, file = paste0(path, type, "_final.rda")) } print(table(class)) sizemat <- rbind(sizemat, table(class)) rownames(sizemat)[nrow(sizemat)] <- type if (donr) { samplenr[snrcount, 1] <- sum(class %in% "Primary solid Tumor") samplenr[snrcount, 2] <- sum(class %in% "Solid Tissue Normal") } } stop("done") samplenr2 <- samplenr[-which(is.na(samplenr[, 1]) == TRUE), ] barplot(t(samplenr2[order(apply(samplenr2, 1, sum)), ]), horiz = 1, space = 1, las = 2) newllr <- 1; newmut <- 1; nmut <- 1; newsave <- 1; types <- c("TCGA-SKCM","TCGA-UVM"); source("~/Documents/testing/general/TCGA.r") nonesolid <- c("TCGA-LAML") solidnonormal <- c() ## analysis: type <- "TCGA-BRCA" path <- "mutclust/" load(paste0(path, type, "_final.rda")) M[which(M < 3)] <- 0 M[which(M > 0)] <- 1 M[grep("Silent", Mtype)] <- 0 M <- M[order(rownames(M)), order(colnames(M))] cnv[which(cnv == 0)] <- 0 cnv[which(cnv != 0)] <- 1 cnv <- cnv[order(rownames(cnv)), order(colnames(cnv))] meth[which(meth > 0.5)] <- 1 meth[which(meth <= 0.5)] <- 0 meth <- meth[order(rownames(meth)), order(colnames(meth))] meth[is.na(meth)] <- 0 P <- matrix(0, length(unique(c(rownames(M), rownames(cnv), rownames(meth)))), length(unique(c(colnames(M), colnames(cnv), colnames(meth))))) rownames(P) <- sort(unique(c(rownames(M), rownames(cnv), rownames(meth)))) colnames(P) <- sort(unique(c(colnames(M), colnames(cnv), colnames(meth)))) colnames(P) <- gsub("A$", "", lapply(strsplit(colnames(P), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) })) P <- P[, which(duplicated(colnames(P)) == FALSE)] P[which(rownames(P) %in% rownames(M)), which(colnames(P) %in% colnames(M))] <- M[which(rownames(M) %in% rownames(P)), which(colnames(M) %in% colnames(P))] Pmut <- P P <- Pmut*0 P[which(rownames(P) %in% rownames(meth)), which(colnames(P) %in% colnames(meth))] <- P[which(rownames(P) %in% rownames(meth)), which(colnames(P) %in% colnames(meth))] + meth[which(rownames(meth) %in% rownames(P)), which(colnames(meth) %in% colnames(P))] Pmeth <- P P <- Pmeth*0 P[which(rownames(P) %in% rownames(cnv)), which(colnames(P) %in% colnames(cnv))] <- P[which(rownames(P) %in% rownames(cnv)), which(colnames(P) %in% colnames(cnv))] + cnv[which(rownames(cnv) %in% rownames(P)), which(colnames(cnv) %in% colnames(P))] Pcnv <- P P <- Pmut+Pmeth+Pcnv P2 <- P # full abberations including cnv and meth P <- Pmut ## Bailey et al Cell 2018 goi <- c("MAP2K4", "GATA3", "GPS2", "TBX3", "PTPRD", "NCOR1", "CBFB", "CDKN1B") # BRCA P <- P[which(rownames(P) %in% goi), ] P[which(P > 1)] <- 1 P <- apply(P, 2, function(x) return(x/sum(x))) P[is.na(P)] <- 0 ## data imputation: library(naturalsort) library(nem) library(cluster) library(Rcpp) library(Rgraphviz) library(mnem) source("~/Documents/mnem/R/mnems.r") source("~/Documents/mnem/R/mnems_low.r") sourceCpp("~/Documents/mnem/src/mm.cpp") source("~/Documents/nempi/R/nempi_main.r") source("~/Documents/nempi/R/nempi_low.r") Rho <- cbind(P, matrix(0, nrow(P), sum(!(colnames(D) %in% colnames(P))))) colnames(Rho) <- c(colnames(P), colnames(D)[which(!(colnames(D) %in% colnames(P)))]) Rho <- Rho[, colnames(D)] if (sum(apply(Rho, 1, sum) == 0) > 0) { Rho <- Rho[-which(apply(Rho, 1, sum) == 0), ] } Rho[is.na(Rho)] <- 0 sum(apply(Rho, 2, sum) == 0)/ncol(Rho) # unlabelled pdf("temp.pdf", width = 12, height = 6) tmp <- Rho colnames(tmp) <- NULL epiNEM::HeatmapOP(tmp, col = "RdBu", Rowv = 0, bordercol = "transparent") dev.off() inc <- sort(apply(Rho, 1, sum)) D2 <- D[which(apply(D, 1, median) != 0), ] D3 <- D2[, which(duplicated(colnames(D2)) == FALSE)] Rho <- Rho[, which(duplicated(colnames(D2)) == FALSE)] for (i in which(duplicated(colnames(D2)) == TRUE)) { j <- which(colnames(D3) %in% colnames(D2)[i]) D3[, j] <- apply(D2[, which(colnames(D2) %in% colnames(D2)[i]), drop = FALSE], 1, median) } D2 <- D3 colnames(D2) <- c(rownames(Rho), sample(rownames(Rho), ncol(D2)-nrow(Rho), replace = TRUE)) converged <- 10 start <- Sys.time() nempires <- nempi(D2, Gamma = Rho, full = TRUE, converged = converged) end <- Sys.time() print(end - start) ures <- nempires sum(ures$lls[2:length(ures$lls)] - ures$lls[1:(length(ures$lls)-1)] < 0) pdf("temp.pdf", width = 12, height = 6) epiNEM::HeatmapOP(ures$Gamma, bordercol = rgb(0,0,0,0), col = "RdBu") #plot(ures, edgewidth = 30) dev.off() pdf("temp.pdf", width = 9, height = 6) par(mfrow=c(2,3)) plotConvergence(ures, type = "b", col = "blue") dev.off() source("~/Documents/nempi/R/nempi_main.r") D4 <- D2 colnames(D4) <- apply(Rho, 2, function(x) { Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_") return(Sgenes) }) ## run on hpc cluster: path <- "" type <- "TCGA-BRCA" if (do == 1) { library(e1071) load(paste0(path, type, "_nempi.rda")) D4 <- D2 colnames(D4) <- apply(Rho, 2, function(x) { Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_") return(Sgenes) }) svmres <- classpi(D4, full = TRUE, method = "svm") save(svmres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda")) } if (do == 2) { library(nnet) load(paste0(path, type, "_nempi.rda")) D4 <- D2 colnames(D4) <- apply(Rho, 2, function(x) { Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_") return(Sgenes) }) nnres <- classpi(D4, full = TRUE, method = "nnet", MaxNWts = 50000, size = 5) # takes forever save(nnres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda")) } if (do == 3) { library(CALIBERrfimpute) load(paste0(path, type, "_nempi.rda")) D4 <- D2 colnames(D4) <- apply(Rho, 2, function(x) { Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_") return(Sgenes) }) mfdata <- cbind(as.data.frame(t(D4)), colnames(D4)) mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA micedata <- mfdata colnames(micedata) <- paste0(LETTERS[1:ncol(micedata)], 1:ncol(micedata)) miceres <- mice(micedata, method = c(rep('rfcont', ncol(micedata)-1), 'rfcat'), m = 2, maxit = 2) save(miceres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda")) } if (do == 4) { library(e1071) load(paste0(path, type, "_nempi.rda")) D4 <- D2 colnames(D4) <- apply(Rho, 2, function(x) { Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_") return(Sgenes) }) rfres <- classpi(D4, full = TRUE, method = "randomForest") save(rfres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda")) } if (do == 5) { library(e1071) load(paste0(path, type, "_nempi.rda")) D4 <- D2 colnames(D4) <- apply(Rho, 2, function(x) { Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_") return(Sgenes) }) mfdata <- cbind(as.data.frame(t(D4)), colnames(D4)) mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA library(missForest) mfimp <- missForest(mfdata) D4 <- D2 colnames(D4) <- mfimp$ximp[, ncol(mfimp$ximp)] tmp <- mynem(D4, multi = TRUE) Gamma <- getGamma(D4) ures <- list() ures$Gamma <- apply(Gamma, 2, function(x) return(x/sum(x))) ures$res <- list() ures$res$adj <- tmp$adj ures$null <- TRUE ures$combi <- 1 mfres <- ures save(mfres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda")) } ## knn library(class) train <- t(D4[, which(colnames(D4) != "")]) test <- t(D4[, which(colnames(D4) == "")]) knn0 <- 0 if (knn0) { train <- rbind(train, NULL = rep(0, ncol(train))) cl <- c(colnames(D4)[which(colnames(D4) != "")], "NULL") } else { cl <- colnames(D4)[which(colnames(D4) != "")] } knnres <- knn(train, test, cl, prob = TRUE) D3 <- D4 colnames(D3)[which(colnames(D3) %in% "")] <- as.character(knnres) tmp <- mynem(D3, multi = TRUE) Gamma <- getGamma(D3) ures <- list() ures$Gamma <- Gamma # apply(Gamma, 2, function(x) return(x/sum(x))) ures$res <- list() ures$res$adj <- tmp$adj ures$null <- TRUE ures$combi <- 1 knnres <- ures ## save(nempires, knnres, rfres, mfres, svmres, nnres, Rho, D2, Pmut, Pmeth, Pcnv, file = paste0(path, type, "_nempi.rda")) path <- "mutclust/"; type <- "TCGA-BRCA" load(paste0(path, type, "_nempi.rda")) ## ## load("~/Mount/Euler/temp_1_1573814974.40659.rda") # old ## load("~/Mount/Euler/temp_1_1574076694.65703.rda") ## ## load("~/Mount/Euler/temp_4_1573819581.9749.rda") # old ## load("~/Mount/Euler/temp_4_1574080528.67352.rda") ## ## load("~/Mount/Euler/temp_2_1573821112.2412.rda") # old ## load("~/Mount/Euler/temp_2_1574084503.12431.rda") ## load("~/Mount/Euler/temp_5_1574081415.91547.rda") ## ures <- rfres ## ures <- mfres ## ures <- svmres ## ures <- nnres ## ures <- knnres ures <- nempires ## check against methylation and cnvs: pdf("nempi_gamma.pdf", width = 12, height = 6) tmp <- ures$Gamma colnames(tmp) <- NULL epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL) dev.off() pdf("nempi_phi.pdf", width = 6, height = 6) Pgenes <- sort(unique(colnames(D2))) adjtmp <- ures$res$adj colnames(adjtmp) <- rownames(adjtmp) <- Pgenes plotDnf(adjtmp, edgelwd = 2) dev.off() ## cnv/meth enrichment: methods <- list("NEM$\\pi$" = nempires, knn = knnres, mf = mfres, nn = nnres, rf = rfres, svm = svmres) mutinc <- 1 Lall <- Lcnv <- Lmeth <- Lmut <- list() for (i in 1:length(methods)) { if (i != 8) { print(names(methods)[i]) ures <- methods[[i]] newGamma <- ures$Gamma } else { print("random") newGamma <- newGamma*0 newGamma[sample(1:length(newGamma), floor(0.45*length(newGamma)))] <- 1 # well that is included into the test... } hist(newGamma) if (i == 1) { rntmp <- rownames(newGamma); newGamma <- t(mytc(ures$res$adj))%*%newGamma; rownames(newGamma) <- rntmp } P <- Pmut+Pmeth+Pcnv if (!mutinc) { # include mutations or not (0) P[which(Pmut == 1)] <- 0 } P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))] P <- P[order(rownames(P)), order(colnames(P))] P[which(P > 1)] <- 1 Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P))))) colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))]) P <- Ptmp[, colnames(Rho)] ## fisher: if (i %in% c(4,5)) { cut <- 0.07 } else if (i == 6) { cut <- 0.1 } else { cut <- 1/8 } newGamma[which(newGamma > cut)] <- 1 newGamma[which(newGamma <= cut)] <- 0 pmeth <- newGamma pmeth[which(newGamma == 1 & P == 1)] <- 2 pmeth[which(newGamma == 0 & P == 1)] <- -2 if (!mutinc) { # include mutations or not (0) pmeth[which(Rho > 0)] <- 0 } colnames(pmeth) <- NULL ##pdf(paste0("FigS_", names(methods)[i], ".pdf"), height = 6, width = 12) setEPS() postscript(paste0("FigS_", names(methods)[i], ".eps"), height = 6, width = 12) print(epiNEM::HeatmapOP(pmeth, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL)) dev.off() print("cnv") P <- Pcnv P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))] P <- P[order(rownames(P)), order(colnames(P))] P[which(P > 1)] <- 1 Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P))))) colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))]) P <- Ptmp[, colnames(Rho)] F <- matrix(c(sum(pmeth >= 1 & P == 1), sum(pmeth >= 1 & P == 0), sum(pmeth == -2 & P == 1), sum(pmeth == 0 & P == 0)), 2) print(1 - phyper(F[1,1]-1, sum(F[, 1]), sum(F[, 2]), sum(F[1, ]))) Lcnv[[i]] <- F print("meth") P <- Pmeth P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))] P <- P[order(rownames(P)), order(colnames(P))] P[which(P > 1)] <- 1 Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P))))) colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))]) P <- Ptmp[, colnames(Rho)] F <- matrix(c(sum(pmeth >= 1 & P == 1), sum(pmeth >= 1 & P == 0), sum(pmeth == -2 & P == 1), sum(pmeth == 0 & P == 0)), 2) print(1 - phyper(F[1,1]-1, sum(F[, 1]), sum(F[, 2]), sum(F[1, ]))) Lmeth[[i]] <- F print("mut") P <- Pmut P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))] P <- P[order(rownames(P)), order(colnames(P))] P[which(P > 1)] <- 1 Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P))))) colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))]) P <- Ptmp[, colnames(Rho)] F <- matrix(c(sum(pmeth >= 1 & P == 1), sum(pmeth >= 1 & P == 0), sum(pmeth == -2 & P == 1), sum(pmeth == 0 & P == 0)), 2) print(1 - phyper(F[1,1]-1, sum(F[, 1]), sum(F[, 2]), sum(F[1, ]))) Lmut[[i]] <- F Fmat <- matrix(c(sum(pmeth == 2), sum(pmeth == 1), sum(pmeth == -2), sum(pmeth == 0)), 2) Lall[[i]] <- Fmat ## print(fisher.test(Fmat, alternative = "greater")) print("p-value") print(1 - phyper(Fmat[1,1]-1, sum(Fmat[, 1]), sum(Fmat[, 2]), sum(Fmat[1, ]))) } ## create tables: for (i in 1:length(methods)) { cat(paste0(names(methods)[i], " & ", Lcnv[[i]][1,1], " & ", Lcnv[[i]][2,1], " & ", Lcnv[[i]][2,2], " & ", Lcnv[[i]][1,2], "\\\\\n")) } for (i in 1:length(methods)) { cat(paste0(names(methods)[i], " & ", Lmeth[[i]][1,1], " & ", Lmeth[[i]][2,1], " & ", Lmeth[[i]][2,2], " & ", Lmeth[[i]][1,2], "\\\\\n")) } for (i in 1:length(methods)) { cat(paste0(names(methods)[i], " & ", Lmut[[i]][1,1], " & ", Lmut[[i]][2,1], " & ", Lmut[[i]][2,2], " & ", Lmut[[i]][1,2], "\\\\\n")) } for (i in 1:length(methods)) { if (names(methods)[i] == "nn") { next() } Fmat <- Lall[[i]] ptmp <- 1 - phyper(Fmat[1,1]-1, sum(Fmat[, 1]), sum(Fmat[, 2]), sum(Fmat[1, ])) if (ptmp == 0) { ptmp <- "$< 2.2\\times10^{-16}$" } else { ptmp <- paste0("$", signif(ptmp), "$") } cat(paste0(names(methods)[i], " & ", Lall[[i]][1,1], " & ", Lall[[i]][2,1], " & ", Lall[[i]][2,2], " & ", Lall[[i]][1,2], " & ", ptmp, "\\\\\n")) } ## check correlation ## P4 <- apply(P3, 2, function(x) return(x/sum(x))) ## P4[is.na(P4)] <- 0 ## cor(as.vector(newGamma), as.vector(P3)) ## cormat <- matrix(0, nrow(pmeth), 2) fishres <- numeric(nrow(pmeth)) names(fishres) <- rownames(pmeth) for (i in 1:nrow(pmeth)) { Fmat <- matrix(c(sum(pmeth[i, ] == 2), sum(pmeth[i, ] == 1), sum(pmeth[i, ] == -2), sum(pmeth[i, ] == 0)), 2) fishres[i] <- fisher.test(Fmat, alternative = "g")$p.value cormat[i, ] <- c(sum(Fmat[1, ]), sum(Fmat[, 1])) } ## GATA3 & PTPRD: Fmat <- matrix(c(sum(pmeth[3, ] %in% c(2,-2) & pmeth[7, ] %in% c(2,-2)), sum(pmeth[3, ] %in% c(1,0) & pmeth[7, ] %in% c(-2,2)), sum(pmeth[3, ] %in% c(-2,2) & pmeth[7, ] %in% c(1,0)), sum(pmeth[3, ] %in% c(1,0) & pmeth[7, ] %in% c(1,0))), 2) 1 - phyper(Fmat[1,1]-1, sum(Fmat[, 1]), sum(Fmat[, 2]), sum(Fmat[1, ])) ## pca: pca <- princomp(D2) col <- apply(newGamma, 2, sum) col <- col/max(col) K <- kmeans(t(newGamma), nrow(newGamma)) plot(pca$loadings[, 1:2], col = K$cluster)#rgb(col,0,0,1)) ## tsne: sne <- tsne(t(newGamma)) plot(sne, col = K$cluster) ## R profiling: Rprof("temp.txt", line.profiling=TRUE) ures <- nempi(D2[1:20, ], Gamma = Gamma, complete = 1, full = TRUE, converged = converged, combi = combi) Rprof(NULL) summaryRprof("temp.txt", lines = "show")$sampling.time head(summaryRprof("temp.txt", lines = "show")$by.self, 10) ## type <- "TCGA-BRCA" load(paste0(path, type, "_nempi.rda")) pdf("Fig5.pdf", width = 10, height = 10) plot(uresn, edgelwd = 2) dev.off() pdf("Fig6.pdf", width = 10, height = 5) tmp <- uresn$Gamma colnames(tmp) <- NULL epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL) dev.off() pdf("Fig7.pdf", width = 10, height = 10) phitmp <- mytc(uresn$res$adj) tmp <- t(phitmp)%*%uresn$Gamma colnames(tmp) <- NULL rownames(tmp) <- rownames(uresn$Gamma) tmp2 <- tmp colnames(tmp) <- NULL tmp <- Gamma colnames(tmp) <- NULL tmp3 <- tmp tmp4 <- tmp2 p1 <- epiNEM::HeatmapOP(tmp2, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) p2 <- epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) print(p1, position=c(0, .5, 1, 1), more=TRUE) print(p2, position=c(0, 0, 1, .5)) sum(tmp == 1 & tmp2 == 1)/sum(tmp == 1) dev.off() pdf("Fig8.pdf", width = 10, height = 10) plot(uresf, edgelwd = 2) dev.off() pdf("Fig9.pdf", width = 10, height = 5) tmp <- uresf$Gamma colnames(tmp) <- NULL epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL) dev.off() pdf("Fig10.pdf", width = 10, height = 10) phitmp <- mytc(uresf$res$adj) tmp <- t(phitmp)%*%uresf$Gamma colnames(tmp) <- NULL rownames(tmp) <- rownames(uresf$Gamma) tmp2 <- tmp colnames(tmp) <- NULL tmp <- Gamma colnames(tmp) <- NULL p1 <- epiNEM::HeatmapOP(tmp2, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) p2 <- epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) print(p1, position=c(0, .5, 1, 1), more=TRUE) print(p2, position=c(0, 0, 1, .5)) sum(tmp == 1 & tmp2 == 1)/sum(tmp == 1) dev.off() ## new figure: P <- t(mytc(uresf$res$adj))%*%uresf$Gamma rownames(P) <- rownames(uresf$Gamma) PM <- P PM[which(PM > 1/6 & Gamma == 1)] <- P[which(PM > 1/6 & Gamma == 1)] + 1 PM[which(PM <= 1/6 & Gamma == 1)] <- P[which(PM <= 1/6 & Gamma == 1)] - 1 epiNEM::HeatmapOP(PM, bordercol = rgb(0,0,0,0), col = "RdYlBu", breaks = seq(-1,2,length.out=5), clusterx = tmp2) ## other plots: pdf("temp.pdf", width = 10, height = 10) plotDnf(c("M1=M4", "M2=M4", "M3=M5", "M1=M5"), edgelwd = 2) dev.off() M <- matrix(0, 5, 10) rownames(M) <- paste0("M", 1:5) colnames(M) <- paste0("S", 1:10) M[1, 1:4] <- 1 M[2, c(2,7:9)] <- 1 M[3, 5:6] <- 1 M[3, 10] <- 1 phi <- matrix(0, 5, 5) diag(phi) <- 1 phi[1, 1:5] <- 1 phi[2, 3] <- phi[4, 5] <- 1 ## M <- t(phi)%*%M; M[M > 1] <- 1 rownames(M) <- paste0("M", 1:5) pdf("temp.pdf", width = 8, height = 4) epiNEM::HeatmapOP(M, Colv = 0, Rowv = 0, col = "RdBu", colorkey = NULL) dev.off() ## check for mutation type (das führt zu nichts) colnames(Mtype) <- unlist(lapply(strsplit(colnames(Mtype), "-"), function(x) { y <- x[1:3] y <- paste(c(y, "01"), collapse = "-") return(y) })) checkgene <- "CBFB" van <- intersect(colnames(Gamma)[which(PM[checkgene, ] < 0)], colnames(Mtype)) A <- sum(unlist(lapply(strsplit(Mtype[checkgene, van], " "), function(x) if ("Silent" %in% x) { return(1) } else { return(0) }))) B <- length(van) - A van <- intersect(colnames(Gamma)[which(PM[checkgene, ] > 0)], colnames(Mtype)) C <- sum(unlist(lapply(strsplit(Mtype[checkgene, van], " "), function(x) if ("Silent" %in% x) { return(1) } else { return(0) }))) D <- length(van) - C table(unlist(lapply(strsplit(Mtype[checkgene, van], " "), function(x) return(names(table(x))[which.max(table(x))])))) ## table(unlist(strsplit(as.vector(Mtype), " "))) ## pdf("Fig10.pdf", width = 10, height = 10) ## p1 <- epiNEM::HeatmapOP(tmp4, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) ## p2 <- epiNEM::HeatmapOP(tmp3, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) ## p3 <- epiNEM::HeatmapOP(tmp2, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) ## p4 <- epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL) ## print(p1, position=c(0, .5, .5, 1), more=TRUE) ## print(p2, position=c(0, 0, .5, .5), more=TRUE) ## print(p3, position=c(.5, .5, 1, 1), more=TRUE) ## print(p4, position=c(.5, 0, 1, .5)) ## sum(tmp == 1 & tmp2 == 1)/sum(tmp == 1) ## dev.off() source("~/Documents/testing/R/nempi.r") source("~/Documents/testing/R/nempi_low.r") bsres <- unembs(D2, Gamma = Gamma, complete = 1, full = TRUE, converged = converged, combi = combi, bsruns = 10, bssize = 0.5) pdf("temp.pdf", width = 10, height = 5) epiNEM::HeatmapOP(bsres$Gamma, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL) dev.off() pdf("temp.pdf", width = 20, height = 10) par(mfrow=c(1,2)) tmp <- bsres$phi tmp[which(tmp > 0)] <- 1 diag(tmp) <- 0 freqtop <- as.vector(t(bsres$phi)[which(lower.tri(bsres$phi) == TRUE)])/10 freqtop <- freqtop[which(freqtop != 0)] tmptop <- tmp tmptop[lower.tri(tmptop)] <- 0 tmptop <- adj2dnf(tmptop) tmptop <- tmptop[grep("=", tmptop)] plotDnf(tmptop, edgelwd = 2, edgelab = freqtop, freq = freqtop) freqbot <- as.vector(t(bsres$phi)[which(upper.tri(bsres$phi) == TRUE)])/10 freqbot <- freqbot[which(freqbot != 0)] tmpbot <- tmp tmpbot[upper.tri(tmpbot)] <- 0 tmpbot <- adj2dnf(tmpbot) tmpbot <- tmpbot[grep("=", tmpbot)] plotDnf(tmpbot, edgelwd = 2, edgelab = freqbot, freq = freqbot) dev.off() ## source("testing/vignettes/TCGA_cluster.r") pcares <- prcomp(tmp) library(naturalsort) library(nem) library(cluster) library(Rcpp) source("~/Documents/mnem/R/mnems.r") source("~/Documents/mnem/R/mnems_low.r") sourceCpp("~/Documents/mnem/src/mm.cpp") res <- mnem(tmp, starts = 10, search = "greedy", type = "cluster3", complete = 1, multi = 1) tmp2 <- tmp Rprof("temp.txt", line.profiling=TRUE) res <- mnem(tmp2, starts = 10, search = "greedy", type = "cluster3", complete = 1, multi = 1, k = 2) Rprof(NULL) summaryRprof("temp.txt", lines = "show")$sampling.time head(summaryRprof("temp.txt", lines = "show")$by.self) ## resk <- mnemk(tmp2, starts = 10, search = "estimate", type = "cluster3", complete = 1, multi = 1) cluster <- apply(getAffinity(res$probs, mw = res$mw, complete = TRUE), 2, which.max) par(mfrow=c(1,2)) plot(pcares$rotation[, 1:2], col = cluster) names(cluster) <- gsub("-01$|-03$", "", colnames(M)) cluster <- cluster[which(names(cluster) %in% clinical$submitter_id)] cluster <- cluster[order(names(cluster))] clinical <- rbind(clinical, clinical[which(clinical[, 1] %in% names(cluster)[which(duplicated(names(cluster)))]), ]) clinical <- clinical[order(clinical[, 1]), ] print(all(clinical[, 1] == names(cluster))) fit <- survfit(Surv(days_to_death, vital_status) ~ cluster, clinical) plot(fit, col = 1:length(table(cluster)), lty = 1:length(table(cluster))) legend(max(clinical$days_to_death, na.rm = TRUE), 1, 1:length(table(cluster)), lty = 1:length(table(cluster)), col = 1:length(table(cluster)), xjust = 1, yjust = 1) fit <- coxph(Surv(days_to_death, vital_status) ~ cluster + age_at_diagnosis, clinical) print(fit) fit <- coxph(Surv(days_to_death, vital_status) ~ cluster + age_at_diagnosis + tumor_stage, clinical) print(fit) fit <- coxph(Surv(days_to_death, vital_status) ~ cluster + tumor_stage, clinical) print(fit) fit <- coxph(Surv(days_to_death, vital_status) ~ cluster, clinical) print(fit) kres <- clustNEM(tmp, nem = 0, k = length(res$comp), nstart = 10) ## kres <- clustNEM(tmp, nem = 0, nstart = 10) plot(pcares$rotation[, 1:2], col = kres$cluster) kcluster <- kres$cluster names(kcluster) <- gsub("-01$|-03$", "", colnames(M)) kcluster <- kcluster[which(names(kcluster) %in% clinical$submitter_id)] kcluster <- kcluster[order(names(kcluster))] fit <- survfit(Surv(days_to_death, vital_status) ~ kcluster, clinical) plot(fit, col = 1:length(table(cluster)), lty = 1:length(table(cluster))) legend(max(clinical$days_to_death, na.rm = TRUE), 1, 1:length(table(cluster)), lty = 1:length(table(cluster)), col = 1:length(table(cluster)), xjust = 1, yjust = 1) fit <- coxph(Surv(days_to_death, vital_status) ~ kcluster + age_at_diagnosis, clinical) print(fit) fit <- coxph(Surv(days_to_death, vital_status) ~ kcluster, clinical) print(fit)