Browse code

added scripts for data production for publication

mpirkl authored on 28/04/2021 10:49:30
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,1165 @@
1
+
2
+library(snowfall)
3
+library(TCGAbiolinks)
4
+
5
+if (is.null(types)) {
6
+    types <- TCGAbiolinks:::getGDCprojects()$project_id
7
+    ## special "FM-AD"
8
+    dont <- paste0(unique(gsub("-.*", "", types)), "-")
9
+    dont <- dont[-grep("TCGA", dont)]
10
+    samplenr <- matrix(NA, length(types), 2)
11
+    rownames(samplenr) <- types
12
+    donr <- TRUE
13
+    snrcount <- 0
14
+} else {
15
+    dont <- "AGCT"
16
+    donr <- FALSE
17
+}
18
+
19
+sizemat <- matrix(0, 1, 2)
20
+colnames(sizemat) <- c("Tumor", "Normal")
21
+rownames(sizemat) <- ""
22
+
23
+path <- "mutclust/"
24
+
25
+for (type in types) {
26
+    if (donr) {
27
+        snrcount <- snrcount + 1
28
+    }
29
+    if (length(grep(paste(dont, collapse = "|"), type)) > 0) { next() }
30
+    print(type)
31
+    if (file.exists(paste0(path, type, "_final.rda")) & !newmut & !newllr & !newsave) {
32
+        load(paste0(path, type, "_final.rda"))
33
+    } else {
34
+
35
+        summ <- TCGAbiolinks:::getProjectSummary(type)
36
+        library(SummarizedExperiment)
37
+
38
+        ## get methylation:
39
+        if (file.exists(paste0(path, type, "_meth.rda"))) {
40
+            load(paste0(path, type, "_meth.rda"))
41
+        } else {
42
+            data <- GDCquery(project = paste(type, sep = ""),
43
+                             sample.type = "Primary Tumor",
44
+                             data.category = "DNA methylation",
45
+                             data.type = "Methylation beta value",
46
+                             legacy = TRUE
47
+                             )
48
+            GDCdownload(data)
49
+            data <- GDCprepare(data, summarizedExperiment = 0)
50
+            ## map methy sites to genes:
51
+            library(methyAnalysis)
52
+            if (is.data.frame(data)) {
53
+                meth2 <- as.matrix(data[, -(1:3)])
54
+                rownames(meth2) <- gsub(";.*", "", data[, 1])
55
+            } else {
56
+                meth <- data@rowRanges
57
+                library(TxDb.Hsapiens.UCSC.hg19.knownGene)
58
+                probe2gene <- annotateDMRInfo(meth, 'TxDb.Hsapiens.UCSC.hg19.knownGene')
59
+                meth2 <- assay(data)
60
+                rownames(meth2) <- probe2gene$sigDMRInfo@elementMetadata@listData$GeneSymbol
61
+            }
62
+            meth <- meth2
63
+            meth <- meth[which(apply(meth, 1, function(x) return(any(is.na(x)))) == FALSE), ]
64
+            methm <- meth[which(duplicated(rownames(meth)) == FALSE), ]
65
+            count <- 0
66
+            for (i in which(duplicated(rownames(meth)) == FALSE)) {
67
+                count <- count + 1
68
+                methm[count, ] <- apply(meth[which(rownames(meth) %in% rownames(methm)[i]), , drop = FALSE], 2, median)
69
+            }
70
+            meth <- methm
71
+            meth <- meth[order(rownames(meth)), order(colnames(meth))]
72
+            colnames(meth) <- gsub("A$", "", lapply(strsplit(colnames(meth), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
73
+            methm <- meth[, which(duplicated(colnames(meth)) == FALSE)]
74
+            for (i in which(duplicated(colnames(meth)) == TRUE)) {
75
+                j <- which(colnames(methm) == colnames(meth)[i])
76
+                methm[, j] <- apply(meth[, which(colnames(meth) %in% colnames(methm)[i]), drop = FALSE], 2, median)
77
+            }
78
+            meth <- methm
79
+            meth <- meth[order(rownames(meth)), order(colnames(meth))]
80
+            save(meth, file = paste0(path, type, "_meth.rda"))
81
+        }
82
+
83
+        print("meth done")
84
+
85
+        ## get copy number variation:
86
+        if (file.exists(paste0(path, type, "_cnv.rda"))) {
87
+            load(paste0(path, type, "_cnv.rda"))
88
+        } else {
89
+            data <- getGistic(gsub("TCGA-", "", type), type = "thresholded")
90
+            ## data <- GDCquery(project = paste(type, sep = ""),
91
+            ##                  sample.type = "Primary solid Tumor",
92
+            ##                  data.category = "Copy Number Variation",
93
+            ##                  data.type = "Copy Number Segment",
94
+            ##                  )
95
+            ## GDCdownload(data)
96
+            ## data <- GDCprepare(data)
97
+            cnv <- data[, -(1:3)]
98
+            cnv <- apply(cnv, c(1,2), as.numeric)
99
+            rownames(cnv) <- data[, 1]
100
+            colnames(cnv) <- gsub("A$", "", lapply(strsplit(colnames(cnv), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
101
+            cnv <- cnv[order(rownames(cnv)), order(colnames(cnv))]
102
+            save(cnv, file = paste0(path, type, "_cnv.rda"))
103
+        }
104
+
105
+        print("cnv done")
106
+
107
+        ## get expression
108
+        if (file.exists(paste0(path, type, "_query.rda"))) {
109
+            load(paste0(path, type, "_query.rda"))
110
+        } else {
111
+            if (length(grep("-AML$|-LAML$", type)) > 0) {
112
+                sampletype <- c("Primary Blood Derived Cancer - Peripheral Blood", "Solid Tissue Normal")
113
+            } else {
114
+                sampletype <- c("Primary solid Tumor", "Solid Tissue Normal")
115
+            }
116
+            data <- GDCquery(project = paste(type, sep = ""),
117
+                             sample.type = sampletype,
118
+                             data.category = "Transcriptome Profiling",
119
+                             data.type = "Gene Expression Quantification",
120
+                             workflow.type = "HTSeq - Counts"
121
+                             )
122
+            GDCdownload(data)
123
+            if (is.null(data)) {
124
+                print("is null")
125
+                next()
126
+            }
127
+            data <- GDCprepare(data)
128
+            save(data, file = paste0(path, type, "_query.rda"))
129
+        }
130
+        ## process expression data:
131
+        D <- assay(data)
132
+        class <- data@colData@listData$definition
133
+
134
+        print("gene expression done")
135
+
136
+        ## get mutation
137
+        if (file.exists(paste0(path, type, "_mut.rda"))) {
138
+            load(paste0(path, type, "_mut.rda"))
139
+        } else {
140
+            type2 <- gsub(paste(paste0(unique(gsub("-.*", "", types)), "-"), collapse = "|"), "", type)
141
+            library(data.table)
142
+            GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "varscan2") # mutation file
143
+            GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "muse") # mutation file
144
+            GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "somaticsniper") # mutation file
145
+            GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "mutect2") # mutation file
146
+            mut <- list()
147
+            count <- 1
148
+            type3 <- gsub("-", "\\.", type)
149
+            for (i in list.files("GDCdata")) {
150
+                if (length(grep(type3, i)) > 0 & length(grep("csv", i)) > 0) {
151
+                    mut[[count]] <- fread(paste0("GDCdata/", i))
152
+                    count <- count + 1
153
+                }
154
+            }
155
+            save(mut, file = paste0(path, type, "_mut.rda"))
156
+        }
157
+
158
+        ## clinical
159
+        if (file.exists(paste0(path, type, "_clin.rda"))) {
160
+            load(paste0(path, type, "_clin.rda"))
161
+        } else {
162
+            clinical <- GDCquery_clinic(project = type, type = "clinical")
163
+            save(clinical, file = paste0(path, type, "_clin.rda"))
164
+        }
165
+
166
+        ## process mutation data: (https://cancer.sanger.ac.uk/cosmic/help/mutation/overview)
167
+        if (file.exists(paste0(path, type, "_mut0.rda")) & !newmut) {
168
+            load(paste0(path, type, "_mut0.rda"))
169
+        } else {
170
+            n <- nmut # try something to get all patients with at least one mutation
171
+            library(snowfall)
172
+            countSamples <- function(i, mut.org, genes, mut.mat, coln) {
173
+                i <- which(rownames(mut.mat) %in% genes[i])
174
+                tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(mut.mat)[i]), coln]
175
+                tmp2 <- mut.mat[i, ]
176
+                tmp2[which(colnames(mut.mat) %in% tmp)] <- 1
177
+                return(tmp2)
178
+            }
179
+            typeSamples <- function(i, mut.org, genes, type.mat, coln, coln2) {
180
+                i <- which(rownames(type.mat) %in% genes[i])
181
+                tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln]
182
+                tmp3 <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln2]
183
+                tmp2 <- type.mat[i, ]
184
+                tmp2[which(colnames(type.mat) %in% tmp)] <- tmp3
185
+                return(tmp2)
186
+            }
187
+            biggl <- list()
188
+            for (i in length(mut)) {
189
+                mutation <- mut[[i]]
190
+                biggl[[i]] <- mutation$Hugo_Symbol
191
+            }
192
+            freq <- sort(table(unlist(biggl)), decreasing = TRUE)
193
+            if (n == 0) {
194
+                allsub <- names(freq)
195
+            } else {
196
+                allsub <- names(freq)[1:n]
197
+            }
198
+            M <- Mtype <- list()
199
+            for (i in 1:length(mut)) {
200
+                mutation <- mut[[i]]
201
+                mut.mat <- matrix(0, length(allsub), length(unique(mutation$Tumor_Sample_Barcode)))
202
+                type.mat <- matrix("", length(allsub), length(unique(mutation$Tumor_Sample_Barcode)))
203
+                colnames(type.mat) <- colnames(mut.mat) <- sort(unique(mutation$Tumor_Sample_Barcode))
204
+                rownames(type.mat) <- rownames(mut.mat) <- allsub
205
+                coln <- which(colnames(mutation) %in% "Tumor_Sample_Barcode")
206
+                coln2 <- which(colnames(mutation) %in% "Variant_Classification")
207
+                mut.org <- mutation[which(mutation$Hugo_Symbol %in% allsub), ]
208
+                sfInit(parallel = T, cpus = 4)
209
+                sfExport("mut.mat", "coln")
210
+                tmp <- sfLapply(as.list(1:length(allsub)), countSamples, mut.org, allsub, mut.mat, coln)
211
+                sfStop()
212
+                tmp <- do.call("rbind", tmp)
213
+                rownames(tmp) <- allsub
214
+                colnames(tmp) <- colnames(mut.mat)
215
+                M[[i]] <- tmp
216
+                sfInit(parallel = T, cpus = 4)
217
+                sfExport("type.mat", "coln2")
218
+                tmp <- sfLapply(as.list(1:length(allsub)), typeSamples, mut.org, allsub, type.mat, coln, coln2)
219
+                sfStop()
220
+                tmp <- do.call("rbind", tmp)
221
+                rownames(tmp) <- allsub
222
+                colnames(tmp) <- colnames(mut.mat)
223
+                Mtype[[i]] <- tmp
224
+            }
225
+            samples <- intersect(intersect(intersect(colnames(M[[1]]), colnames(M[[2]])), colnames(M[[3]])), colnames(M[[4]]))
226
+            M0 <- M[[1]][, which(colnames(M[[1]]) %in% samples)]
227
+            Mtype0 <- Mtype[[1]][, which(colnames(Mtype[[1]]) %in% samples)]
228
+            for (i in 2:length(M)) {
229
+                M0 <- M0 + M[[i]][, which(colnames(M[[i]]) %in% samples)]
230
+                Mtype0 <- matrix(paste(Mtype0, Mtype[[i]][, which(colnames(Mtype[[i]]) %in% samples)]), nrow(Mtype0))
231
+            }
232
+            rownames(Mtype0) <- rownames(M0)
233
+            colnames(Mtype0) <- colnames(M0)
234
+            save(M0, Mtype0, file = paste0(path, type, "_mut0.rda"))
235
+        }
236
+
237
+        ## process expression data:
238
+        D <- assay(data)
239
+        class <- data@colData@listData$definition
240
+        M <- M0
241
+        Mtype <- Mtype0
242
+        colnames(M) <- lapply(colnames(M), function(x) {
243
+            y <- unlist(strsplit(x, "-"))
244
+            y <- paste(y[1:4], collapse = "-")
245
+            y <- unlist(strsplit(y, ""))
246
+            y <- paste(y[1:(length(y)-1)], collapse = "")
247
+            return(y)
248
+        })
249
+        colnames(D) <- lapply(colnames(D), function(x) {
250
+            y <- unlist(strsplit(x, "-"))
251
+            y <- paste(y[1:4], collapse = "-")
252
+            y <- unlist(strsplit(y, ""))
253
+            y <- paste(y[1:(length(y)-1)], collapse = "")
254
+            return(y)
255
+        })
256
+        colnames(M) <- gsub("A$", "", lapply(strsplit(colnames(M), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
257
+        M <- M[order(rownames(M)), order(colnames(M))]
258
+        Mtype <- Mtype[order(rownames(Mtype)), order(colnames(Mtype))]
259
+
260
+        print("mutation done")
261
+
262
+        ## log odds:
263
+        if (file.exists(paste0(path, type, "_llr.rda")) & !newllr) {
264
+            load(paste0(path, type, "_llr.rda"))
265
+        } else {
266
+            library(edgeR)
267
+            if (sum(class %in% "Solid Tissue Normal") < 10) {
268
+                distrParNC <- function(i, data) {
269
+                    data[i, ] <- data[i, ] - median(data[, i])
270
+                    llrcol <- numeric(ncol(data))
271
+                    div0 <- quantile(data[i, ], 0.25)
272
+                    div1 <- quantile(data[i, ], 0.75)
273
+                    sigma <- sd(data[i, ])
274
+                    for (j in 1:ncol(data)) {
275
+                        if (data[i, j] <= 0) {
276
+                            llrcol[j] <- log2(div0/data[i, j])
277
+                        } else {
278
+                            llrcol[j] <- log2(data[i, j]/div1)
279
+                        }
280
+                    }
281
+                    return(llrcol)
282
+                }
283
+                highcounts <- which(apply(D, 1, median) >= 10)
284
+                DC <- D[highcounts, ]
285
+                genenames <- rownames(D)[highcounts, ]
286
+                nf <- calcNormFactors(DC)
287
+                DC <- t(t(DC)/nf) # DC <- DC2
288
+                ## this is very adventurous:
289
+                ## DC <- t(scale(t(DC)))
290
+                ## DC <- abs(DC)
291
+                ## pc <- 1-min(DC)
292
+                ## tmp <- log2(DC+pc)
293
+                ## hist(tmp)
294
+                sfInit(parallel = T, cpus = 4)
295
+                sfExport("DC")
296
+                tmp <- do.call("rbind", sfLapply(1:nrow(DC),
297
+                                                 distrParNC, DC))
298
+                colnames(tmp) <- colnames(DC)
299
+                rownames(tmp) <- genenames
300
+                sfStop()
301
+                DF <- DC
302
+            } else {
303
+                library(ks)
304
+                mykcde <- function(x, k) {
305
+                    a <- which.min(abs(k$eval.points - x))
306
+                    b <- k$estimate[a]
307
+                    b <- min(b, 1-b)
308
+                    return(b)
309
+                }
310
+                distrParKs <- function(i, data, C) {
311
+                    llrcol <- numeric(ncol(data))
312
+                    ddistr <- list()
313
+                    dogenes <- unique(colnames(data))[which(!(unique(colnames(data)) %in% ""))]
314
+                    for (j in dogenes) {
315
+                        D <- which(colnames(data) %in% j)
316
+                        ddistr[[j]] <- kcde(data[i, D])
317
+                    }
318
+                    cdistr <- kcde(data[i, C])
319
+                    for (j in which(!(colnames(data) %in% ""))) {
320
+                        gene <- colnames(data)[j]
321
+                        llrcol[j] <- log2(mykcde(data[i, j], ddistr[[gene]])/mykcde(data[i,j], cdistr))
322
+                    }
323
+                    llrcol <- llrcol[-C]
324
+                    return(llrcol)
325
+                }
326
+                DN <- D[, which(class %in% "Solid Tissue Normal")]
327
+                DT <- D[, which(class %in% "Primary solid Tumor")]
328
+                DF <- cbind(DT, DN)
329
+                C <- (ncol(DT)+1):ncol(DF)
330
+                highcounts <- which(apply(DF, 1, median) >= 10)
331
+                DF <- DF[highcounts, ]
332
+                genenames <- rownames(DF)
333
+                colnames(DF)[1:ncol(DT)] <- "P" # not knock-down specific!
334
+                colnames(DF)[grep(paste(unique(gsub("-.*", "", types)), collapse = "|"), colnames(DF))] <- ""
335
+                nf <- calcNormFactors(DF)
336
+                DF <- t(t(DF)/nf)
337
+                sfInit(parallel = T, cpus = 4)
338
+                sfExport("DF", "C", "mykcde")
339
+                sfLibrary(ks)
340
+                tmp <- do.call("rbind", sfLapply(1:nrow(DF),
341
+                                                 distrParKs, DF, C))
342
+                sfStop()
343
+                colnames(tmp) <- colnames(DT)
344
+            }
345
+            save(tmp, DF, file = paste0(path, type, "_llr.rda"))
346
+        }
347
+
348
+        rownames(tmp) <- rownames(DF)
349
+
350
+        par(mfrow=c(1,3))
351
+        hist(tmp)
352
+
353
+        tmp[which(is.na(tmp) | is.infinite(tmp))] <- 0
354
+
355
+        hist(tmp)
356
+
357
+        D <- tmp
358
+
359
+        print("expression done")
360
+
361
+        ## sd.glob <- sd(tmp)
362
+
363
+        ## tmp <- tmp[-which(apply(tmp, 1, sd) < sd.glob), ]
364
+
365
+        ## hist(tmp)
366
+
367
+        ## prep clinical data:
368
+        clinical[which(clinical$vital_status%in% "dead"), which(colnames(clinical) %in% "vital_status")] <- 1
369
+        clinical[which(clinical$vital_status%in% "alive"), which(colnames(clinical) %in% "vital_status")] <- 0
370
+        count <- 0
371
+        for (stage in sort(unique(clinical$tumor_stage))) {
372
+            clinical[which(clinical$tumor_stage%in% stage), which(colnames(clinical) %in% "tumor_stage")] <- count
373
+            count <- count + 1
374
+        }
375
+        clinical$tumor_stage <- as.numeric(clinical$tumor_stage)
376
+        clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_death")] <- clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_last_follow_up")]
377
+        clinical$vital_status<- as.numeric(clinical$vital_status)
378
+
379
+        print("clinical done")
380
+
381
+        save(clinical, D, M, Mtype, DF, class, meth, cnv, file = paste0(path, type, "_final.rda"))
382
+    }
383
+    print(table(class))
384
+    sizemat <- rbind(sizemat, table(class))
385
+    rownames(sizemat)[nrow(sizemat)] <- type
386
+    if (donr) {
387
+        samplenr[snrcount, 1] <- sum(class %in% "Primary solid Tumor")
388
+        samplenr[snrcount, 2] <- sum(class %in% "Solid Tissue Normal")
389
+    }
390
+}
391
+
392
+stop("done")
393
+
394
+samplenr2 <- samplenr[-which(is.na(samplenr[, 1]) == TRUE), ]
395
+barplot(t(samplenr2[order(apply(samplenr2, 1, sum)), ]), horiz = 1, space = 1, las = 2)
396
+
397
+newllr <- 1; newmut <- 1; nmut <- 1; newsave <- 1; types <- c("TCGA-SKCM","TCGA-UVM"); source("~/Documents/testing/general/TCGA.r")
398
+
399
+nonesolid <- c("TCGA-LAML")
400
+solidnonormal <- c()
401
+
402
+## analysis:
403
+
404
+type <- "TCGA-BRCA"
405
+
406
+path <- "mutclust/"
407
+
408
+load(paste0(path, type, "_final.rda"))
409
+
410
+M[which(M < 3)] <- 0
411
+M[which(M > 0)] <- 1
412
+
413
+M[grep("Silent", Mtype)] <- 0
414
+M <- M[order(rownames(M)), order(colnames(M))]
415
+
416
+cnv[which(cnv == 0)] <- 0
417
+cnv[which(cnv != 0)] <- 1
418
+cnv <- cnv[order(rownames(cnv)), order(colnames(cnv))]
419
+
420
+meth[which(meth > 0.5)] <- 1
421
+meth[which(meth <= 0.5)] <- 0
422
+meth <- meth[order(rownames(meth)), order(colnames(meth))]
423
+meth[is.na(meth)] <- 0
424
+
425
+P <- matrix(0, length(unique(c(rownames(M), rownames(cnv), rownames(meth)))), length(unique(c(colnames(M), colnames(cnv), colnames(meth)))))
426
+rownames(P) <- sort(unique(c(rownames(M), rownames(cnv), rownames(meth))))
427
+colnames(P) <- sort(unique(c(colnames(M), colnames(cnv), colnames(meth))))
428
+colnames(P) <- gsub("A$", "", lapply(strsplit(colnames(P), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
429
+P <- P[, which(duplicated(colnames(P)) == FALSE)]
430
+
431
+P[which(rownames(P) %in% rownames(M)), which(colnames(P) %in% colnames(M))] <- M[which(rownames(M) %in% rownames(P)), which(colnames(M) %in% colnames(P))]
432
+
433
+Pmut <- P
434
+
435
+P <- Pmut*0
436
+
437
+P[which(rownames(P) %in% rownames(meth)), which(colnames(P) %in% colnames(meth))] <- P[which(rownames(P) %in% rownames(meth)), which(colnames(P) %in% colnames(meth))] + meth[which(rownames(meth) %in% rownames(P)), which(colnames(meth) %in% colnames(P))]
438
+
439
+Pmeth <- P
440
+
441
+P <- Pmeth*0
442
+
443
+P[which(rownames(P) %in% rownames(cnv)), which(colnames(P) %in% colnames(cnv))] <- P[which(rownames(P) %in% rownames(cnv)), which(colnames(P) %in% colnames(cnv))] + cnv[which(rownames(cnv) %in% rownames(P)), which(colnames(cnv) %in% colnames(P))]
444
+
445
+Pcnv <- P
446
+
447
+P <- Pmut+Pmeth+Pcnv
448
+
449
+P2 <- P # full abberations including cnv and meth
450
+
451
+P <- Pmut
452
+
453
+## Bailey et al Cell 2018
454
+
455
+goi <- c("MAP2K4", "GATA3", "GPS2", "TBX3", "PTPRD", "NCOR1", "CBFB", "CDKN1B") # BRCA
456
+
457
+P <- P[which(rownames(P) %in% goi), ]
458
+
459
+P[which(P > 1)] <- 1
460
+P <- apply(P, 2, function(x) return(x/sum(x)))
461
+P[is.na(P)] <- 0
462
+
463
+## data imputation:
464
+
465
+library(naturalsort)
466
+library(nem)
467
+library(cluster)
468
+library(Rcpp)
469
+library(Rgraphviz)
470
+library(mnem)
471
+
472
+source("~/Documents/mnem/R/mnems.r")
473
+source("~/Documents/mnem/R/mnems_low.r")
474
+sourceCpp("~/Documents/mnem/src/mm.cpp")
475
+
476
+source("~/Documents/nempi/R/nempi_main.r")
477
+source("~/Documents/nempi/R/nempi_low.r")
478
+
479
+Rho <- cbind(P, matrix(0, nrow(P), sum(!(colnames(D) %in% colnames(P)))))
480
+
481
+colnames(Rho) <- c(colnames(P), colnames(D)[which(!(colnames(D) %in% colnames(P)))])
482
+
483
+Rho <- Rho[, colnames(D)]
484
+
485
+if (sum(apply(Rho, 1, sum) == 0) > 0) {
486
+    Rho <- Rho[-which(apply(Rho, 1, sum) == 0), ]
487
+}
488
+
489
+Rho[is.na(Rho)] <- 0
490
+
491
+sum(apply(Rho, 2, sum) == 0)/ncol(Rho) # unlabelled
492
+
493
+pdf("temp.pdf", width = 12, height = 6)
494
+tmp <- Rho
495
+colnames(tmp) <- NULL
496
+epiNEM::HeatmapOP(tmp, col = "RdBu", Rowv = 0, bordercol = "transparent")
497
+dev.off()
498
+
499
+inc <- sort(apply(Rho, 1, sum))
500
+
501
+D2 <- D[which(apply(D, 1, median) != 0), ]
502
+
503
+D3 <- D2[, which(duplicated(colnames(D2)) == FALSE)]
504
+Rho <- Rho[, which(duplicated(colnames(D2)) == FALSE)]
505
+for (i in which(duplicated(colnames(D2)) == TRUE)) {
506
+    j <- which(colnames(D3) %in% colnames(D2)[i])
507
+    D3[, j] <- apply(D2[, which(colnames(D2) %in% colnames(D2)[i]), drop = FALSE], 1, median)
508
+}
509
+
510
+D2 <- D3
511
+
512
+colnames(D2) <- c(rownames(Rho), sample(rownames(Rho), ncol(D2)-nrow(Rho), replace = TRUE))
513
+
514
+converged <- 10
515
+
516
+start <- Sys.time()
517
+nempires <- nempi(D2, Gamma = Rho, full = TRUE, converged = converged)
518
+end <- Sys.time()
519
+
520
+print(end - start)
521
+
522
+ures <- nempires
523
+
524
+sum(ures$lls[2:length(ures$lls)] - ures$lls[1:(length(ures$lls)-1)] < 0)
525
+
526
+pdf("temp.pdf", width = 12, height = 6)
527
+epiNEM::HeatmapOP(ures$Gamma, bordercol = rgb(0,0,0,0), col = "RdBu")
528
+#plot(ures, edgewidth = 30)
529
+dev.off()
530
+
531
+pdf("temp.pdf", width = 9, height = 6)
532
+par(mfrow=c(2,3))
533
+plotConvergence(ures, type = "b", col = "blue")
534
+dev.off()
535
+
536
+source("~/Documents/nempi/R/nempi_main.r")
537
+D4 <- D2
538
+colnames(D4) <- apply(Rho, 2, function(x) {
539
+    Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
540
+    return(Sgenes)
541
+})
542
+
543
+## run on hpc cluster:
544
+
545
+path <- ""
546
+type <- "TCGA-BRCA"
547
+
548
+if (do == 1) {
549
+
550
+    library(e1071)
551
+
552
+    load(paste0(path, type, "_nempi.rda"))
553
+
554
+    D4 <- D2
555
+    colnames(D4) <- apply(Rho, 2, function(x) {
556
+        Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
557
+        return(Sgenes)
558
+    })
559
+
560
+    svmres <- classpi(D4, full = TRUE, method = "svm")
561
+
562
+    save(svmres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda"))
563
+
564
+}
565
+
566
+if (do == 2) {
567
+
568
+    library(nnet)
569
+
570
+    load(paste0(path, type, "_nempi.rda"))
571
+
572
+    D4 <- D2
573
+    colnames(D4) <- apply(Rho, 2, function(x) {
574
+        Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
575
+        return(Sgenes)
576
+    })
577
+
578
+    nnres <- classpi(D4, full = TRUE, method = "nnet", MaxNWts = 50000, size = 5) # takes forever
579
+
580
+    save(nnres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda"))
581
+
582
+}
583
+
584
+if (do == 3) {
585
+
586
+    library(CALIBERrfimpute)
587
+
588
+    load(paste0(path, type, "_nempi.rda"))
589
+
590
+    D4 <- D2
591
+    colnames(D4) <- apply(Rho, 2, function(x) {
592
+        Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
593
+        return(Sgenes)
594
+    })
595
+
596
+    mfdata <- cbind(as.data.frame(t(D4)), colnames(D4))
597
+    mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA
598
+
599
+    micedata <- mfdata
600
+    colnames(micedata) <- paste0(LETTERS[1:ncol(micedata)], 1:ncol(micedata))
601
+    miceres <- mice(micedata, method = c(rep('rfcont', ncol(micedata)-1), 'rfcat'), m = 2, maxit = 2)
602
+
603
+    save(miceres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda"))
604
+
605
+}
606
+
607
+if (do == 4) {
608
+
609
+    library(e1071)
610
+
611
+    load(paste0(path, type, "_nempi.rda"))
612
+
613
+    D4 <- D2
614
+    colnames(D4) <- apply(Rho, 2, function(x) {
615
+        Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
616
+        return(Sgenes)
617
+    })
618
+
619
+    rfres <- classpi(D4, full = TRUE, method = "randomForest")
620
+
621
+    save(rfres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda"))
622
+
623
+}
624
+
625
+if (do == 5) {
626
+
627
+    library(e1071)
628
+
629
+    load(paste0(path, type, "_nempi.rda"))
630
+
631
+    D4 <- D2
632
+    colnames(D4) <- apply(Rho, 2, function(x) {
633
+        Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
634
+        return(Sgenes)
635
+    })
636
+
637
+    mfdata <- cbind(as.data.frame(t(D4)), colnames(D4))
638
+    mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA
639
+    library(missForest)
640
+    mfimp <- missForest(mfdata)
641
+    D4 <- D2
642
+    colnames(D4) <- mfimp$ximp[, ncol(mfimp$ximp)]
643
+    tmp <- mynem(D4, multi = TRUE)
644
+    Gamma <- getGamma(D4)
645
+    ures <- list()
646
+    ures$Gamma <- apply(Gamma, 2, function(x) return(x/sum(x)))
647
+    ures$res <- list()
648
+    ures$res$adj <- tmp$adj
649
+    ures$null <- TRUE
650
+    ures$combi <- 1
651
+    mfres <- ures
652
+
653
+    save(mfres, file = paste0("temp_", do, "_", as.numeric(Sys.time()), ".rda"))
654
+
655
+}
656
+
657
+## knn
658
+
659
+library(class)
660
+train <- t(D4[, which(colnames(D4) != "")])
661
+test <- t(D4[, which(colnames(D4) == "")])
662
+knn0 <- 0
663
+if (knn0) {
664
+    train <- rbind(train, NULL = rep(0, ncol(train)))
665
+    cl <- c(colnames(D4)[which(colnames(D4) != "")], "NULL")
666
+} else {
667
+    cl <- colnames(D4)[which(colnames(D4) != "")]
668
+}
669
+knnres <- knn(train, test, cl, prob = TRUE)
670
+D3 <- D4
671
+colnames(D3)[which(colnames(D3) %in% "")] <- as.character(knnres)
672
+tmp <- mynem(D3, multi = TRUE)
673
+Gamma <- getGamma(D3)
674
+ures <- list()
675
+ures$Gamma <- Gamma # apply(Gamma, 2, function(x) return(x/sum(x)))
676
+ures$res <- list()
677
+ures$res$adj <- tmp$adj
678
+ures$null <- TRUE
679
+ures$combi <- 1
680
+
681
+knnres <- ures
682
+
683
+## save(nempires, knnres, rfres, mfres, svmres, nnres, Rho, D2, Pmut, Pmeth, Pcnv, file = paste0(path, type, "_nempi.rda"))
684
+
685
+path <- "mutclust/"; type <- "TCGA-BRCA"
686
+
687
+load(paste0(path, type, "_nempi.rda"))
688
+
689
+## ## load("~/Mount/Euler/temp_1_1573814974.40659.rda") # old
690
+
691
+## load("~/Mount/Euler/temp_1_1574076694.65703.rda")
692
+
693
+## ## load("~/Mount/Euler/temp_4_1573819581.9749.rda") # old
694
+
695
+## load("~/Mount/Euler/temp_4_1574080528.67352.rda")
696
+
697
+## ## load("~/Mount/Euler/temp_2_1573821112.2412.rda") # old
698
+
699
+## load("~/Mount/Euler/temp_2_1574084503.12431.rda")
700
+
701
+## load("~/Mount/Euler/temp_5_1574081415.91547.rda")
702
+
703
+## ures <- rfres
704
+
705
+## ures <- mfres
706
+
707
+## ures <- svmres
708
+
709
+## ures <- nnres
710
+
711
+## ures <- knnres
712
+
713
+ures <- nempires
714
+
715
+## check against methylation and cnvs:
716
+
717
+pdf("nempi_gamma.pdf", width = 12, height = 6)
718
+tmp <- ures$Gamma
719
+colnames(tmp) <- NULL
720
+epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL)
721
+dev.off()
722
+
723
+pdf("nempi_phi.pdf", width = 6, height = 6)
724
+Pgenes <- sort(unique(colnames(D2)))
725
+adjtmp <- ures$res$adj
726
+colnames(adjtmp) <- rownames(adjtmp) <- Pgenes
727
+plotDnf(adjtmp, edgelwd = 2)
728
+dev.off()
729
+
730
+## cnv/meth enrichment:
731
+
732
+methods <- list("NEM$\\pi$" = nempires, knn = knnres, mf = mfres, nn = nnres, rf = rfres, svm = svmres)
733
+
734
+mutinc <- 1
735
+
736
+Lall <- Lcnv <- Lmeth <- Lmut <- list()
737
+
738
+for (i in 1:length(methods)) {
739
+    if (i != 8) {
740
+        print(names(methods)[i])
741
+        ures <- methods[[i]]
742
+        newGamma <- ures$Gamma
743
+    } else {
744
+        print("random")
745
+        newGamma <- newGamma*0
746
+        newGamma[sample(1:length(newGamma), floor(0.45*length(newGamma)))] <- 1 # well that is included into the test...
747
+    }
748
+    hist(newGamma)
749
+    if (i == 1) {
750
+        rntmp <- rownames(newGamma); newGamma <- t(mytc(ures$res$adj))%*%newGamma; rownames(newGamma) <- rntmp
751
+    }
752
+    P <- Pmut+Pmeth+Pcnv
753
+    if (!mutinc) { # include mutations or not (0)
754
+        P[which(Pmut == 1)] <- 0
755
+    }
756
+    P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))]
757
+    P <- P[order(rownames(P)), order(colnames(P))]
758
+    P[which(P > 1)] <- 1
759
+    Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P)))))
760
+    colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))])
761
+    P <- Ptmp[, colnames(Rho)]
762
+    ## fisher:
763
+    if (i %in% c(4,5)) {
764
+        cut <- 0.07
765
+    } else if (i == 6) {
766
+        cut <- 0.1
767
+    } else {
768
+        cut <- 1/8
769
+    }
770
+    newGamma[which(newGamma > cut)] <- 1
771
+    newGamma[which(newGamma <= cut)] <- 0
772
+    pmeth <- newGamma
773
+    pmeth[which(newGamma == 1 & P == 1)] <- 2
774
+    pmeth[which(newGamma == 0 & P == 1)] <- -2
775
+    if (!mutinc) { # include mutations or not (0)
776
+        pmeth[which(Rho > 0)] <- 0
777
+    }
778
+    colnames(pmeth) <- NULL
779
+    ##pdf(paste0("FigS_", names(methods)[i], ".pdf"), height = 6, width = 12)
780
+    setEPS()
781
+    postscript(paste0("FigS_", names(methods)[i], ".eps"), height = 6, width = 12)
782
+    print(epiNEM::HeatmapOP(pmeth, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL))
783
+    dev.off()
784
+    print("cnv")
785
+    P <- Pcnv
786
+    P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))]
787
+    P <- P[order(rownames(P)), order(colnames(P))]
788
+    P[which(P > 1)] <- 1
789
+    Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P)))))
790
+    colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))])
791
+    P <- Ptmp[, colnames(Rho)]
792
+    F <- matrix(c(sum(pmeth >= 1 & P == 1), sum(pmeth >= 1 & P == 0), sum(pmeth == -2 & P == 1), sum(pmeth == 0 & P == 0)), 2)
793
+    print(1 - phyper(F[1,1]-1, sum(F[, 1]), sum(F[, 2]), sum(F[1, ])))
794
+    Lcnv[[i]] <- F
795
+    print("meth")
796
+    P <- Pmeth
797
+    P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))]
798
+    P <- P[order(rownames(P)), order(colnames(P))]
799
+    P[which(P > 1)] <- 1
800
+    Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P)))))
801
+    colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))])
802
+    P <- Ptmp[, colnames(Rho)]
803
+    F <- matrix(c(sum(pmeth >= 1 & P == 1), sum(pmeth >= 1 & P == 0), sum(pmeth == -2 & P == 1), sum(pmeth == 0 & P == 0)), 2)
804
+    print(1 - phyper(F[1,1]-1, sum(F[, 1]), sum(F[, 2]), sum(F[1, ])))
805
+    Lmeth[[i]] <- F
806
+    print("mut")
807
+    P <- Pmut
808
+    P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))]
809
+    P <- P[order(rownames(P)), order(colnames(P))]
810
+    P[which(P > 1)] <- 1
811
+    Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P)))))
812
+    colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))])
813
+    P <- Ptmp[, colnames(Rho)]
814
+    F <- matrix(c(sum(pmeth >= 1 & P == 1), sum(pmeth >= 1 & P == 0), sum(pmeth == -2 & P == 1), sum(pmeth == 0 & P == 0)), 2)
815
+    print(1 - phyper(F[1,1]-1, sum(F[, 1]), sum(F[, 2]), sum(F[1, ])))
816
+    Lmut[[i]] <- F
817
+    Fmat <- matrix(c(sum(pmeth == 2), sum(pmeth == 1), sum(pmeth == -2), sum(pmeth == 0)), 2)
818
+    Lall[[i]] <- Fmat
819
+    ## print(fisher.test(Fmat, alternative = "greater"))
820
+    print("p-value")
821
+    print(1 - phyper(Fmat[1,1]-1, sum(Fmat[, 1]), sum(Fmat[, 2]), sum(Fmat[1, ])))
822
+}
823
+
824
+## create tables:
825
+
826
+for (i in 1:length(methods)) {
827
+    cat(paste0(names(methods)[i], " & ", Lcnv[[i]][1,1], " & ", Lcnv[[i]][2,1], " & ", Lcnv[[i]][2,2], " & ", Lcnv[[i]][1,2], "\\\\\n"))
828
+}
829
+
830
+for (i in 1:length(methods)) {
831
+    cat(paste0(names(methods)[i], " & ", Lmeth[[i]][1,1], " & ", Lmeth[[i]][2,1], " & ", Lmeth[[i]][2,2], " & ", Lmeth[[i]][1,2], "\\\\\n"))
832
+}
833
+
834
+for (i in 1:length(methods)) {
835
+    cat(paste0(names(methods)[i], " & ", Lmut[[i]][1,1], " & ", Lmut[[i]][2,1], " & ", Lmut[[i]][2,2], " & ", Lmut[[i]][1,2], "\\\\\n"))
836
+}
837
+
838
+for (i in 1:length(methods)) {
839
+    if (names(methods)[i] == "nn") { next() }
840
+    Fmat <- Lall[[i]]
841
+    ptmp <- 1 - phyper(Fmat[1,1]-1, sum(Fmat[, 1]), sum(Fmat[, 2]), sum(Fmat[1, ]))
842
+    if (ptmp == 0) {
843
+        ptmp <- "$< 2.2\\times10^{-16}$"
844
+    } else {
845
+        ptmp <- paste0("$", signif(ptmp), "$")
846
+    }
847
+    cat(paste0(names(methods)[i], " & ", Lall[[i]][1,1], " & ", Lall[[i]][2,1], " & ", Lall[[i]][2,2], " & ", Lall[[i]][1,2], " & ", ptmp, "\\\\\n"))
848
+}
849
+
850
+
851
+## check correlation
852
+
853
+## P4 <- apply(P3, 2, function(x) return(x/sum(x)))
854
+## P4[is.na(P4)] <- 0
855
+
856
+## cor(as.vector(newGamma), as.vector(P3))
857
+
858
+##
859
+
860
+cormat <- matrix(0, nrow(pmeth), 2)
861
+fishres <- numeric(nrow(pmeth))
862
+names(fishres) <- rownames(pmeth)
863
+for (i in 1:nrow(pmeth)) {
864
+    Fmat <- matrix(c(sum(pmeth[i, ] == 2), sum(pmeth[i, ] == 1), sum(pmeth[i, ] == -2), sum(pmeth[i, ] == 0)), 2)
865
+    fishres[i] <- fisher.test(Fmat, alternative = "g")$p.value
866
+    cormat[i, ] <- c(sum(Fmat[1, ]), sum(Fmat[, 1]))
867
+}
868
+
869
+## GATA3 & PTPRD:
870
+
871
+Fmat <- matrix(c(sum(pmeth[3, ] %in% c(2,-2) & pmeth[7, ] %in% c(2,-2)),
872
+                 sum(pmeth[3, ] %in% c(1,0) & pmeth[7, ] %in% c(-2,2)),
873
+                 sum(pmeth[3, ] %in% c(-2,2) & pmeth[7, ] %in% c(1,0)),
874
+                 sum(pmeth[3, ] %in% c(1,0) & pmeth[7, ] %in% c(1,0))), 2)
875
+
876
+1 - phyper(Fmat[1,1]-1, sum(Fmat[, 1]), sum(Fmat[, 2]), sum(Fmat[1, ]))
877
+
878
+
879
+## pca:
880
+
881
+pca <- princomp(D2)
882
+
883
+col <- apply(newGamma, 2, sum)
884
+
885
+col <- col/max(col)
886
+
887
+K <- kmeans(t(newGamma), nrow(newGamma))
888
+
889
+plot(pca$loadings[, 1:2], col = K$cluster)#rgb(col,0,0,1))
890
+
891
+## tsne:
892
+
893
+sne <- tsne(t(newGamma))
894
+
895
+plot(sne, col = K$cluster)
896
+
897
+## R profiling:
898
+
899
+Rprof("temp.txt", line.profiling=TRUE)
900
+ures <- nempi(D2[1:20, ], Gamma = Gamma, complete = 1, full = TRUE, converged = converged, combi = combi)
901
+Rprof(NULL)
902
+summaryRprof("temp.txt", lines = "show")$sampling.time
903
+head(summaryRprof("temp.txt", lines = "show")$by.self, 10)
904
+
905
+##
906
+
907
+type <- "TCGA-BRCA"
908
+load(paste0(path, type, "_nempi.rda"))
909
+
910
+pdf("Fig5.pdf", width = 10, height = 10)
911
+plot(uresn, edgelwd = 2)
912
+dev.off()
913
+
914
+pdf("Fig6.pdf", width = 10, height = 5)
915
+tmp <- uresn$Gamma
916
+colnames(tmp) <- NULL
917
+epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL)
918
+dev.off()
919
+
920
+pdf("Fig7.pdf", width = 10, height = 10)
921
+phitmp <- mytc(uresn$res$adj)
922
+tmp <- t(phitmp)%*%uresn$Gamma
923
+colnames(tmp) <- NULL
924
+rownames(tmp) <- rownames(uresn$Gamma)
925
+tmp2 <- tmp
926
+colnames(tmp) <- NULL
927
+tmp <- Gamma
928
+colnames(tmp) <- NULL
929
+tmp3 <- tmp
930
+tmp4 <- tmp2
931
+p1 <- epiNEM::HeatmapOP(tmp2, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
932
+p2 <- epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
933
+print(p1, position=c(0, .5, 1, 1), more=TRUE)
934
+print(p2, position=c(0, 0, 1, .5))
935
+sum(tmp == 1 & tmp2 == 1)/sum(tmp == 1)
936
+dev.off()
937
+
938
+pdf("Fig8.pdf", width = 10, height = 10)
939
+plot(uresf, edgelwd = 2)
940
+dev.off()
941
+
942
+pdf("Fig9.pdf", width = 10, height = 5)
943
+tmp <- uresf$Gamma
944
+colnames(tmp) <- NULL
945
+epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL)
946
+dev.off()
947
+
948
+pdf("Fig10.pdf", width = 10, height = 10)
949
+phitmp <- mytc(uresf$res$adj)
950
+tmp <- t(phitmp)%*%uresf$Gamma
951
+colnames(tmp) <- NULL
952
+rownames(tmp) <- rownames(uresf$Gamma)
953
+tmp2 <- tmp
954
+colnames(tmp) <- NULL
955
+tmp <- Gamma
956
+colnames(tmp) <- NULL
957
+p1 <- epiNEM::HeatmapOP(tmp2, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
958
+p2 <- epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
959
+print(p1, position=c(0, .5, 1, 1), more=TRUE)
960
+print(p2, position=c(0, 0, 1, .5))
961
+sum(tmp == 1 & tmp2 == 1)/sum(tmp == 1)
962
+dev.off()
963
+
964
+## new figure:
965
+
966
+P <- t(mytc(uresf$res$adj))%*%uresf$Gamma
967
+rownames(P) <- rownames(uresf$Gamma)
968
+PM <- P
969
+PM[which(PM > 1/6 & Gamma == 1)] <- P[which(PM > 1/6 & Gamma == 1)] + 1
970
+PM[which(PM <= 1/6 & Gamma == 1)] <- P[which(PM <= 1/6 & Gamma == 1)] - 1
971
+epiNEM::HeatmapOP(PM, bordercol = rgb(0,0,0,0), col = "RdYlBu", breaks = seq(-1,2,length.out=5), clusterx = tmp2)
972
+
973
+## other plots:
974
+
975
+pdf("temp.pdf", width = 10, height = 10)
976
+plotDnf(c("M1=M4", "M2=M4", "M3=M5", "M1=M5"), edgelwd = 2)
977
+dev.off()
978
+
979
+M <- matrix(0, 5, 10)
980
+rownames(M) <- paste0("M", 1:5)
981
+colnames(M) <- paste0("S", 1:10)
982
+
983
+M[1, 1:4] <- 1
984
+M[2, c(2,7:9)] <- 1
985
+M[3, 5:6] <- 1
986
+M[3, 10] <- 1
987
+
988
+phi <- matrix(0, 5, 5)
989
+diag(phi) <- 1
990
+phi[1, 1:5] <- 1
991
+phi[2, 3] <- phi[4, 5] <- 1
992
+
993
+## M <- t(phi)%*%M; M[M > 1] <- 1
994
+
995
+rownames(M) <- paste0("M", 1:5)
996
+
997
+pdf("temp.pdf", width = 8, height = 4)
998
+epiNEM::HeatmapOP(M, Colv = 0, Rowv = 0, col = "RdBu", colorkey = NULL)
999
+dev.off()
1000
+
1001
+## check for mutation type (das f├╝hrt zu nichts)
1002
+
1003
+colnames(Mtype) <- unlist(lapply(strsplit(colnames(Mtype), "-"), function(x) {
1004
+    y <- x[1:3]
1005
+    y <- paste(c(y, "01"), collapse = "-")
1006
+    return(y)
1007
+    }))
1008
+
1009
+checkgene <- "CBFB"
1010
+
1011
+van <- intersect(colnames(Gamma)[which(PM[checkgene, ] < 0)], colnames(Mtype))
1012
+
1013
+A <- sum(unlist(lapply(strsplit(Mtype[checkgene, van], " "), function(x) if ("Silent" %in% x) { return(1) } else { return(0) })))
1014
+B <- length(van) - A
1015
+
1016
+van <- intersect(colnames(Gamma)[which(PM[checkgene, ] > 0)], colnames(Mtype))
1017
+
1018
+C <- sum(unlist(lapply(strsplit(Mtype[checkgene, van], " "), function(x) if ("Silent" %in% x) { return(1) } else { return(0) })))
1019
+D <- length(van) - C
1020
+
1021
+table(unlist(lapply(strsplit(Mtype[checkgene, van], " "), function(x) return(names(table(x))[which.max(table(x))]))))
1022
+
1023
+## table(unlist(strsplit(as.vector(Mtype), " ")))
1024
+
1025
+## pdf("Fig10.pdf", width = 10, height = 10)
1026
+## p1 <- epiNEM::HeatmapOP(tmp4, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
1027
+## p2 <- epiNEM::HeatmapOP(tmp3, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
1028
+## p3 <- epiNEM::HeatmapOP(tmp2, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
1029
+## p4 <- epiNEM::HeatmapOP(tmp, bordercol = rgb(0,0,0,0), col = "RdBu", clusterx = tmp2, colorkey = NULL)
1030
+## print(p1, position=c(0, .5, .5, 1), more=TRUE)
1031
+## print(p2, position=c(0, 0, .5, .5), more=TRUE)
1032
+## print(p3, position=c(.5, .5, 1, 1), more=TRUE)
1033
+## print(p4, position=c(.5, 0, 1, .5))
1034
+## sum(tmp == 1 & tmp2 == 1)/sum(tmp == 1)
1035
+## dev.off()
1036
+
1037
+source("~/Documents/testing/R/nempi.r")
1038
+source("~/Documents/testing/R/nempi_low.r")
1039
+
1040
+bsres <- unembs(D2, Gamma = Gamma, complete = 1, full = TRUE, converged = converged, combi = combi, bsruns = 10, bssize = 0.5)
1041
+
1042
+pdf("temp.pdf", width = 10, height = 5)
1043
+epiNEM::HeatmapOP(bsres$Gamma, bordercol = rgb(0,0,0,0), col = "RdBu", colorkey = NULL)
1044
+dev.off()
1045
+
1046
+pdf("temp.pdf", width = 20, height = 10)
1047
+par(mfrow=c(1,2))
1048
+tmp <- bsres$phi
1049
+tmp[which(tmp > 0)] <- 1
1050
+diag(tmp) <- 0
1051
+freqtop <- as.vector(t(bsres$phi)[which(lower.tri(bsres$phi) == TRUE)])/10
1052
+freqtop <- freqtop[which(freqtop != 0)]
1053
+tmptop <- tmp
1054
+tmptop[lower.tri(tmptop)] <- 0
1055
+tmptop <- adj2dnf(tmptop)
1056
+tmptop <- tmptop[grep("=", tmptop)]
1057
+plotDnf(tmptop, edgelwd = 2, edgelab = freqtop, freq = freqtop)
1058
+freqbot <- as.vector(t(bsres$phi)[which(upper.tri(bsres$phi) == TRUE)])/10
1059
+freqbot <- freqbot[which(freqbot != 0)]
1060
+tmpbot <- tmp
1061
+tmpbot[upper.tri(tmpbot)] <- 0
1062
+tmpbot <- adj2dnf(tmpbot)
1063
+tmpbot <- tmpbot[grep("=", tmpbot)]
1064
+plotDnf(tmpbot, edgelwd = 2, edgelab = freqbot, freq = freqbot)
1065
+dev.off()
1066
+
1067
+##
1068
+
1069
+source("testing/vignettes/TCGA_cluster.r")
1070
+
1071
+pcares <- prcomp(tmp)
1072
+
1073
+library(naturalsort)
1074
+library(nem)
1075
+library(cluster)
1076
+library(Rcpp)
1077
+
1078
+source("~/Documents/mnem/R/mnems.r")
1079
+source("~/Documents/mnem/R/mnems_low.r")
1080
+sourceCpp("~/Documents/mnem/src/mm.cpp")
1081
+
1082
+res <- mnem(tmp, starts = 10, search = "greedy", type = "cluster3", complete = 1, multi = 1)
1083
+
1084
+tmp2 <- tmp
1085
+Rprof("temp.txt", line.profiling=TRUE)
1086
+res <- mnem(tmp2, starts = 10, search = "greedy", type = "cluster3", complete = 1, multi = 1, k = 2)
1087
+Rprof(NULL)
1088
+summaryRprof("temp.txt", lines = "show")$sampling.time
1089
+head(summaryRprof("temp.txt", lines = "show")$by.self)
1090
+
1091
+## resk <- mnemk(tmp2, starts = 10, search = "estimate", type = "cluster3", complete = 1, multi = 1)
1092
+
1093
+cluster <- apply(getAffinity(res$probs, mw = res$mw, complete = TRUE), 2, which.max)
1094
+
1095
+par(mfrow=c(1,2))
1096
+plot(pcares$rotation[, 1:2], col = cluster)
1097
+
1098
+names(cluster) <- gsub("-01$|-03$", "", colnames(M))
1099
+
1100
+cluster <- cluster[which(names(cluster) %in% clinical$submitter_id)]
1101
+
1102
+cluster <- cluster[order(names(cluster))]
1103
+
1104
+clinical <- rbind(clinical, clinical[which(clinical[, 1] %in% names(cluster)[which(duplicated(names(cluster)))]), ])
1105
+
1106
+clinical <- clinical[order(clinical[, 1]), ]
1107
+
1108
+print(all(clinical[, 1] == names(cluster)))
1109
+
1110
+fit <- survfit(Surv(days_to_death, vital_status) ~ cluster, clinical)
1111
+plot(fit, col = 1:length(table(cluster)), lty = 1:length(table(cluster)))
1112
+legend(max(clinical$days_to_death, na.rm = TRUE), 1, 1:length(table(cluster)), lty = 1:length(table(cluster)), col = 1:length(table(cluster)), xjust = 1, yjust = 1)
1113
+
1114
+fit <- coxph(Surv(days_to_death, vital_status) ~ cluster + age_at_diagnosis, clinical)
1115
+print(fit)
1116
+
1117
+fit <- coxph(Surv(days_to_death, vital_status) ~ cluster + age_at_diagnosis + tumor_stage, clinical)
1118
+print(fit)
1119
+
1120
+fit <- coxph(Surv(days_to_death, vital_status) ~ cluster + tumor_stage, clinical)
1121
+print(fit)
1122
+
1123
+fit <- coxph(Surv(days_to_death, vital_status) ~ cluster, clinical)
1124
+print(fit)
1125
+
1126
+kres <- clustNEM(tmp, nem = 0, k = length(res$comp), nstart = 10)
1127
+
1128
+## kres <- clustNEM(tmp, nem = 0, nstart = 10)
1129
+
1130
+plot(pcares$rotation[, 1:2], col = kres$cluster)
1131
+
1132
+kcluster <- kres$cluster
1133
+
1134
+names(kcluster) <- gsub("-01$|-03$", "", colnames(M))
1135
+
1136
+kcluster <- kcluster[which(names(kcluster) %in% clinical$submitter_id)]
1137
+
1138
+kcluster <- kcluster[order(names(kcluster))]
1139
+
1140
+fit <- survfit(Surv(days_to_death, vital_status) ~ kcluster, clinical)
1141
+plot(fit, col = 1:length(table(cluster)), lty = 1:length(table(cluster)))
1142
+legend(max(clinical$days_to_death, na.rm = TRUE), 1, 1:length(table(cluster)), lty = 1:length(table(cluster)), col = 1:length(table(cluster)), xjust = 1, yjust = 1)
1143
+
1144
+fit <- coxph(Surv(days_to_death, vital_status) ~ kcluster + age_at_diagnosis, clinical)
1145
+print(fit)
1146
+
1147
+fit <- coxph(Surv(days_to_death, vital_status) ~ kcluster, clinical)
1148
+print(fit)
1149
+
1150
+
1151
+
1152
+
1153
+
1154
+
1155
+
1156
+
1157
+
1158
+
1159
+
1160
+
1161
+
1162
+
1163
+
1164
+
1165
+