... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
Package: GRaNIE |
2 | 2 |
Title: GRaNIE: Reconstruction cell type specific gene regulatory networks including enhancers using chromatin accessibility and RNA-seq data |
3 |
-Version: 0.99.3 |
|
3 |
+Version: 0.99.4 |
|
4 | 4 |
Encoding: UTF-8 |
5 | 5 |
Authors@R: c(person("Christian", "Arnold", email = |
6 | 6 |
"chrarnold@web.de", role = c("cre","aut")), |
... | ... |
@@ -370,23 +370,51 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
370 | 370 |
} |
371 | 371 |
|
372 | 372 |
|
373 |
-.loadPrecompiledAnnotationData <- function(genomeAssembly) { |
|
374 |
- |
|
375 |
- futile.logger::flog.info(paste0("Reading pre-compiled genome annotation data ")) |
|
376 |
- |
|
377 |
- if (is.null(geneAnnotation[[genomeAssembly]])) { |
|
373 |
+.retrieveAnnotationData <- function(genomeAssembly) { |
|
378 | 374 |
|
379 |
- message = "The genome version you specified is not contained in the pre-compiled genome annotation list. Currently, only hg19, hg38 and mm10 are supported. Contact us." |
|
380 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
375 |
+ futile.logger::flog.info(paste0("Retrieving genome annotation data from biomaRt... This may take a while")) |
|
376 |
+ |
|
377 |
+ params.l = .getBiomartParameters(genomeAssembly) |
|
378 |
+ |
|
379 |
+ columnsToRetrieve = c("chromosome_name", "start_position", "end_position", |
|
380 |
+ "strand", "ensembl_gene_id", "gene_biotype", "external_gene_name") |
|
381 |
+ |
|
382 |
+ geneAnnotation <- NULL |
|
383 |
+ attempt <- 0 |
|
384 |
+ mirrors = c('www', 'uswest', 'useast', 'asia') |
|
385 |
+ while(!is.data.frame(geneAnnotation) && attempt <= 3 ) { |
|
386 |
+ attempt <- attempt + 1 |
|
387 |
+ geneAnnotation = tryCatch({ |
|
388 |
+ ensembl = biomaRt::useEnsembl(biomart = "genes", host = params.l[["host"]], dataset = params.l[["dataset"]], mirror = mirrors[attempt]) |
|
389 |
+ biomaRt::getBM(attributes = columnsToRetrieve, mart = ensembl) |
|
390 |
+ |
|
391 |
+ } |
|
392 |
+ ) |
|
393 |
+ } |
|
394 |
+ |
|
395 |
+ |
|
396 |
+ if (!is.data.frame(geneAnnotation)) { |
|
397 |
+ |
|
398 |
+ error_Biomart = "A temporary error occured with biomaRt::getBM or biomaRt::useEnsembl. This is often caused by an unresponsive Ensembl site. Try again at a later time. Note that this error is not caused by GRaNIE but external services." |
|
399 |
+ .checkAndLogWarningsAndErrors(NULL, error_Biomart, isWarning = FALSE) |
|
400 |
+ return(NULL) |
|
401 |
+ |
|
402 |
+ } |
|
403 |
+ |
|
404 |
+ geneAnnotation %>% |
|
405 |
+ as_tibble() %>% |
|
406 |
+ dplyr::filter(stringr::str_length(chromosome_name) <= 5) %>% |
|
407 |
+ dplyr::mutate(chromosome_name = paste0("chr", chromosome_name)) %>% |
|
408 |
+ dplyr::rename(gene.chr = chromosome_name, gene.start = start_position, gene.end = end_position, |
|
409 |
+ gene.strand = strand, gene.ENSEMBL = ensembl_gene_id, gene.type = gene_biotype, gene.name = external_gene_name) %>% |
|
410 |
+ dplyr::mutate_if(is.character, as.factor) %>% |
|
411 |
+ dplyr::mutate(gene.strand = factor(gene.strand, levels = c(1,-1), labels = c("+", "-"))) |
|
381 | 412 |
|
382 |
- } |
|
383 |
- |
|
384 |
- geneAnnotation[[genomeAssembly]] |
|
385 | 413 |
} |
386 | 414 |
|
387 | 415 |
.getAllGeneTypesAndFrequencies <- function(genomeAssembly, verbose = TRUE) { |
388 | 416 |
|
389 |
- geneAnnotation.df = .loadPrecompiledAnnotationData(genomeAssembly) |
|
417 |
+ geneAnnotation.df = .retrieveAnnotationData(genomeAssembly) |
|
390 | 418 |
return(table(geneAnnotation.df$gene.type)) |
391 | 419 |
|
392 | 420 |
} |
... | ... |
@@ -399,7 +427,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
399 | 427 |
stop("Not yet implemented") |
400 | 428 |
} |
401 | 429 |
|
402 |
- genes.filt.df = .loadPrecompiledAnnotationData(GRN@config$parameters$genomeAssembly) |
|
430 |
+ genes.filt.df = .retrieveAnnotationData(GRN@config$parameters$genomeAssembly) |
|
403 | 431 |
|
404 | 432 |
if (!is.null(gene.types)) { |
405 | 433 |
if (! "all" %in% gene.types) { |
... | ... |
@@ -511,9 +539,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
511 | 539 |
rowMedians_rna = matrixStats::rowMedians(countsRNA.m) |
512 | 540 |
CV_rna = matrixStats::rowSds(countsRNA.m) / rowMeans_rna |
513 | 541 |
|
514 |
- # This object is internally loaded and lives inside the R directory within sysdata.rda |
|
515 |
- # load("R/sysdata.rda") |
|
516 |
- genomeAnnotation.df = geneAnnotation[[GRN@config$parameters$genomeAssembly]] |
|
542 |
+ genomeAnnotation.df = .retrieveAnnotationData(GRN@config$parameters$genomeAssembly) |
|
517 | 543 |
|
518 | 544 |
metadata_rna = tibble::tibble(gene.ENSEMBL = countsRNA.clean$ENSEMBL, |
519 | 545 |
gene.mean = rowMeans_rna, |
... | ... |
@@ -2431,11 +2457,11 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2431 | 2457 |
end(subject.gr) = start(subject.gr) |
2432 | 2458 |
} |
2433 | 2459 |
|
2434 |
- overlapsAll = GenomicRanges::findOverlaps(query, subject.gr, |
|
2460 |
+ overlapsAll = suppressWarnings(GenomicRanges::findOverlaps(query, subject.gr, |
|
2435 | 2461 |
minoverlap=1, |
2436 | 2462 |
type="any", |
2437 | 2463 |
select="all", |
2438 |
- ignore.strand=TRUE) |
|
2464 |
+ ignore.strand=TRUE)) |
|
2439 | 2465 |
|
2440 | 2466 |
query_row_ids = S4Vectors::queryHits(overlapsAll) |
2441 | 2467 |
subject_rowids = S4Vectors::subjectHits(overlapsAll) |
... | ... |
@@ -2596,7 +2622,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2596 | 2622 |
# if a peak overlaps with a gene, should the same gene be reported as the connection? |
2597 | 2623 |
|
2598 | 2624 |
# OVERLAP OF PEAKS AND EXTENDED GENES |
2599 |
- overlaps.sub.df = .calculatePeakGeneOverlaps(GRN, consensusPeaks, peak.TADs.df, neighborhoodSize = neighborhoodSize, |
|
2625 |
+ overlaps.sub.df = .calculatePeakGeneOverlaps(GRN, allPeaks = consensusPeaks, peak.TADs.df, neighborhoodSize = neighborhoodSize, |
|
2600 | 2626 |
genomeAssembly = genomeAssembly, gene.types = gene.types, overlapTypeGene = overlapTypeGene) |
2601 | 2627 |
|
2602 | 2628 |
|
... | ... |
@@ -4388,3 +4414,110 @@ changeOutputDirectory <- function(GRN, outputDirectory = ".") { |
4388 | 4414 |
|
4389 | 4415 |
} |
4390 | 4416 |
|
4417 |
+ |
|
4418 |
+.createTables_peakGeneQC <- function(peakGeneCorrelations.all.cur, networkType_details, colors_vec, range) { |
|
4419 |
+ |
|
4420 |
+ d = peakGeneCorrelations.all.cur %>% |
|
4421 |
+ dplyr::group_by(r_positive, class, peak_gene.p.raw.class) %>% |
|
4422 |
+ dplyr::summarise(n = dplyr::n()) %>% |
|
4423 |
+ dplyr::ungroup() |
|
4424 |
+ |
|
4425 |
+ # Some classes might be missing, add them here with explicit zeros |
|
4426 |
+ for (r_pos in c(TRUE, FALSE)) { |
|
4427 |
+ for (classCur in networkType_details){ |
|
4428 |
+ for (pclassCur in levels(peakGeneCorrelations.all.cur$peak_gene.p.raw.class)) { |
|
4429 |
+ |
|
4430 |
+ row = which(d$r_positive == r_pos & d$class == classCur & as.character(d$peak_gene.p.raw.class) == as.character(pclassCur)) |
|
4431 |
+ if (length(row) == 0) { |
|
4432 |
+ d = tibble::add_row(d, r_positive = r_pos, class = classCur, peak_gene.p.raw.class = pclassCur, n = 0) |
|
4433 |
+ } |
|
4434 |
+ } |
|
4435 |
+ } |
|
4436 |
+ } |
|
4437 |
+ |
|
4438 |
+ # Restore the "ordered" factor for class |
|
4439 |
+ d$peak_gene.p.raw.class = factor(d$peak_gene.p.raw.class, ordered = TRUE, levels = levels(peakGeneCorrelations.all.cur$peak_gene.p.raw.class)) |
|
4440 |
+ |
|
4441 |
+ |
|
4442 |
+ # Normalization factors |
|
4443 |
+ dsum = d %>% |
|
4444 |
+ dplyr::group_by(r_positive, class) %>% |
|
4445 |
+ dplyr::summarise(sum_n = sum(.data$n)) |
|
4446 |
+ |
|
4447 |
+ |
|
4448 |
+ # Summarize per bin |
|
4449 |
+ d2 = d %>% |
|
4450 |
+ dplyr::group_by(class, peak_gene.p.raw.class)%>% |
|
4451 |
+ dplyr::summarise(sum_pos = .data$n[r_positive], |
|
4452 |
+ sum_neg = .data$n[!r_positive]) %>% |
|
4453 |
+ dplyr::mutate(ratio_pos_raw = sum_pos / sum_neg, |
|
4454 |
+ enrichment_pos = sum_pos / (sum_pos + sum_neg), |
|
4455 |
+ ratio_neg = 1 - enrichment_pos) %>% |
|
4456 |
+ dplyr::ungroup() |
|
4457 |
+ |
|
4458 |
+ # Compare between real and permuted |
|
4459 |
+ normFactor_real = dplyr::filter(dsum, class == !! (networkType_details[1])) %>% dplyr::pull(sum_n) %>% sum() / |
|
4460 |
+ dplyr::filter(dsum, class == !! (networkType_details[2])) %>% dplyr::pull(sum_n) %>% sum() |
|
4461 |
+ |
|
4462 |
+ # ratio_norm not used currently, no normalization necessary here or not even useful because we dont want to normalize the r_pos and r_neg ratios: These are signal in a way. Only when comparing between real and permuted, we have to account for sample size for corrections |
|
4463 |
+ d3 = d %>% |
|
4464 |
+ dplyr::group_by(peak_gene.p.raw.class, r_positive) %>% |
|
4465 |
+ dplyr::summarise(n_real = .data$n[class == !! (names(colors_vec)[1]) ], |
|
4466 |
+ n_permuted = .data$n[class == !! (names(colors_vec)[2]) ]) %>% |
|
4467 |
+ dplyr::ungroup() %>% |
|
4468 |
+ dplyr::mutate(ratio_real_raw = n_real / n_permuted, |
|
4469 |
+ ratio_real_norm = ratio_real_raw / normFactor_real, |
|
4470 |
+ enrichment_real = n_real / (n_real + n_permuted), |
|
4471 |
+ enrichment_real_norm = (n_real / normFactor_real) / ((n_real / normFactor_real) + n_permuted)) |
|
4472 |
+ |
|
4473 |
+ |
|
4474 |
+ stopifnot(identical(levels(d2$peak_gene.p.raw.class), levels(d3$peak_gene.p.raw.class))) |
|
4475 |
+ # 2 enrichment bar plots but combined using facet_wrap |
|
4476 |
+ d2$set = "r+ / r-"; d3$set = "real / permuted" |
|
4477 |
+ d_merged <- tibble::tibble(peak_gene.p.raw.class = c(as.character(d2$peak_gene.p.raw.class), |
|
4478 |
+ as.character(d3$peak_gene.p.raw.class)), |
|
4479 |
+ ratio = c(d2$ratio_pos_raw, d3$ratio_real_norm), |
|
4480 |
+ classAll = c(as.character(d2$class), d3$r_positive), |
|
4481 |
+ set = c(d2$set, d3$set)) %>% |
|
4482 |
+ dplyr::mutate(classAll = factor(classAll, levels=c(paste0("real_",range), paste0("random_",range), "TRUE", "FALSE")), |
|
4483 |
+ peak_gene.p.raw.class = factor(peak_gene.p.raw.class, levels = levels(d2$peak_gene.p.raw.class))) |
|
4484 |
+ |
|
4485 |
+ d4 = tibble::tibble(peak_gene.p.raw.class = unique(d$peak_gene.p.raw.class), |
|
4486 |
+ n_rpos_real = NA_integer_, n_rpos_random = NA_integer_, |
|
4487 |
+ n_rneg_real = NA_integer_, n_rneg_random = NA_integer_, |
|
4488 |
+ ratio_permuted_real_rpos_norm = NA_real_, |
|
4489 |
+ ratio_permuted_real_rneg_norm = NA_real_) |
|
4490 |
+ |
|
4491 |
+ for (i in seq_len(nrow(d4))) { |
|
4492 |
+ row_d2 = which(d2$class == networkType_details[1] & d2$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
4493 |
+ stopifnot(length(row_d2) == 1) |
|
4494 |
+ d4[i, "n_rpos_real"] = d2[row_d2, "sum_pos"] %>% unlist() |
|
4495 |
+ d4[i, "n_rneg_real"] = d2[row_d2, "sum_neg"] %>% unlist() |
|
4496 |
+ row_d2 = which(d2$class == paste0("random_",range) & d2$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
4497 |
+ d4[i, "n_rpos_random"] = d2[row_d2, "sum_pos"] %>% unlist() |
|
4498 |
+ d4[i, "n_rneg_random"] = d2[row_d2, "sum_neg"] %>% unlist() |
|
4499 |
+ |
|
4500 |
+ row_d3 = which(d3$r_positive == TRUE & d3$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
4501 |
+ d4[i, "ratio_permuted_real_rpos_norm"] = 1- d3[row_d3, "ratio_real_norm"] %>% unlist() |
|
4502 |
+ row_d3 = which(d3$r_positive == FALSE & d3$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
4503 |
+ d4[i, "ratio_permuted_real_rneg_norm"] = 1- d3[row_d3, "ratio_real_norm"] %>% unlist() |
|
4504 |
+ } |
|
4505 |
+ |
|
4506 |
+ d4 = d4 %>% |
|
4507 |
+ dplyr::mutate(ratio_rneg_rpos_real = n_rneg_real / (n_rneg_real + n_rpos_real), |
|
4508 |
+ ratio_rneg_rpos_random = n_rneg_random / (n_rneg_random + n_rpos_random), |
|
4509 |
+ peak_gene.p.raw.class.bin = as.numeric(peak_gene.p.raw.class)) %>% |
|
4510 |
+ dplyr::arrange(peak_gene.p.raw.class.bin) |
|
4511 |
+ |
|
4512 |
+ d4_melt = reshape2::melt(d4, id = c("peak_gene.p.raw.class.bin", "peak_gene.p.raw.class")) %>% |
|
4513 |
+ dplyr::filter(grepl("ratio", variable)) |
|
4514 |
+ |
|
4515 |
+ |
|
4516 |
+ list(d = d, d2 = d2, d3 = d3, d4 = d4, d4_melt = d4_melt, d_merged = d_merged) |
|
4517 |
+ |
|
4518 |
+} |
|
4519 |
+ |
|
4520 |
+.classFreq_label <- function(tbl_freq) { |
|
4521 |
+ paste0(names(tbl_freq), " (", tbl_freq, ")") |
|
4522 |
+} |
|
4523 |
+ |
... | ... |
@@ -58,7 +58,7 @@ |
58 | 58 |
if (!checkmate::testNull(pdfFile)) |
59 | 59 |
grDevices::dev.off() |
60 | 60 |
|
61 |
- futile.logger::flog.info(paste0("Finished writing plots to file ", pdfFile)) |
|
61 |
+ futile.logger::flog.info(paste0(" Finished writing plots")) |
|
62 | 62 |
} |
63 | 63 |
|
64 | 64 |
.startLogger <- function(logfile, level, removeOldLog = TRUE, appenderName = "consoleAndFile", verbose = TRUE) { |
... | ... |
@@ -719,3 +719,23 @@ is.installed <- function(mypkg){ |
719 | 719 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
720 | 720 |
} |
721 | 721 |
} |
722 |
+ |
|
723 |
+ |
|
724 |
+.getBiomartParameters <- function(genomeAssembly) { |
|
725 |
+ |
|
726 |
+ .checkOntologyPackageInstallation("biomaRt") |
|
727 |
+ |
|
728 |
+ if (grepl(x = genomeAssembly, pattern = "^hg\\d\\d")){ |
|
729 |
+ dataset = "hsapiens_gene_ensembl" |
|
730 |
+ if (genomeAssembly == "hg38") { |
|
731 |
+ host = "https://www.ensembl.org" |
|
732 |
+ } else if (genomeAssembly == "hg19") { |
|
733 |
+ host ="https://grch37.ensembl.org" |
|
734 |
+ } |
|
735 |
+ } else if (grep(x = genomeAssembly, pattern = "^mm\\d\\d")){ |
|
736 |
+ dataset = "mmusculus_gene_ensembl" |
|
737 |
+ host = "https://www.ensembl.org" |
|
738 |
+ } |
|
739 |
+ |
|
740 |
+ list(dataset = dataset, host = host) |
|
741 |
+} |
|
722 | 742 |
\ No newline at end of file |
... | ... |
@@ -483,17 +483,13 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
483 | 483 |
|
484 | 484 |
.checkOntologyPackageInstallation("biomaRt") |
485 | 485 |
|
486 |
- if (grep(x = GRN@config$parameters$genomeAssembly, pattern = "^hg\\d\\d" )){ |
|
487 |
- dataset = "hsapiens_gene_ensembl" |
|
488 |
- } else if (grep(x = GRN@config$parameters$genomeAssembly, pattern = "^mm\\d\\d")){ |
|
489 |
- dataset = "mmusculus_gene_ensembl" |
|
490 |
- } |
|
486 |
+ params.l = .getBiomartParameters(GRN@config$parameters$genomeAssembly) |
|
491 | 487 |
|
492 | 488 |
errorOcured = FALSE |
493 | 489 |
|
494 | 490 |
ensembl = tryCatch({ |
495 |
- biomaRt::useEnsembl(biomart = "genes", dataset = dataset) |
|
496 |
- |
|
491 |
+ biomaRt::useEnsembl(biomart = "genes", host = params.l[["host"]], dataset = params.l[["dataset"]]) |
|
492 |
+ |
|
497 | 493 |
}, error = function(e) { |
498 | 494 |
errorOcured = TRUE |
499 | 495 |
} |
... | ... |
@@ -1019,7 +1015,7 @@ getTopNodes <- function(GRN, nodeType, rankType, n = 0.1, use_TF_gene_network = |
1019 | 1015 |
#' |
1020 | 1016 |
#' @inheritParams calculateGeneralEnrichment |
1021 | 1017 |
#' @param rankType Character. Default \code{"degree"}. One of: \code{"degree"}, \code{"EV"}, \code{"custom"}. This parameter will determine the criterion to be used to identify the "top" TFs. If set to "degree", the function will select top TFs based on the number of connections to genes they have, i.e. based on their degree-centrality. If set to \code{"EV"} it will select the top TFs based on their eigenvector-centrality score in the network. If set to custom, a set of TF names will have to be passed to the "TF.names" parameter. |
1022 |
-#' @param n Numeric. Default 0.1. If this parameter is passed as a value between (0,1), it is treated as a percentage of top nodes. If the value is passed as an integer it will be treated as the number of top nodes. This parameter is not relevant if \code{rankType = "custom"}. |
|
1018 |
+#' @param n Numeric. Default 3. If this parameter is passed as a value between 0 and 1, it is treated as a percentage of top nodes. If the value is passed as an integer it will be treated as the number of top nodes. This parameter is not relevant if \code{rankType = "custom"}. |
|
1023 | 1019 |
#' @param TF.names Character vector. Default \code{NULL}. If the rank type is set to \code{"custom"}, a vector of TF names for which the GO enrichment should be calculated should be passed to this parameter. |
1024 | 1020 |
#' @return The same \code{\linkS4class{GRN}} object, with the enrichment results stored in the \code{stats$Enrichment$byTF} slot. |
1025 | 1021 |
#' @examples |
... | ... |
@@ -1027,7 +1023,7 @@ getTopNodes <- function(GRN, nodeType, rankType, n = 0.1, use_TF_gene_network = |
1027 | 1023 |
#' GRN = loadExampleObject() |
1028 | 1024 |
#' GRN = calculateTFEnrichment(GRN, n = 5, ontology = "GO_BP", forceRerun = FALSE) |
1029 | 1025 |
#' @export |
1030 |
-calculateTFEnrichment <- function(GRN, rankType = "degree", n = 0.1, TF.names = NULL, |
|
1026 |
+calculateTFEnrichment <- function(GRN, rankType = "degree", n = 3, TF.names = NULL, |
|
1031 | 1027 |
ontology = c("GO_BP", "GO_MF"), algorithm = "weight01", |
1032 | 1028 |
statistic = "fisher", background = "neighborhood", |
1033 | 1029 |
pAdjustMethod = "BH", |
... | ... |
@@ -6,13 +6,15 @@ |
6 | 6 |
#' |
7 | 7 |
#' @template GRN |
8 | 8 |
#' @template outputFolder |
9 |
-#' @param type Character. Either "peaks" or "rna" or "all". Type of data to plot a PCA for. "peaks" corresponds to the the open chromatin data, while "rna" refers to the RNA-seq counts. If set to "all", PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data). |
|
10 |
-#' @param topn Integer vector. Default c(500,1000,5000). Number of top variable features to do PCA for. Can be a vector of different numbers (see default). |
|
9 |
+#' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data). |
|
10 |
+#' @param topn Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default). |
|
11 |
+#' @param type Character. One of or a combination of \code{"raw"}, \code{"normalized"}, \code{"all"}. Default \code{c("raw", "normalized")}. Should the PCA plots be done based on the raw or normalized data, respectively? |
|
11 | 12 |
#' @template forceRerun |
12 | 13 |
#' @template basenameOutput |
13 | 14 |
#' @template plotAsPDF |
14 | 15 |
#' @template pdf_width |
15 | 16 |
#' @template pdf_height |
17 |
+#' @template pages |
|
16 | 18 |
#' @return The same \code{\linkS4class{GRN}} object, without modifications. In addition, for each specified \code{type}, a PDF file is produced with a PCA. We refer to the Vignettes for details and further explanations. |
17 | 19 |
#' @examples |
18 | 20 |
#' # See the Workflow vignette on the GRaNIE website for examples |
... | ... |
@@ -20,17 +22,19 @@ |
20 | 22 |
#' # GRN = plotPCA_all(GRN, topn = 500, type = "rna", plotAsPDF = FALSE) |
21 | 23 |
#' @export |
22 | 24 |
plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
23 |
- type = c("rna", "peaks"), topn = c(500,1000,5000), |
|
24 |
- plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, |
|
25 |
+ data = c("rna", "peaks"), topn = c(500,1000,5000), type = c("raw", "normalized"), |
|
26 |
+ plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, pages = NULL, |
|
25 | 27 |
forceRerun = FALSE) { |
26 | 28 |
|
27 | 29 |
GRN = .addFunctionLogToObject(GRN) |
28 | 30 |
|
29 | 31 |
checkmate::assertClass(GRN, "GRN") |
30 |
- checkmate::assertSubset(type , c("rna", "peaks", "all")) |
|
32 |
+ checkmate::assertSubset(data , c("rna", "peaks", "all")) |
|
33 |
+ checkmate::assertSubset(type , c("raw", "normalized", "all")) |
|
31 | 34 |
checkmate::assertFlag(plotAsPDF) |
32 | 35 |
checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
33 | 36 |
checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
37 |
+ checkmate::assert(checkmate::checkNull(pages), checkmate::checkIntegerish(pages, lower = 1)) |
|
34 | 38 |
checkmate::assertFlag(forceRerun) |
35 | 39 |
checkmate::assert(checkmate::checkNull(basenameOutput), checkmate::checkCharacter(basenameOutput, len = 1, min.chars = 1, any.missing = FALSE)) |
36 | 40 |
|
... | ... |
@@ -43,7 +47,10 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
43 | 47 |
|
44 | 48 |
uniqueSamples = GRN@config$sharedSamples |
45 | 49 |
|
46 |
- if ("rna" %in% type | "all" %in% type) { |
|
50 |
+ # Initialize the page counter |
|
51 |
+ pageCounter = 1 |
|
52 |
+ |
|
53 |
+ if ("rna" %in% data | "all" %in% data) { |
|
47 | 54 |
|
48 | 55 |
# Check if we have raw integer counts. Ust vst transform if, otherwise "regular log2 |
49 | 56 |
transformation = dplyr::if_else(checkmate::testClass(GRN@data$RNA$counts_orig, "DESeqDataSet"), "vst", "log2") |
... | ... |
@@ -55,6 +62,10 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
55 | 62 |
} |
56 | 63 |
|
57 | 64 |
for (norm in normVector) { |
65 |
+ |
|
66 |
+ if (!(norm %in% type | "all" %in% type)) { |
|
67 |
+ next |
|
68 |
+ } |
|
58 | 69 |
|
59 | 70 |
# Only do a transformation if this is raw counts, already normalized counts dont need another transformation |
60 | 71 |
transformationCur = dplyr::if_else(norm == "raw", transformation, "none") |
... | ... |
@@ -85,7 +96,7 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
85 | 96 |
" RNA data for all shared samples to file ", fileCur , |
86 | 97 |
"... This may take a few minutes")) |
87 | 98 |
|
88 |
- .plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, plotAsPDF = plotAsPDF) |
|
99 |
+ .plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, plotAsPDF = plotAsPDF) |
|
89 | 100 |
|
90 | 101 |
} else { |
91 | 102 |
futile.logger::flog.info(paste0("File ", fileCur, " already exists, not overwriting due to forceRerun = FALSE")) |
... | ... |
@@ -95,7 +106,7 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
95 | 106 |
|
96 | 107 |
} # end if plot RNA PCA |
97 | 108 |
|
98 |
- if ("peaks" %in% type | "all" %in% type) { |
|
109 |
+ if ("peaks" %in% data | "all" %in% data) { |
|
99 | 110 |
|
100 | 111 |
# Check if we have raw integer counts |
101 | 112 |
transformation = dplyr::if_else(checkmate::testClass(GRN@data$peaks$counts_orig, "DESeqDataSet"), "vst", "log2") |
... | ... |
@@ -131,7 +142,7 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
131 | 142 |
" peaks data for all shared samples to file ", fileCur , |
132 | 143 |
"... This may take a few minutes")) |
133 | 144 |
|
134 |
- .plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, plotAsPDF = plotAsPDF) |
|
145 |
+ .plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, plotAsPDF = plotAsPDF) |
|
135 | 146 |
|
136 | 147 |
} else { |
137 | 148 |
futile.logger::flog.info(paste0("File ", fileCur, " already exists, not overwriting due to forceRerun = FALSE")) |
... | ... |
@@ -149,7 +160,7 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
149 | 160 |
} |
150 | 161 |
|
151 | 162 |
|
152 |
-.plot_PCA_wrapper <- function(GRN, counts, transformation = "vst", scale = TRUE, logTransformDensity, file, topn, pdf_width, pdf_height, plotAsPDF) { |
|
163 |
+.plot_PCA_wrapper <- function(GRN, counts, transformation = "vst", scale = TRUE, logTransformDensity, file, topn, pdf_width, pdf_height, pages = NULL, plotAsPDF) { |
|
153 | 164 |
|
154 | 165 |
checkmate::assertChoice(transformation, c("vst", "log2", "none")) |
155 | 166 |
checkmate::assertIntegerish(topn, lower = 50, any.missing = FALSE,min.len = 1) |
... | ... |
@@ -175,7 +186,7 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
175 | 186 |
|
176 | 187 |
futile.logger::flog.info(paste0("Prepare PCA. Count transformation: ", transformation)) |
177 | 188 |
|
178 |
- |
|
189 |
+ pageCounter = 1 |
|
179 | 190 |
if (plotAsPDF) { |
180 | 191 |
.checkOutputFile(file) |
181 | 192 |
grDevices::pdf(file, width = pdf_width, height = pdf_height) |
... | ... |
@@ -188,7 +199,11 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
188 | 199 |
rv <- matrixStats::rowVars(counts.transf) |
189 | 200 |
|
190 | 201 |
# 0. Density plot for ALL features |
191 |
- .plotDensity(counts.transf, logTransform = logTransformDensity, legend = dplyr::if_else(ncol(counts.transf) > 20, FALSE, TRUE)) |
|
202 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
203 |
+ .plotDensity(counts.transf, logTransform = logTransformDensity, legend = dplyr::if_else(ncol(counts.transf) > 20, FALSE, TRUE)) |
|
204 |
+ |
|
205 |
+ } |
|
206 |
+ pageCounter = pageCounter + 1 |
|
192 | 207 |
|
193 | 208 |
for (topn in topn) { |
194 | 209 |
|
... | ... |
@@ -209,95 +224,105 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
209 | 224 |
pos = seq_len(nPCS)) |
210 | 225 |
|
211 | 226 |
# 1. Scree plot |
212 |
- g = ggplot(screeplot.df, aes(PCs, variation)) + geom_bar(stat = "identity") + |
|
213 |
- xlab("Principal component (PC)") + ylab("Explained variation / Cumultative sum (in %)") + |
|
214 |
- geom_point(data = screeplot.df, aes(pos, variation_sum), color = "red") + |
|
215 |
- geom_line(data = screeplot.df, aes(pos, variation_sum), color = "red") + |
|
216 |
- ggtitle(paste0("Screeplot for top ", topn, " variable features")) + |
|
217 |
- theme_bw() |
|
218 |
- |
|
219 |
- plot(g) |
|
220 |
- |
|
227 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
228 |
+ |
|
229 |
+ g = ggplot(screeplot.df, aes(PCs, variation)) + geom_bar(stat = "identity") + |
|
230 |
+ xlab("Principal component (PC)") + ylab("Explained variation / Cumultative sum (in %)") + |
|
231 |
+ geom_point(data = screeplot.df, aes(pos, variation_sum), color = "red") + |
|
232 |
+ geom_line(data = screeplot.df, aes(pos, variation_sum), color = "red") + |
|
233 |
+ ggtitle(paste0("Screeplot for top ", topn, " variable features")) + |
|
234 |
+ theme_bw() |
|
235 |
+ |
|
236 |
+ plot(g) |
|
237 |
+ } |
|
238 |
+ pageCounter = pageCounter + 1 |
|
221 | 239 |
|
222 | 240 |
# 2. Metadata correlation with PCs |
223 |
- metadata_columns = .findSuitableColumnsToPlot(metadata, remove1LevelFactors = TRUE, verbose = FALSE) |
|
224 |
- .plotPCA_QC(pcaResult = pca, dataCols = colnames(counts.transf), metadataTable = metadata, metadataColumns = metadata_columns, |
|
225 |
- varn = topn, metadataColumns_fill = metadata_columns, |
|
226 |
- file = NULL, logTransform = FALSE) |
|
227 |
- |
|
241 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
242 |
+ metadata_columns = .findSuitableColumnsToPlot(metadata, remove1LevelFactors = TRUE, verbose = FALSE) |
|
243 |
+ .plotPCA_QC(pcaResult = pca, dataCols = colnames(counts.transf), metadataTable = metadata, metadataColumns = metadata_columns, |
|
244 |
+ varn = topn, metadataColumns_fill = metadata_columns, |
|
245 |
+ file = NULL, logTransform = FALSE) |
|
246 |
+ } |
|
247 |
+ pageCounter = pageCounter + 1 |
|
228 | 248 |
|
229 | 249 |
|
230 | 250 |
# 3. Correlation plots colored by metadata |
231 | 251 |
metadata_columns = .findSuitableColumnsToPlot(metadata, remove1LevelFactors = FALSE, verbose = FALSE) |
232 | 252 |
for (varCur in metadata_columns) { |
233 | 253 |
|
234 |
- plotTitle = paste0("Top ", topn, " variable features, colored by \n", varCur) |
|
235 |
- |
|
236 |
- skipLegend = FALSE |
|
237 |
- vecCur = metadata[, varCur] %>%unlist() |
|
238 |
- if (is.character(vecCur) & length(unique(vecCur)) > 30) { |
|
239 |
- skipLegend = TRUE |
|
240 |
- plotTitle = paste0(plotTitle, " (legend skipped)") |
|
241 |
- } |
|
242 |
- |
|
243 |
- result = tryCatch({ |
|
244 |
- |
|
245 |
- # The following code is taken from plotPCA from DESeq2 and adapted here to be more flexible |
|
246 |
- intgroup.df <- as.data.frame(metadata[, varCur, drop=FALSE]) |
|
247 |
- |
|
248 |
- # add the varCur factors together to create a new grouping factor |
|
249 |
- group = metadata[[varCur]] |
|
250 |
- |
|
251 |
- # assembly the data for the plot |
|
252 |
- d <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], group=group, intgroup.df, name=colnames(counts.transf)) |
|
253 |
- |
|
254 |
- d = d %>% |
|
255 |
- dplyr::mutate_if(is.character, as.factor) %>% |
|
256 |
- dplyr::mutate_if(is.logical, as.factor) |
|
257 |
- |
|
258 |
- |
|
259 |
- g = g = ggplot(data=d, aes(x=PC1, y=PC2, color=group)) + geom_point(size=3) + |
|
260 |
- xlab(paste0("PC1: ",round(percentVar[1] * 100, 1),"% variance")) + |
|
261 |
- ylab(paste0("PC2: ",round(percentVar[2] * 100, 1),"% variance")) + |
|
262 |
- ggtitle(plotTitle) + coord_fixed() |
|
263 |
- |
|
264 |
- |
|
265 |
- |
|
266 |
- if (is.factor(d$group)) { |
|
267 |
- g = g + scale_color_viridis_d() |
|
268 |
- |
|
269 |
- } else { |
|
270 |
- |
|
271 |
- is.date <- function(x) inherits(x, 'Date') |
|
272 |
- |
|
273 |
- if (!is.date(d$group)) { |
|
274 |
- g = g + scale_color_viridis_c() |
|
275 |
- } else { |
|
276 |
- g = g + scale_color_viridis_c(trans = "date") |
|
277 |
- } |
|
278 |
- |
|
279 |
- |
|
254 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
255 |
+ |
|
256 |
+ plotTitle = paste0("Top ", topn, " variable features, colored by \n", varCur) |
|
257 |
+ |
|
258 |
+ skipLegend = FALSE |
|
259 |
+ vecCur = metadata[, varCur] %>%unlist() |
|
260 |
+ if (is.character(vecCur) & length(unique(vecCur)) > 30) { |
|
261 |
+ skipLegend = TRUE |
|
262 |
+ plotTitle = paste0(plotTitle, " (legend skipped)") |
|
263 |
+ } |
|
264 |
+ |
|
265 |
+ result = tryCatch({ |
|
266 |
+ |
|
267 |
+ # The following code is taken from plotPCA from DESeq2 and adapted here to be more flexible |
|
268 |
+ intgroup.df <- as.data.frame(metadata[, varCur, drop=FALSE]) |
|
269 |
+ |
|
270 |
+ # add the varCur factors together to create a new grouping factor |
|
271 |
+ group = metadata[[varCur]] |
|
272 |
+ |
|
273 |
+ # assembly the data for the plot |
|
274 |
+ d <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], group=group, intgroup.df, name=colnames(counts.transf)) |
|
275 |
+ |
|
276 |
+ d = d %>% |
|
277 |
+ dplyr::mutate_if(is.character, as.factor) %>% |
|
278 |
+ dplyr::mutate_if(is.logical, as.factor) |
|
279 |
+ |
|
280 |
+ |
|
281 |
+ g = g = ggplot(data=d, aes(x=PC1, y=PC2, color=group)) + geom_point(size=3) + |
|
282 |
+ xlab(paste0("PC1: ",round(percentVar[1] * 100, 1),"% variance")) + |
|
283 |
+ ylab(paste0("PC2: ",round(percentVar[2] * 100, 1),"% variance")) + |
|
284 |
+ ggtitle(plotTitle) + coord_fixed() |
|
285 |
+ |
|
286 |
+ |
|
287 |
+ |
|
288 |
+ if (is.factor(d$group)) { |
|
289 |
+ g = g + scale_color_viridis_d() |
|
290 |
+ |
|
291 |
+ } else { |
|
292 |
+ |
|
293 |
+ is.date <- function(x) inherits(x, 'Date') |
|
294 |
+ |
|
295 |
+ if (!is.date(d$group)) { |
|
296 |
+ g = g + scale_color_viridis_c() |
|
297 |
+ } else { |
|
298 |
+ g = g + scale_color_viridis_c(trans = "date") |
|
299 |
+ } |
|
300 |
+ |
|
301 |
+ |
|
302 |
+ } |
|
303 |
+ |
|
304 |
+ |
|
305 |
+ if (skipLegend) { |
|
306 |
+ g = g + theme(legend.position = "none") |
|
307 |
+ } |
|
308 |
+ |
|
309 |
+ plot(g) |
|
310 |
+ |
|
311 |
+ }, error = function(e) { |
|
312 |
+ futile.logger::flog.warn(paste0(" Could not plot PCA with variable ", varCur)) |
|
313 |
+ plot(c(0, 1), c(0, 1), ann = FALSE, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n') |
|
314 |
+ message = paste0(" Could not plot PCA with variable ", varCur) |
|
315 |
+ text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red") |
|
316 |
+ }) |
|
280 | 317 |
} |
281 |
- |
|
282 |
- |
|
283 |
- if (skipLegend) { |
|
284 |
- g = g + theme(legend.position = "none") |
|
285 |
- } |
|
286 |
- |
|
287 |
- plot(g) |
|
288 |
- |
|
289 |
- }, error = function(e) { |
|
290 |
- futile.logger::flog.warn(paste0(" Could not plot PCA with variable ", varCur)) |
|
291 |
- plot(c(0, 1), c(0, 1), ann = FALSE, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n') |
|
292 |
- message = paste0(" Could not plot PCA with variable ", varCur) |
|
293 |
- text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red") |
|
294 |
- }) |
|
318 |
+ pageCounter = pageCounter + 1 |
|
295 | 319 |
|
296 | 320 |
|
297 |
- } |
|
321 |
+ } # end for each variable |
|
298 | 322 |
|
299 | 323 |
} |
300 | 324 |
|
325 |
+ .printWarningPageNumber(pages, pageCounter) |
|
301 | 326 |
if (plotAsPDF) grDevices::dev.off() |
302 | 327 |
|
303 | 328 |
} |
... | ... |
@@ -426,7 +451,7 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
426 | 451 |
cell_fun = function(j, i, x, y, width, height, fill) { |
427 | 452 |
grid::grid.text(sprintf("%.2f", r2_cv[i, j]), x, y, gp = grid::gpar(fontsize = 10)) |
428 | 453 |
} |
429 |
- ) |
|
454 |
+ ) %>% plot() |
|
430 | 455 |
|
431 | 456 |
|
432 | 457 |
|
... | ... |
@@ -450,8 +475,12 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
450 | 475 |
#' @template outputFolder |
451 | 476 |
#' @template basenameOutput |
452 | 477 |
#' @template plotDetails |
453 |
-#' @param plotPermuted \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Also produce the diagnostic plots for permuted data? |
|
478 |
+#' @param dataType Character vector. One of, or both of, \code{"real"} or \code{"permuted"}. For which data type, real or permuted data, to produce the diagnostic plots? |
|
454 | 479 |
#' @param nTFMax \code{NULL} or Integer. Default \code{NULL}. Maximum number of TFs to process. Can be used for testing purposes by setting this to a small number i(.e., 10) |
480 |
+#' @template plotAsPDF |
|
481 |
+#' @template pdf_width |
|
482 |
+#' @param pdf_height_base Number. Default 8. Base height of the PDF, in cm, per connection type. The total height is automatically determined based on the number of connection types that are found in the object (e.g., expression or TF activity). For example, when two connection types are found, the base height is multiplied by 2. |
|
483 |
+#' @template pages |
|
455 | 484 |
#' @template forceRerun |
456 | 485 |
#' @return The same \code{\linkS4class{GRN}} object, with added data from this function. |
457 | 486 |
#' @examples |
... | ... |
@@ -463,8 +492,9 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
463 | 492 |
outputFolder = NULL, |
464 | 493 |
basenameOutput = NULL, |
465 | 494 |
plotDetails = FALSE, |
466 |
- plotPermuted = TRUE, |
|
495 |
+ dataType = c("real", "permuted"), |
|
467 | 496 |
nTFMax = NULL, |
497 |
+ plotAsPDF = TRUE, pdf_width = 12, pdf_height_base = 8, pages = NULL, |
|
468 | 498 |
forceRerun = FALSE) { |
469 | 499 |
|
470 | 500 |
GRN = .addFunctionLogToObject(GRN) |
... | ... |
@@ -473,8 +503,12 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
473 | 503 |
checkmate::assert(checkmate::checkNull(outputFolder), checkmate::checkCharacter(outputFolder, min.chars = 1)) |
474 | 504 |
checkmate::assert(checkmate::checkNull(basenameOutput), checkmate::checkCharacter(basenameOutput, len = 1, min.chars = 1, any.missing = FALSE)) |
475 | 505 |
checkmate::assertFlag(plotDetails) |
476 |
- checkmate::assertFlag(plotPermuted) |
|
506 |
+ checkmate::assertSubset(dataType, c("real", "permuted")) |
|
477 | 507 |
checkmate::assert(checkmate::checkNull(nTFMax), checkmate::checkIntegerish(nTFMax)) |
508 |
+ checkmate::assertFlag(plotAsPDF) |
|
509 |
+ checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
|
510 |
+ checkmate::assertNumeric(pdf_height_base, lower = 5, upper = 99) |
|
511 |
+ checkmate::assert(checkmate::checkNull(pages), checkmate::checkIntegerish(pages, lower = 1)) |
|
478 | 512 |
checkmate::assertFlag(forceRerun) |
479 | 513 |
|
480 | 514 |
useGCCorrection = GRN@config$parameters$useGCCorrection |
... | ... |
@@ -482,29 +516,38 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
482 | 516 |
|
483 | 517 |
outputFolder = .checkOutputFolder(GRN, outputFolder) |
484 | 518 |
|
519 |
+ dataType2 = c() |
|
520 |
+ if ("real" %in% dataType) dataType2 = c(0) |
|
521 |
+ if ("permuted" %in% dataType) dataType2 = c(dataType2, 1) |
|
522 |
+ |
|
485 | 523 |
for (permutationCur in 0:.getMaxPermutation(GRN)) { |
486 | 524 |
|
487 |
- if (!plotPermuted & permutationCur != 0) { |
|
525 |
+ if (!permutationCur %in% dataType2) { |
|
488 | 526 |
next |
489 | 527 |
} |
490 | 528 |
|
491 | 529 |
suffixFile = .getPermutationSuffixStr(permutationCur) |
492 | 530 |
|
493 |
- fileCur = paste0(outputFolder, |
|
494 |
- dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_TFPeak_fdr"), basenameOutput), |
|
531 |
+ fileCur = paste0(outputFolder, dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_TFPeak_fdr"), basenameOutput), |
|
495 | 532 |
suffixFile, ".pdf") |
496 | 533 |
|
497 |
- if (!file.exists(fileCur) | forceRerun) { |
|
534 |
+ |
|
535 |
+ if (!file.exists(fileCur) | !plotAsPDF | forceRerun) { |
|
536 |
+ |
|
537 |
+ if (!plotAsPDF) fileCur = NULL |
|
498 | 538 |
|
499 |
- heightCur = 8* length(GRN@config$TF_peak_connectionTypes) |
|
500 |
- .plot_TF_peak_fdr(GRN, perm = permutationCur, useGCCorrection = useGCCorrection, |
|
501 |
- plotDetails = plotDetails, fileCur, width = 7, height = heightCur, |
|
502 |
- nPagesMax = nTFMax) |
|
539 |
+ heightCur = pdf_height_base * length(GRN@config$TF_peak_connectionTypes) |
|
540 |
+ .plot_TF_peak_fdr(GRN, perm = permutationCur, useGCCorrection = useGCCorrection, |
|
541 |
+ plotDetails = plotDetails, fileCur, width = pdf_width, height = heightCur, |
|
542 |
+ nPagesMax = nTFMax, pages = pages) |
|
503 | 543 |
} |
504 | 544 |
|
545 |
+ # TODO: page selection not implemented here yet |
|
505 | 546 |
fileCur = paste0(outputFolder, .getOutputFileName("plot_TFPeak_fdr_GC"), suffixFile) |
506 |
- if (useGCCorrection & (!file.exists(fileCur) | forceRerun)) { |
|
507 |
- .plotTF_peak_GC_diagnosticPlots(GRN, perm = permutationCur, fileCur, width = 7, height = 8) |
|
547 |
+ if (useGCCorrection & (!file.exists(fileCur) | !plotAsPDF | forceRerun)) { |
|
548 |
+ |
|
549 |
+ if (!plotAsPDF) fileCur = NULL |
|
550 |
+ .plotTF_peak_GC_diagnosticPlots(GRN, perm = permutationCur, fileCur, width = pdf_width, height = pdf_height_base) |
|
508 | 551 |
} |
509 | 552 |
|
510 | 553 |
# TODO: handle multiple activities and actually write this function |
... | ... |
@@ -519,7 +562,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
519 | 562 |
|
520 | 563 |
} |
521 | 564 |
|
522 |
- |
|
565 |
+# TODO: Currently not used anywhere |
|
523 | 566 |
.generateTF_GC_diagnosticPlots <- function(TFCur, GC_classes_foreground.df, GC_classes_background.df, GC_classes_all.df, peaksForeground, peaksBackground, peaksBackgroundGC) { |
524 | 567 |
|
525 | 568 |
GC_classes_background_GC.df = peaksBackgroundGC %>% |
... | ... |
@@ -627,15 +670,15 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
627 | 670 |
} |
628 | 671 |
|
629 | 672 |
|
630 |
-.plot_TF_peak_fdr <- function(GRN, perm, useGCCorrection, plotDetails = FALSE, file = NULL, width = 7, height = 7, nPagesMax = NULL) { |
|
673 |
+.plot_TF_peak_fdr <- function(GRN, perm, useGCCorrection, plotDetails = FALSE, file = NULL, width = 7, height = 7, nPagesMax = NULL, pages = NULL) { |
|
631 | 674 |
|
632 | 675 |
start = Sys.time() |
633 | 676 |
futile.logger::flog.info(paste0("Plotting FDR curves for each TF", dplyr::if_else(is.null(file), "", paste0(" to file ", file)))) |
634 | 677 |
|
635 |
- if (!is.null(file)) { |
|
636 |
- plots.l = list() |
|
637 |
- index = 1 |
|
638 |
- } |
|
678 |
+ pageCounter = 1 |
|
679 |
+ |
|
680 |
+ plots.l = list() |
|
681 |
+ index = 1 |
|
639 | 682 |
|
640 | 683 |
# Dont take all TF, some might be missing. |
641 | 684 |
connections_TF_peak = GRN@connections$TF_peaks[[as.character(perm)]]$connectionStats |
... | ... |
@@ -755,35 +798,42 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
755 | 798 |
|
756 | 799 |
} |
757 | 800 |
|
758 |
- |
|
759 |
- |
|
760 |
- if (is.null(file)) { |
|
761 |
- plot(g) |
|
762 |
- } else { |
|
763 |
- plots.l[[index]] = g |
|
764 |
- index = index + 1 |
|
765 |
- |
|
766 |
- if (plotDetails) { |
|
767 |
- plots.l[[index]] = g2 |
|
801 |
+ # We now created either just one plot g or g + g2 for "one" direction |
|
802 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
803 |
+ plots.l[[index]] = g |
|
768 | 804 |
index = index + 1 |
769 |
- } |
|
770 |
- |
|
771 | 805 |
} |
806 |
+ |
|
772 | 807 |
|
808 |
+ if (plotDetails) { |
|
809 |
+ |
|
810 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
811 |
+ plots.l[[index]] = g2 |
|
812 |
+ index = index + 1 |
|
813 |
+ } |
|
814 |
+ |
|
815 |
+ # Only here we already have a new page, as we produced 2 plots in this iteration alone |
|
816 |
+ pageCounter = pageCounter + 1 |
|
817 |
+ } |
|
818 |
+ |
|
773 | 819 |
|
820 |
+ } # end for both directions |
|
821 |
+ |
|
822 |
+ if (!plotDetails) { |
|
823 |
+ pageCounter = pageCounter + 1 |
|
774 | 824 |
} |
825 |
+ |
|
826 |
+ |
|
775 | 827 |
} # end allTF |
776 | 828 |
|
777 |
- if (!is.null(file)) { |
|
778 |
- futile.logger::flog.info(paste0(" Finished generating plots, start plotting to file ",file, ". This may take a few minutes.")) |
|
779 |
- .printMultipleGraphsPerPage(plots.l, nCol = 1, nRow = 2, pdfFile = file, height = height, width = width) |
|
780 |
- |
|
781 |
- } |
|
829 |
+ .printWarningPageNumber(pages, pageCounter) |
|
830 |
+ |
|
831 |
+ .printMultipleGraphsPerPage(plots.l, nCol = 1, nRow = 2, pdfFile = file, height = height, width = width) |
|
782 | 832 |
|
783 | 833 |
.printExecutionTime(start) |
784 | 834 |
} |
785 | 835 |
|
786 |
-.plotTF_peak_TFActivity_QC <- function(GRN, perm = permIndex, fileCur, width = 7, height = 8) { |
|
836 |
+.plotTF_peak_TFActivity_QC <- function(GRN, perm = 0, fileCur, width = 7, height = 8) { |
|
787 | 837 |
|
788 | 838 |
# TODO |
789 | 839 |
} |
... | ... |
@@ -796,13 +846,14 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
796 | 846 |
#' @template GRN |
797 | 847 |
#' @template outputFolder |
798 | 848 |
#' @template basenameOutput |
799 |
-#' @param gene.types List of character vectors. Default list(c("all"), c("protein_coding"), c("protein_coding", "lincRNA")). Vectors of gene types to consider for the diagnostic plots. Multiple distinct combinations of gene types can be specified. For example, if set to \code{list(c("protein_coding", "lincRNA"), c("protein_coding"), c("all"))}, 3 distinct PDFs will be produced, one for each element of the list. The first file would only consider protein-coding and lincRNA genes, while the second plot only considers protein-coding ones. The special keyword "all" denotes all gene types found (usually, there are many gene types present, also more exotic and rare ones). |
|
849 |
+#' @param gene.types List of character vectors. Default list(c("protein_coding", "lincRNA")). Vectors of gene types to consider for the diagnostic plots. Multiple distinct combinations of gene types can be specified. For example, if set to \code{list(c("protein_coding", "lincRNA"), c("protein_coding"), c("all"))}, 3 distinct PDFs will be produced, one for each element of the list. The first file would only consider protein-coding and lincRNA genes, while the second plot only considers protein-coding ones. The special keyword "all" denotes all gene types found (usually, there are many gene types present, also more exotic and rare ones). |
|
800 | 850 |
#' @param useFiltered Logical. TRUE or FALSE. Default FALSE. If set to \code{FALSE}, the diagnostic plots will be produced based on all peak-gene connections. This is the default and will usually be best to judge whether the background behaves as expected. If set to TRUE, the diagnostic plots will be produced based on the filtered set of connections. For this, the function \code{link{filterGRNAndConnectGenes}} must have been run before. |
801 | 851 |
#' @template plotDetails |
802 | 852 |
#' @param plotPerTF Logical. TRUE or FALSE. Default \code{FALSE}. If set to \code{FALSE}, the diagnostic plots will be done across all TF (the default), while setting it to \code{TRUE} will generate the QC plots TF-specifically, including "all" TF, sorted by the number of connections. |
803 | 853 |
#' @template plotAsPDF |
804 | 854 |
#' @template pdf_width |
805 | 855 |
#' @template pdf_height |
856 |
+#' @template pages |
|
806 | 857 |
#' @template forceRerun |
807 | 858 |
#' @return The same \code{\linkS4class{GRN}} object, with added data from this function. |
808 | 859 |
#' @examples |
... | ... |
@@ -815,11 +866,11 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
815 | 866 |
plotDiagnosticPlots_peakGene <- function(GRN, |
816 | 867 |
outputFolder = NULL, |
817 | 868 |
basenameOutput = NULL, |
818 |
- gene.types = list(c("all"), c("protein_coding"), c("protein_coding", "lincRNA")), |
|
869 |
+ gene.types = list(c("protein_coding", "lincRNA")), |
|
819 | 870 |
useFiltered = FALSE, |
820 | 871 |
plotDetails = FALSE, |
821 | 872 |
plotPerTF = FALSE, |
822 |
- plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, |
|
873 |
+ plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, pages = NULL, |
|
823 | 874 |
forceRerun= FALSE) { |
824 | 875 |
|
825 | 876 |
GRN = .addFunctionLogToObject(GRN) |
... | ... |
@@ -834,6 +885,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
834 | 885 |
checkmate::assertFlag(plotAsPDF) |
835 | 886 |
checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
836 | 887 |
checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
888 |
+ checkmate::assert(checkmate::checkNull(pages), checkmate::checkIntegerish(pages, lower = 1)) |
|
837 | 889 |
checkmate::assertFlag(forceRerun) |
838 | 890 |
|
839 | 891 |
|
... | ... |
@@ -864,20 +916,9 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
864 | 916 |
plotPerTF = plotPerTF, |
865 | 917 |
pdf_width = pdf_width, pdf_height = pdf_height, |
866 | 918 |
plotDetails = plotDetails, |
919 |
+ pages = pages, |
|
867 | 920 |
forceRerun = forceRerun) |
868 | 921 |
|
869 |
- # Summarize all data, both real and random |
|
870 |
- # filteredStr = dplyr::if_else(useFiltered, "_filtered", "") |
|
871 |
- # for (geneTypesSelected in gene.types) { |
|
872 |
- # |
|
873 |
- # filenameCur = paste0(outputFolder, .getOutputFileName("plot_peakGene_diag"), "_", paste0(geneTypesSelected, collapse = "+"), filteredStr, ".pdf") |
|
874 |
- # if (!file.exists(filenameCur) | forceRerun) { |
|
875 |
- # |
|
876 |
- # } else { |
|
877 |
- # futile.logger::flog.info(paste0("File ", filenameCur, " already exists. As forceRerun = FALSE, not overwriting it.")) |
|
878 |
- # } |
|
879 |
- # |
|
880 |
- # } |
|
881 | 922 |
|
882 | 923 |
GRN |
883 | 924 |
|
... | ... |
@@ -891,6 +932,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
891 | 932 |
fileBase = NULL, |
892 | 933 |
pdf_width = 12, |
893 | 934 |
pdf_height = 12, |
935 |
+ pages = NULL, |
|
894 | 936 |
forceRerun = FALSE) { |
895 | 937 |
|
896 | 938 |
start = Sys.time() |
... | ... |
@@ -1005,8 +1047,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1005 | 1047 |
freq_class = gsub(freq_class, pattern = "random", replacement = "permuted") |
1006 | 1048 |
names(freq_class) <- names(freqs) |
1007 | 1049 |
|
1008 |
- |
|
1009 |
- |
|
1050 |
+ |
|
1010 | 1051 |
xlabels_peakGene_r.class = levels(peakGeneCorrelations.all$peak_gene.r.class) |
1011 | 1052 |
nCur = length(xlabels_peakGene_r.class) |
1012 | 1053 |
xlabels_peakGene_r.class[setdiff(seq_len(nCur), c(1, floor(nCur/2), nCur))] <- "" |
... | ... |
@@ -1020,9 +1061,14 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1020 | 1061 |
nCur = length(xlabels_peakGene_praw.class) |
1021 | 1062 |
xlabels_peakGene_praw.class[setdiff(seq_len(nCur), c(1, floor(nCur/2), nCur))] <- "" |
1022 | 1063 |
|
1023 |
- |
|
1064 |
+ # |
|
1065 |
+ # ITERATE THROUGH ALL GENE TYPES, WITH ONE PER PLOT |
|
1066 |
+ # |
|
1024 | 1067 |
for (geneTypesSelected in gene.types) { |
1025 | 1068 |
|
1069 |
+ # Reset page counter for each PDF anew |
|
1070 |
+ pageCounter = 1 |
|
1071 |
+ |
|
1026 | 1072 |
futile.logger::flog.info(paste0(" Gene type ", paste0(geneTypesSelected, collapse = "+"))) |
1027 | 1073 |
|
1028 | 1074 |
if ("all" %in% geneTypesSelected) { |
... | ... |
@@ -1064,116 +1110,120 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1064 | 1110 |
|
1065 | 1111 |
counter = counter + 1 |
1066 | 1112 |
|
1067 |
- if (length(allTF) > 1) { |
|
1068 |
- futile.logger::flog.info(paste0(" QC plots for TF ", TFCur, " (", counter, " of ", length(allTF), ")")) |
|
1069 |
- } |
|
1070 |
- |
|
1071 |
- |
|
1072 |
- if ("all" %in% geneTypesSelected) { |
|
1073 |
- indexCur = seq_len(nrow(peakGeneCorrelations.all)) |
|
1074 |
- } else { |
|
1075 |
- indexCur = which(peakGeneCorrelations.all$gene.type %in% geneTypesSelected) |
|
1076 |
- } |
|
1077 |
- |
|
1078 |
- |
|
1079 |
- if (TFCur != "all") { |
|
1080 |
- indexCur = intersect(indexCur, which(peakGeneCorrelations.all$peak.ID %in% TF.peaks[[TFCur]])) |
|
1113 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1114 |
+ |
|
1115 |
+ if (length(allTF) > 1) { |
|
1116 |
+ futile.logger::flog.info(paste0(" QC plots for TF ", TFCur, " (", counter, " of ", length(allTF), ")")) |
|
1117 |
+ } |
|
1118 |
+ |
|
1119 |
+ |
|
1120 |
+ if ("all" %in% geneTypesSelected) { |
|
1121 |
+ indexCur = seq_len(nrow(peakGeneCorrelations.all)) |
|
1122 |
+ } else { |
|
1123 |
+ indexCur = which(peakGeneCorrelations.all$gene.type %in% geneTypesSelected) |
|
1124 |
+ } |
|
1125 |
+ |
|
1126 |
+ |
|
1127 |
+ if (TFCur != "all") { |
|
1128 |
+ indexCur = intersect(indexCur, which(peakGeneCorrelations.all$peak.ID %in% TF.peaks[[TFCur]])) |
|
1129 |
+ } |
|
1130 |
+ |
|
1131 |
+ # Get subset also for just the real data |
|
1132 |
+ indexCurReal = intersect(indexCur, which(peakGeneCorrelations.all$class == names(dist_class)[1])) |
|
1133 |
+ |
|
1134 |
+ |
|
1135 |
+ xlabel = paste0("Correlation raw p-value") |
|
1136 |
+ |
|
1137 |
+ # DENSITY PLOTS P VALUE |
|
1138 |
+ |
|
1139 |
+ # TODO: Densities as ratio |
|
1140 |
+ # https://stackoverflow.com/questions/58629959/how-can-i-extract-data-from-a-kernel-density-function-in-r-for-many-samples-at-o#:~:text=To%20compute%20the%20density%20you,can%20use%20the%20package%20spatstat%20. |
|
1141 |
+ |
|
1142 |
+ # Produce the labels for the class-specific subtitles |
|
1143 |
+ customLabel_class = .customLabeler(table(peakGeneCorrelations.all[indexCur,]$class)) |
|
1144 |
+ |
|
1145 |
+ r_pos_freq = table(peakGeneCorrelations.all[indexCur,]$r_positive) |
|
1146 |
+ labeler_r_pos = labeller(r_positive = c("TRUE" = paste0("r positive (", r_pos_freq["TRUE"], ")"), |
|
1147 |
+ "FALSE" = paste0("r negative (", r_pos_freq["FALSE"], ")")) ) |
|
1148 |
+ theme_main = theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = rel(0.8)), |
|
1149 |
+ axis.text.y = element_text(size=rel(0.8)), |
|
1150 |
+ panel.grid.major = element_blank(), panel.grid.minor = element_blank()) |
|
1151 |
+ |
|
1152 |
+ |
|
1153 |
+ |
|
1154 |
+ ## p-val density curves stratified by real/permuted ## |
|
1155 |
+ |
|
1156 |
+ gA2 = ggplot(peakGeneCorrelations.all[indexCur,], aes(peak_gene.p_raw, color = r_positive)) + geom_density() + |
|
1157 |
+ facet_wrap(~ class, labeller = labeller(class=freq_class) ) + |
|
1158 |
+ xlab(xlabel) + ylab("Density") + theme_bw() + |
|
1159 |
+ scale_color_manual(labels = names(r_pos_class), values = r_pos_class) + |
|
1160 |
+ theme(legend.position = "none", axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.8))) |
|
1161 |
+ |
|
1162 |
+ # Helper function to retrieve all tables and data aggregation steps for subsequent visualization |
|
1163 |
+ tbl.l = .createTables_peakGeneQC(peakGeneCorrelations.all[indexCur,], networkType_details, colors_vec, range) |
|
1164 |
+ |
|
1165 |
+ |
|
1166 |
+ xlabel = paste0("Correlation raw\np-value (binned)") |
|
1167 |
+ |
|
1168 |
+ |
|
1169 |
+ xlabels = levels(tbl.l$d_merged$peak_gene.p.raw.class) |
|
1170 |
+ xlabels[setdiff(seq_len(length(xlabels)), c(1, floor(length(xlabels)/2), length(xlabels)))] <- "" |
|
1171 |
+ |
|
1172 |
+ gB3 = ggplot(tbl.l$d_merged, aes(peak_gene.p.raw.class, ratio, fill = classAll)) + |
|
1173 |
+ geom_bar(stat = "identity", position="dodge", na.rm = TRUE, width = 0.5) + |
|
1174 |
+ geom_hline(yintercept = 1, linetype = "dotted") + |
|
1175 |
+ xlab(xlabel) + ylab("Ratio") + |
|
1176 |
+ scale_fill_manual("Class", values = c(dist_class, r_pos_class), |
|
1177 |
+ labels = c("real", "permuted", "r+ (r>0)", "r- (r<=0)"), |
|
1178 |
+ ) + # labels vector can be kind of manually specified here because the levels were previosly defined in a certain order |
|
1179 |
+ scale_x_discrete(labels = xlabels_peakGene_praw.class) + |
|
1180 |
+ theme_bw() + |
|
1181 |
+ #theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank()) + |
|
1182 |
+ # theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8) , axis.title.y = element_blank()) + |
|
1183 |
+ theme_main + |
|
1184 |
+ facet_wrap(~ factor(set), nrow = 2, scales = "free_y", strip.position = "left") |
|
1185 |
+ |
|
1186 |
+ # plot two FDR plots as well: (fraction of negative / negative+positive) and (fraction of permuted / permuted + real) |
|
1187 |
+ # that could give an indication of whether an FDR based on the permuted or based on the negative correlations would be more stringent |
|
1188 |
+ |
|
1189 |
+ # R PEAK GENE # |
|
1190 |
+ |
|
1191 |
+ xlabel = paste0("Correlation coefficient r") |
|
1192 |
+ |
|
1193 |
+ sum_real = table(peakGeneCorrelations.all[indexCur,]$class)[names(dist_class)[1]] |
|
1194 |
+ sum_rnd = table(peakGeneCorrelations.all[indexCur,]$class)[names(dist_class)[2]] |
|
1195 |
+ binData.r = peakGeneCorrelations.all[indexCur,] %>% |
|
1196 |
+ dplyr::group_by(class) %>% |
|
1197 |
+ dplyr::count(peak_gene.r.class) %>% |
|
1198 |
+ dplyr::mutate(nnorm = dplyr::case_when(class == !! (names(dist_class)[1]) ~ .data$n / (sum_real / sum_rnd), |
|
1199 |
+ TRUE ~ as.double(.data$n))) |
|
1200 |
+ |
|
1201 |
+ xlabel = paste0("Correlation coefficient r (binned)") |
|
1202 |
+ |
|
1203 |
+ gD = ggplot(binData.r, aes(peak_gene.r.class, nnorm, group = class, fill = class)) + |
|
1204 |
+ geom_bar(stat = "identity", position = position_dodge(preserve = "single"), na.rm = FALSE, width = 0.5) + |
|
1205 |
+ geom_line(aes(peak_gene.r.class, nnorm, group = class, color= class), stat = "identity") + |
|
1206 |
+ scale_fill_manual("Group", labels = names(dist_class), values = dist_class) + |
|
1207 |
+ scale_color_manual("Group", labels = names(dist_class), values = dist_class) + |
|
1208 |
+ scale_x_discrete(labels = xlabels_peakGene_r.class2, drop = FALSE) + |
|
1209 |
+ theme_bw() + theme(legend.position = "none") + |
|
1210 |
+ xlab(xlabel) + ylab("Abundance") + |
|
1211 |
+ theme_main + |
|
1212 |
+ scale_y_continuous(labels = scales::scientific) |
|
1213 |
+ |
|
1214 |
+ |
|
1215 |
+ mainTitle = paste0("Summary QC (TF: ", TFCur, ", gene type: ", paste0(geneTypesSelected, collapse = "+"), ",\n", .prettyNum(range), " bp promoter range)") |
|
1216 |
+ |
|
1217 |
+ plots_all = ( ((gA2 | gB3 ) + |
|
1218 |
+ patchwork::plot_layout(widths = c(2.5,1.5))) / ((gD) + |
|
1219 |
+ patchwork::plot_layout(widths = c(4))) ) + |
|
1220 |
+ patchwork::plot_layout(heights = c(2,1), guides = 'collect') + |
|
1221 |
+ patchwork::plot_annotation(title = mainTitle, theme = theme(plot.title = element_text(hjust = 0.5))) |
|
1222 |
+ |
|
1223 |
+ plot(plots_all) |
|
1081 | 1224 |
} |
1082 |
- |
|
1083 |
- # Get subset also for just the real data |
|
1084 |
- indexCurReal = intersect(indexCur, which(peakGeneCorrelations.all$class == names(dist_class)[1])) |
|
1085 |
- |
|
1086 |
- |
|
1087 |
- xlabel = paste0("Correlation raw p-value") |
|
1088 |
- |
|
1089 |
- # DENSITY PLOTS P VALUE |
|
1090 |
- |
|
1091 |
- # TODO: Densities as ratio |
|
1092 |
- # https://stackoverflow.com/questions/58629959/how-can-i-extract-data-from-a-kernel-density-function-in-r-for-many-samples-at-o#:~:text=To%20compute%20the%20density%20you,can%20use%20the%20package%20spatstat%20. |
|
1093 |
- |
|
1094 |
- # Produce the labels for the class-specific subtitles |
|
1095 |
- customLabel_class = .customLabeler(table(peakGeneCorrelations.all[indexCur,]$class)) |
|
1096 |
- |
|
1097 |
- r_pos_freq = table(peakGeneCorrelations.all[indexCur,]$r_positive) |
|
1098 |
- labeler_r_pos = labeller(r_positive = c("TRUE" = paste0("r positive (", r_pos_freq["TRUE"], ")"), |
|
1099 |
- "FALSE" = paste0("r negative (", r_pos_freq["FALSE"], ")")) ) |
|
1100 |
- theme_main = theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = rel(0.8)), |
|
1101 |
- axis.text.y = element_text(size=rel(0.8)), |
|
1102 |
- panel.grid.major = element_blank(), panel.grid.minor = element_blank()) |
|
1103 |
- |
|
1104 |
- |
|
1105 |
- |
|
1106 |
- ## p-val density curves stratified by real/permuted ## |
|
1107 |
- |
|
1108 |
- gA2 = ggplot(peakGeneCorrelations.all[indexCur,], aes(peak_gene.p_raw, color = r_positive)) + geom_density() + |
|
1109 |
- facet_wrap(~ class, labeller = labeller(class=freq_class) ) + |
|
1110 |
- xlab(xlabel) + ylab("Density") + theme_bw() + |
|
1111 |
- scale_color_manual(labels = names(r_pos_class), values = r_pos_class) + |
|
1112 |
- theme(legend.position = "none", axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.8))) |
|
1113 |
- |
|
1114 |
- # Helper function to retrieve all tables and data aggregation steps for subsequent visualization |
|
1115 |
- tbl.l = .createTables_peakGeneQC(peakGeneCorrelations.all[indexCur,], networkType_details, colors_vec, range) |
|
1116 |
- |
|
1117 |
- |
|
1118 |
- xlabel = paste0("Correlation raw\np-value (binned)") |
|
1119 |
- |
|
1120 |
- |
|
1121 |
- xlabels = levels(tbl.l$d_merged$peak_gene.p.raw.class) |
|
1122 |
- xlabels[setdiff(seq_len(length(xlabels)), c(1, floor(length(xlabels)/2), length(xlabels)))] <- "" |
|
1123 |
- |
|
1124 |
- gB3 = ggplot(tbl.l$d_merged, aes(peak_gene.p.raw.class, ratio, fill = classAll)) + |
|
1125 |
- geom_bar(stat = "identity", position="dodge", na.rm = TRUE, width = 0.5) + |
|
1126 |
- geom_hline(yintercept = 1, linetype = "dotted") + |
|
1127 |
- xlab(xlabel) + ylab("Ratio") + |
|
1128 |
- scale_fill_manual("Class", values = c(dist_class, r_pos_class), |
|
1129 |
- labels = c("real", "permuted", "r+ (r>0)", "r- (r<=0)"), |
|
1130 |
- ) + # labels vector can be kind of manually specified here because the levels were previosly defined in a certain order |
|
1131 |
- scale_x_discrete(labels = xlabels_peakGene_praw.class) + |
|
1132 |
- theme_bw() + |
|
1133 |
- #theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank()) + |
|
1134 |
- # theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8) , axis.title.y = element_blank()) + |
|
1135 |
- theme_main + |
|
1136 |
- facet_wrap(~ factor(set), nrow = 2, scales = "free_y", strip.position = "left") |
|
1137 |
- |
|
1138 |
- # plot two FDR plots as well: (fraction of negative / negative+positive) and (fraction of permuted / permuted + real) |
|
1139 |
- # that could give an indication of whether an FDR based on the permuted or based on the negative correlations would be more stringent |
|
1140 |
- |
|
1141 |
- # R PEAK GENE # |
|
1142 |
- |
|
1143 |
- xlabel = paste0("Correlation coefficient r") |
|
1144 |
- |
|
1145 |
- sum_real = table(peakGeneCorrelations.all[indexCur,]$class)[names(dist_class)[1]] |
|
1146 |
- sum_rnd = table(peakGeneCorrelations.all[indexCur,]$class)[names(dist_class)[2]] |
|
1147 |
- binData.r = peakGeneCorrelations.all[indexCur,] %>% |
|
1148 |
- dplyr::group_by(class) %>% |
|
1149 |
- dplyr::count(peak_gene.r.class) %>% |
|
1150 |
- dplyr::mutate(nnorm = dplyr::case_when(class == !! (names(dist_class)[1]) ~ .data$n / (sum_real / sum_rnd), |
|
1151 |
- TRUE ~ as.double(.data$n))) |
|
1152 |
- |
|
1153 |
- xlabel = paste0("Correlation coefficient r (binned)") |
|
1154 |
- |
|
1155 |
- gD = ggplot(binData.r, aes(peak_gene.r.class, nnorm, group = class, fill = class)) + |
|
1156 |
- geom_bar(stat = "identity", position = position_dodge(preserve = "single"), na.rm = FALSE, width = 0.5) + |
|
1157 |
- geom_line(aes(peak_gene.r.class, nnorm, group = class, color= class), stat = "identity") + |
|
1158 |
- scale_fill_manual("Group", labels = names(dist_class), values = dist_class) + |
|
1159 |
- scale_color_manual("Group", labels = names(dist_class), values = dist_class) + |
|
1160 |
- scale_x_discrete(labels = xlabels_peakGene_r.class2, drop = FALSE) + |
|
1161 |
- theme_bw() + theme(legend.position = "none") + |
|
1162 |
- xlab(xlabel) + ylab("Abundance") + |
|
1163 |
- theme_main + |
|
1164 |
- scale_y_continuous(labels = scales::scientific) |
|
1165 |
- |
|
1166 |
- |
|
1167 |
- mainTitle = paste0("Summary QC (TF: ", TFCur, ", gene type: ", paste0(geneTypesSelected, collapse = "+"), ",\n", .prettyNum(range), " bp promoter range)") |
|
1168 |
- |
|
1169 |
- plots_all = ( ((gA2 | gB3 ) + |
|
1170 |
- patchwork::plot_layout(widths = c(2.5,1.5))) / ((gD) + |
|
1171 |
- patchwork::plot_layout(widths = c(4))) ) + |
|
1172 |
- patchwork::plot_layout(heights = c(2,1), guides = 'collect') + |
|
1173 |
- patchwork::plot_annotation(title = mainTitle, theme = theme(plot.title = element_text(hjust = 0.5))) |
|
1174 |
- |
|
1175 |
- plot(plots_all) |
|
1176 |
- |
|
1225 |
+ pageCounter = pageCounter + 1 |
|
1226 |
+ |
|
1177 | 1227 |
|
1178 | 1228 |
} # end for each TF |
1179 | 1229 |
|
... | ... |
@@ -1183,9 +1233,18 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1183 | 1233 |
#plot(ChIPseeker::plotAnnoBar(GRN@annotation$consensusPeaks_obj)) |
1184 | 1234 |
|
1185 | 1235 |
# no plot, as this is somehow just a list and no ggplot object |
1186 |
- ChIPseeker::plotAnnoPie(GRN@annotation$consensusPeaks_obj, |
|
1187 |
- main = paste0("\nPeak annotation BEFORE filtering (n = ", nrow(GRN@annotation$consensusPeaks), ")")) |
|
1188 |
- plot(ChIPseeker::plotDistToTSS(GRN@annotation$consensusPeaks_obj)) |
|
1236 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1237 |
+ |
|
1238 |
+ ChIPseeker::plotAnnoPie(GRN@annotation$consensusPeaks_obj, |
|
1239 |
+ main = paste0("\nPeak annotation BEFORE filtering (n = ", nrow(GRN@annotation$consensusPeaks), ")")) |
|
1240 |
+ } |
|
1241 |
+ pageCounter = pageCounter + 1 |
|
1242 |
+ |
|
1243 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1244 |
+ plot(ChIPseeker::plotDistToTSS(GRN@annotation$consensusPeaks_obj)) |
|
1245 |
+ } |
|
1246 |
+ |
|
1247 |
+ pageCounter = pageCounter + 1 |
|
1189 | 1248 |
} |
1190 | 1249 |
|
1191 | 1250 |
# PEAK AND GENE PROPERTIES # |
... | ... |
@@ -1288,101 +1347,99 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1288 | 1347 |
r_positive_label = c("TRUE" = paste0("r+(r>0, n=", .prettyNum(r_pos_tbl[["TRUE"]]), ")"), |
1289 | 1348 |
"FALSE" = paste0("r-(r<=0, n=" ,.prettyNum(r_pos_tbl[["FALSE"]]), ")")) |
1290 | 1349 |
|
1291 |
- # Class in the facet_wrap has been removed, as this is only for real data here |
|
1292 |
- gA3 = ggplot(dataCur, aes(peak_gene.p_raw, color = .data[[newColName]])) + geom_density(size = 0.2) + |
|
1293 |
- facet_wrap(~r_positive, labeller = labeller(r_positive = r_positive_label), nrow = 2) + |
|
1294 |
- geom_density(aes(color = classNew), color = "black", linetype = "dotted", alpha = 1) + |
|
1295 |
- xlab(xlabel) + ylab("Density (for real data only)") + theme_bw() + |
|
1296 |
- scale_color_manual(newColName, values = mycolors, labels = var.label, drop = FALSE ) + |
|
1297 |
- theme(axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.8)), |
|
1298 |
- legend.text=element_text(size=rel(0.6)), legend.position = "none") |
|
1299 |
- |
|
1300 |
- # The following causes an error with patchwork |
|
1301 |
- # gA3 = ggplot(dataCur, aes(peak_gene.p_raw, color = .data[[newColName]])) + geom_density(size = 0.2) + |
|
1302 |
- # facet_wrap(~ class + r_positive, labeller = labeller(class=freq_class, r_positive = r_positive_label), nrow = 2) + |
|
1303 |
- # geom_density(aes(color = classNew), color = "black", linetype = "dotted", alpha = 1) + |
|
1304 |
- # xlab(xlabel) + ylab("Density") + theme_bw() + |
|
1305 |
- # scale_color_manual(newColName, values = mycolors, labels = var.label, drop = FALSE ) + |
|
1306 |
- # theme(axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.8)), |
|
1307 |
- # legend.text=element_text(size=rel(0.6)), legend.position = "none") |
|
1308 |
- |
|
1309 |
- |
|
1310 |
- # Ratios for r+ / r- |
|
1311 |
- freq = dataCur %>% |
|
1312 |
- dplyr::group_by(class, .data[[newColName]], peak_gene.p.raw.class, r_positive) %>% |
|
1313 |
- dplyr::summarise(n = dplyr::n()) %>% |
|
1314 |
- dplyr::ungroup() %>% |
|
1315 |
- tidyr::complete(class, .data[[newColName]], peak_gene.p.raw.class, r_positive, fill = list(n = 0)) %>% # SOme cases might be missing |
|
1316 |
- dplyr::group_by(class, .data[[newColName]], peak_gene.p.raw.class) %>% # dont group by r_positive because we want to calculate the ratio within each group |
|
1317 |
- dplyr::mutate( |
|
1318 |
- n = .data$n + 1, # to allow ratios to be computed even for 0 counts |
|
1319 |
- ratio_pos_raw = .data$n[r_positive] / .data$n[!r_positive]) %>% |
|
1320 |
- dplyr::filter(r_positive, class == names(dist_class)[1])# Keep only one r_positive row per grouping as we operate via the ratio and this data is duplicated otherwise. Remove random data also because these have been filtered out before are only back due to the complete invokation. |
|
1321 |
- |
|
1322 |
- # Cap ratios > 10 at 10 to avoid visual artefacts |
|
1323 |
- freq$ratio_pos_raw[which(freq$ratio_pos_raw > 10)] = 10 |
|
1324 |
- |
|
1325 |
- # Without proper colors for now, this will be added after the next plot |
|
1326 |
- gB3 = ggplot(freq, aes(peak_gene.p.raw.class, ratio_pos_raw, fill = .data[[newColName]])) + |
|
1327 |
- geom_bar(stat = "identity", position="dodge", na.rm = TRUE, width = 0.8) + |
|
1328 |
- geom_hline(yintercept = 1, linetype = "dotted") + |
|
1329 |
- xlab(xlabel) + ylab("Ratio r+ / r- (capped at 10)") + |
|
1330 |
- scale_x_discrete(labels = xlabels_peakGene_praw.class) + |
|
1331 |
- scale_fill_manual (varCur, values = mycolors, labels = var.label, drop = FALSE ) + |
|
1332 |
- theme_bw() + |
|
1333 |
- #theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank()) + |
|
1334 |
- # theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8) , axis.title.y = element_blank()) + |
|
1335 |
- theme_main + |
|
1336 |
- facet_wrap(~ factor(class), nrow = 2, scales = "free_y", strip.position = "left", labeller = labeller(class=freq_class)) |
|
1337 |
- |
|
1338 |
- |
|
1339 |
- plots_all = ( ((gA3 | gB3 ) + |
|
1340 |
- patchwork::plot_layout(widths = c(2.5,1.5))) ) + |
|
1341 |
- patchwork::plot_layout(guides = 'collect') + |
|
1342 |
- patchwork::plot_annotation(title = mainTitle, theme = theme(plot.title = element_text(hjust = 0.5))) |
|
1343 |
- |
|
1344 |
- plot(plots_all) |
|
1345 |
- |
|
1346 |
- |
|
1347 |
- # VERSION JUDITH: simplified peak.gene distance + another variable as histogram, no pemuted data |
|
1348 |
- |
|
1349 |
- datasetName = "" |
|
1350 |
- if (!is.null(GRN@config$metadata$name)) { |
|
1351 |
- datasetName = GRN@config$metadata$name |
|
1350 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1351 |
+ |
|
1352 |
+ |
|
1353 |
+ # Class in the facet_wrap has been removed, as this is only for real data here |
|
1354 |
+ gA3 = ggplot(dataCur, aes(peak_gene.p_raw, color = .data[[newColName]])) + geom_density(size = 0.2) + |
|
1355 |
+ facet_wrap(~r_positive, labeller = labeller(r_positive = r_positive_label), nrow = 2) + |
|
1356 |
+ geom_density(aes(color = classNew), color = "black", linetype = "dotted", alpha = 1) + |
|
1357 |
+ xlab(xlabel) + ylab("Density (for real data only)") + theme_bw() + |
|
1358 |
+ scale_color_manual(newColName, values = mycolors, labels = var.label, drop = FALSE ) + |
|
1359 |
+ theme(axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.8)), |
|
1360 |
+ legend.text=element_text(size=rel(0.6)), legend.position = "none") |
|
1361 |
+ |
|
1362 |
+ # Ratios for r+ / r- |
|
1363 |
+ freq = dataCur %>% |
|
1364 |
+ dplyr::group_by(class, .data[[newColName]], peak_gene.p.raw.class, r_positive) %>% |
|
1365 |
+ dplyr::summarise(n = dplyr::n()) %>% |
|
1366 |
+ dplyr::ungroup() %>% |
|
1367 |
+ tidyr::complete(class, .data[[newColName]], peak_gene.p.raw.class, r_positive, fill = list(n = 0)) %>% # SOme cases might be missing |
|
1368 |
+ dplyr::group_by(class, .data[[newColName]], peak_gene.p.raw.class) %>% # dont group by r_positive because we want to calculate the ratio within each group |
|
1369 |
+ dplyr::mutate( |
|
1370 |
+ n = .data$n + 1, # to allow ratios to be computed even for 0 counts |
|
1371 |
+ ratio_pos_raw = .data$n[r_positive] / .data$n[!r_positive]) %>% |
|
1372 |
+ dplyr::filter(r_positive, class == names(dist_class)[1])# Keep only one r_positive row per grouping as we operate via the ratio and this data is duplicated otherwise. Remove random data also because these have been filtered out before are only back due to the complete invokation. |
|
1373 |
+ |
|
1374 |
+ # Cap ratios > 10 at 10 to avoid visual artefacts |
|
1375 |
+ freq$ratio_pos_raw[which(freq$ratio_pos_raw > 10)] = 10 |
|
1376 |
+ |
|
1377 |
+ # Without proper colors for now, this will be added after the next plot |
|
1378 |
+ gB3 = ggplot(freq, aes(peak_gene.p.raw.class, ratio_pos_raw, fill = .data[[newColName]])) + |
|
1379 |
+ geom_bar(stat = "identity", position="dodge", na.rm = TRUE, width = 0.8) + |
|
1380 |
+ geom_hline(yintercept = 1, linetype = "dotted") + |
|
1381 |
+ xlab(xlabel) + ylab("Ratio r+ / r- (capped at 10)") + |
|
1382 |
+ scale_x_discrete(labels = xlabels_peakGene_praw.class) + |
|
1383 |
+ scale_fill_manual (varCur, values = mycolors, labels = var.label, drop = FALSE ) + |
|
1384 |
+ theme_bw() + |
|
1385 |
+ #theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), strip.background = element_blank(), strip.placement = "outside", axis.title.y = element_blank()) + |
|
1386 |
+ # theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8) , axis.title.y = element_blank()) + |
|
1387 |
+ theme_main + |
|
1388 |
+ facet_wrap(~ factor(class), nrow = 2, scales = "free_y", strip.position = "left", labeller = labeller(class=freq_class)) |
|
1389 |
+ |
|
1390 |
+ |
|
1391 |
+ plots_all = ( ((gA3 | gB3 ) + |
|
1392 |
+ patchwork::plot_layout(widths = c(2.5,1.5))) ) + |
|
1393 |
+ patchwork::plot_layout(guides = 'collect') + |
|
1394 |
+ patchwork::plot_annotation(title = mainTitle, theme = theme(plot.title = element_text(hjust = 0.5))) |
|
1395 |
+ |
|
1396 |
+ plot(plots_all) |
|
1352 | 1397 |
} |
1353 |
- |
|
1354 |
- mainTitle2 = paste0(datasetName, "\nSummary QC (TF: ", TFCur, ", gene type: ", paste0(geneTypesSelected, collapse = "+"), ", ", |
|
1355 |
- .prettyNum(range), " bp promoter range, stratified by distance + ", varCur, ")") |
|
1356 |
- |
|
1357 |
- # r+ and r- |
|
1358 |
- binwidth = 0.1 |
|
1359 |
- mycolors <- viridis::viridis(2) |
|
1360 |
- xlabel = paste0("Correlation raw p-value (", binwidth, " bins)") |
|
1361 |
- |
|
1362 |
- dataCur = dataCur %>% |
|
1363 |
- dplyr::mutate(peak_gene.distance.class250k = factor(dplyr::if_else(peak_gene.distance <= 250000, "<=250k", ">250k"))) %>% |
|
1364 |
- dplyr::select(-peak_gene.distance) |
|
1365 |
- |
|
1366 |
- # closed = "left", boundary = 0, ensure correct numbers. See https://github.com/tidyverse/ggplot2/issues/1739 |
|
1367 |
- # Without boundary = 0, counts are actually wrong |
|
1398 |
+ pageCounter = pageCounter + 1 |
|
1368 | 1399 |
|
1369 | 1400 |
|
1370 |
- nrows_plot = 2 |
|
1371 |
- if (length(unique(dataCur[[newColName]])) > 9) { |
|
1372 |
- nrows_plot = 3 |
|
1401 |
+ # VERSION JUDITH: simplified peak.gene distance + another variable as histogram, no permuted data |
|
1402 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1403 |
+ |
|
1404 |
+ datasetName = "" |
|
1405 |
+ if (!is.null(GRN@config$metadata$name)) { |
|
1406 |
+ datasetName = GRN@config$metadata$name |
|
1407 |
+ } |
|
1408 |
+ |
|
1409 |
+ mainTitle2 = paste0(datasetName, "\nSummary QC (TF: ", TFCur, ", gene type: ", paste0(geneTypesSelected, collapse = "+"), ", ", |
|
1410 |
+ .prettyNum(range), " bp promoter range, stratified by distance + ", varCur, ")") |
|
1411 |
+ |
|
1412 |
+ # r+ and r- |
|
1413 |
+ binwidth = 0.1 |
|
1414 |
+ mycolors <- viridis::viridis(2) |
|
1415 |
+ xlabel = paste0("Correlation raw p-value (", binwidth, " bins)") |
|
1416 |
+ |
|
1417 |
+ dataCur = dataCur %>% |
|
1418 |
+ dplyr::mutate(peak_gene.distance.class250k = factor(dplyr::if_else(peak_gene.distance <= 250000, "<=250k", ">250k"))) %>% |
|
1419 |
+ dplyr::select(-peak_gene.distance) |
|
1420 |
+ |
|
1421 |
+ # closed = "left", boundary = 0, ensure correct numbers. See https://github.com/tidyverse/ggplot2/issues/1739 |
|
1422 |
+ # Without boundary = 0, counts are actually wrong |
|
1423 |
+ |
|
1424 |
+ |
|
1425 |
+ nrows_plot = 2 |
|
1426 |
+ if (length(unique(dataCur[[newColName]])) > 9) { |
|
1427 |
+ nrows_plot = 3 |
|
1428 |
+ } |
|
1429 |
+ |
|
1430 |
+ gA5 = ggplot(dataCur, aes(peak_gene.p_raw, fill = r_positive)) + |
|
1431 |
+ geom_histogram(binwidth = binwidth, position="dodge", closed = "left", boundary = 0) + |
|
1432 |
+ facet_wrap(~ peak_gene.distance.class250k + .data[[newColName]], nrow = nrows_plot, scales = "free_y") + |
|
1433 |
+ xlab(xlabel) + ylab(paste0("Abundance for classes with n>=", nGroupsMin)) + theme_bw() + |
|
1434 |
+ ggtitle(mainTitle2) + |
|
1435 |
+ scale_fill_manual("Class for r", values = mycolors, labels = r_positive_label, drop = FALSE ) + |
|
1436 |
+ theme(axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.6)), |
|
1437 |
+ legend.text=element_text(size=rel(0.7))) |
|
1438 |
+ |
|
1439 |
+ |
|
1440 |
+ plot(gA5) |
|
1373 | 1441 |
} |
1374 |
- |
|
1375 |
- gA5 = ggplot(dataCur, aes(peak_gene.p_raw, fill = r_positive)) + |
|
1376 |
- geom_histogram(binwidth = binwidth, position="dodge", closed = "left", boundary = 0) + |
|
1377 |
- facet_wrap(~ peak_gene.distance.class250k + .data[[newColName]], nrow = nrows_plot, scales = "free_y") + |
|
1378 |
- xlab(xlabel) + ylab(paste0("Abundance for classes with n>=", nGroupsMin)) + theme_bw() + |
|
1379 |
- ggtitle(mainTitle2) + |
|
1380 |
- scale_fill_manual("Class for r", values = mycolors, labels = r_positive_label, drop = FALSE ) + |
|
1381 |
- theme(axis.text=element_text(size=rel(0.6)), strip.text.x = element_text(size = rel(0.6)), |
|
1382 |
- legend.text=element_text(size=rel(0.7))) |
|
1383 |
- |
|
1384 |
- |
|
1385 |
- plot(gA5) |
|
1442 |
+ pageCounter = pageCounter + 1 |
|
1386 | 1443 |
|
1387 | 1444 |
|
1388 | 1445 |
|
... | ... |
@@ -1401,21 +1458,29 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1401 | 1458 |
indexFilt = intersect(indexFilt, indexCur) |
1402 | 1459 |
|
1403 | 1460 |
if (length(indexFilt) > 0) { |
1404 |
- g = ggplot(peakGeneCorrelations.all[indexFilt,], aes(peak_gene.p_raw, color = peak_gene.distance_class_abs)) + geom_density() + |
|
1405 |
- ggtitle(paste0("Density of the raw p-value distributions")) + |
|
1406 |
- facet_wrap(~ r_positive, ncol = 2, labeller = labeler_r_pos) + |
|
1407 |
- scale_color_viridis_d(labels = .classFreq_label(table(peakGeneCorrelations.all[indexFilt,]$peak_gene.distance_class_abs))) + |
|
1408 |
- theme_bw() |
|
1409 |
- plot(g) |
|
1410 |
- |
|
1461 |
+ |
|
1462 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1463 |
+ g = ggplot(peakGeneCorrelations.all[indexFilt,], aes(peak_gene.p_raw, color = peak_gene.distance_class_abs)) + geom_density() + |
|
1464 |
+ ggtitle(paste0("Density of the raw p-value distributions")) + |
|
1465 |
+ facet_wrap(~ r_positive, ncol = 2, labeller = labeler_r_pos) + |
|
1466 |
+ scale_color_viridis_d(labels = .classFreq_label(table(peakGeneCorrelations.all[indexFilt,]$peak_gene.distance_class_abs))) + |
|
1467 |
+ theme_bw() |
|
1468 |
+ plot(g) |
|
1469 |
+ } |
|
1470 |
+ pageCounter = pageCounter + 1 |
|
1411 | 1471 |
|
1412 | 1472 |
if (includeRobustCols) { |
1413 |
- g = ggplot(peakGeneCorrelations.all[indexFilt,], aes(peak_gene.p_raw.robust, color = peak_gene.distance_class_abs)) + geom_density() + |
|
1414 |
- ggtitle(paste0("Density of the raw p-value distributions\n(stratified by whether r is positive)")) + |
|
1415 |
- facet_wrap(~ r_positive, ncol = 2, labeller = labeler_r_pos) + |
|
1416 |
- scale_color_viridis_d(labels = .classFreq_label(table(peakGeneCorrelations.all[indexFilt,]$peak_gene.distance_class_abs))) + |
|
1417 |
- theme_bw() |
|
1418 |
- plot(g) |
|
1473 |
+ |
|
1474 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1475 |
+ |
|
1476 |
+ g = ggplot(peakGeneCorrelations.all[indexFilt,], aes(peak_gene.p_raw.robust, color = peak_gene.distance_class_abs)) + geom_density() + |
|
1477 |
+ ggtitle(paste0("Density of the raw p-value distributions\n(stratified by whether r is positive)")) + |
|
1478 |
+ facet_wrap(~ r_positive, ncol = 2, labeller = labeler_r_pos) + |
|
1479 |
+ scale_color_viridis_d(labels = .classFreq_label(table(peakGeneCorrelations.all[indexFilt,]$peak_gene.distance_class_abs))) + |
|
1480 |
+ theme_bw() |
|
1481 |
+ plot(g) |
|
1482 |
+ } |
|
1483 |
+ pageCounter = pageCounter + 1 |
|
1419 | 1484 |
|
1420 | 1485 |
|
1421 | 1486 |
} |
... | ... |
@@ -1426,13 +1491,17 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1426 | 1491 |
|
1427 | 1492 |
|
1428 | 1493 |
if (length(indexFilt) > 0) { |
1429 |
- g = ggplot(peakGeneCorrelations.all[indexFilt,], aes(peak_gene.p_raw, color = r_positive)) + |
|
1430 |
- geom_density() + |
|
1431 |
- ggtitle(paste0("Density of the raw p-value distributions")) + |
|
1432 |
- facet_wrap(~ peak_gene.distance_class_abs, ncol = 2, labeller = .customLabeler(table(peakGeneCorrelations.all$peak_gene.distance_class_abs))) + |
|
1433 |
- scale_color_manual(labels = .classFreq_label(table(peakGeneCorrelations.all[indexFilt,]$r_positive)), values = r_pos_class) + |
|
1434 |
- theme_bw() |
|
1435 |
- plot(g) |
|
1494 |
+ |
|
1495 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1496 |
+ g = ggplot(peakGeneCorrelations.all[indexFilt,], aes(peak_gene.p_raw, color = r_positive)) + |
|
1497 |
+ geom_density() + |
|
1498 |
+ ggtitle(paste0("Density of the raw p-value distributions")) + |
|
1499 |
+ facet_wrap(~ peak_gene.distance_class_abs, ncol = 2, labeller = .customLabeler(table(peakGeneCorrelations.all$peak_gene.distance_class_abs))) + |
|
1500 |
+ scale_color_manual(labels = .classFreq_label(table(peakGeneCorrelations.all[indexFilt,]$r_positive)), values = r_pos_class) + |
|
1501 |
+ theme_bw() |
|
1502 |
+ plot(g) |
|
1503 |
+ } |
|
1504 |
+ pageCounter = pageCounter + 1 |
|
1436 | 1505 |
|
1437 | 1506 |
|
1438 | 1507 |
} |
... | ... |
@@ -1441,15 +1510,18 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1441 | 1510 |
# |
1442 | 1511 |
# Focus on peak_gene.r |
1443 | 1512 |
# |
1513 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1514 |
+ |
|
1515 |
+ g = ggplot(peakGeneCorrelations.all[indexCur,], aes(peak_gene.r, color = peak_gene.distance_class_abs)) + geom_density() + |
|
1516 |
+ geom_density(data = peakGeneCorrelations.all[indexCur,], aes(peak_gene.r), color = "black") + |
|
1517 |
+ ggtitle(paste0("Density of the correlation coefficients")) + |
|
1518 |
+ scale_color_viridis_d(labels = .classFreq_label(table(peakGeneCorrelations.all[indexCur,]$peak_gene.distance_class_abs))) + |
|
1519 |
+ theme_bw() |
|
1520 |
+ plot(g) |
|
1521 |
+ } |
|
1522 |
+ pageCounter = pageCounter + 1 |
|
1444 | 1523 |
|
1445 |
- g = ggplot(peakGeneCorrelations.all[indexCur,], aes(peak_gene.r, color = peak_gene.distance_class_abs)) + geom_density() + |
|
1446 |
- geom_density(data = peakGeneCorrelations.all[indexCur,], aes(peak_gene.r), color = "black") + |
|
1447 |
- ggtitle(paste0("Density of the correlation coefficients")) + |
|
1448 |
- scale_color_viridis_d(labels = .classFreq_label(table(peakGeneCorrelations.all[indexCur,]$peak_gene.distance_class_abs))) + |
|
1449 |
- theme_bw() |
|
1450 |
- plot(g) |
|
1451 |
- |
|
1452 |
- |
|
1524 |
+ .printWarningPageNumber(pages, pageCounter) |
|
1453 | 1525 |
if (!is.null(fileBase)) { |
1454 | 1526 |
grDevices::dev.off() |
1455 | 1527 |
} |
... | ... |
@@ -1468,108 +1540,6 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1468 | 1540 |
|
1469 | 1541 |
} |
1470 | 1542 |
|
1471 |
-.createTables_peakGeneQC <- function(peakGeneCorrelations.all.cur, networkType_details, colors_vec, range) { |
|
1472 |
- |
|
1473 |
- d = peakGeneCorrelations.all.cur %>% |
|
1474 |
- dplyr::group_by(r_positive, class, peak_gene.p.raw.class) %>% |
|
1475 |
- dplyr::summarise(n = dplyr::n()) %>% |
|
1476 |
- dplyr::ungroup() |
|
1477 |
- |
|
1478 |
- # Some classes might be missing, add them here with explicit zeros |
|
1479 |
- for (r_pos in c(TRUE, FALSE)) { |
|
1480 |
- for (classCur in networkType_details){ |
|
1481 |
- for (pclassCur in levels(peakGeneCorrelations.all.cur$peak_gene.p.raw.class)) { |
|
1482 |
- |
|
1483 |
- row = which(d$r_positive == r_pos & d$class == classCur & as.character(d$peak_gene.p.raw.class) == as.character(pclassCur)) |
|
1484 |
- if (length(row) == 0) { |
|
1485 |
- d = tibble::add_row(d, r_positive = r_pos, class = classCur, peak_gene.p.raw.class = pclassCur, n = 0) |
|
1486 |
- } |
|
1487 |
- } |
|
1488 |
- } |
|
1489 |
- } |
|
1490 |
- |
|
1491 |
- # Restore the "ordered" factor for class |
|
1492 |
- d$peak_gene.p.raw.class = factor(d$peak_gene.p.raw.class, ordered = TRUE, levels = levels(peakGeneCorrelations.all.cur$peak_gene.p.raw.class)) |
|
1493 |
- |
|
1494 |
- |
|
1495 |
- # Normalization factors |
|
1496 |
- dsum = d %>% |
|
1497 |
- dplyr::group_by(r_positive, class) %>% |
|
1498 |
- dplyr::summarise(sum_n = sum(.data$n)) |
|
1499 |
- |
|
1500 |
- |
|
1501 |
- # Summarize per bin |
|
1502 |
- d2 = d %>% |
|
1503 |
- dplyr::group_by(class, peak_gene.p.raw.class)%>% |
|
1504 |
- dplyr::summarise(sum_pos = .data$n[r_positive], |
|
1505 |
- sum_neg = .data$n[!r_positive]) %>% |
|
1506 |
- dplyr::mutate(ratio_pos_raw = sum_pos / sum_neg, |
|
1507 |
- enrichment_pos = sum_pos / (sum_pos + sum_neg), |
|
1508 |
- ratio_neg = 1 - enrichment_pos) %>% |
|
1509 |
- dplyr::ungroup() |
|
1510 |
- |
|
1511 |
- # Compare between real and permuted |
|
1512 |
- normFactor_real = dplyr::filter(dsum, class == !! (networkType_details[1])) %>% dplyr::pull(sum_n) %>% sum() / |
|
1513 |
- dplyr::filter(dsum, class == !! (networkType_details[2])) %>% dplyr::pull(sum_n) %>% sum() |
|
1514 |
- |
|
1515 |
- # ratio_norm not used currently, no normalization necessary here or not even useful because we dont want to normalize the r_pos and r_neg ratios: These are signal in a way. Only when comparing between real and permuted, we have to account for sample size for corrections |
|
1516 |
- d3 = d %>% |
|
1517 |
- dplyr::group_by(peak_gene.p.raw.class, r_positive) %>% |
|
1518 |
- dplyr::summarise(n_real = .data$n[class == !! (names(colors_vec)[1]) ], |
|
1519 |
- n_permuted = .data$n[class == !! (names(colors_vec)[2]) ]) %>% |
|
1520 |
- dplyr::ungroup() %>% |
|
1521 |
- dplyr::mutate(ratio_real_raw = n_real / n_permuted, |
|
1522 |
- ratio_real_norm = ratio_real_raw / normFactor_real, |
|
1523 |
- enrichment_real = n_real / (n_real + n_permuted), |
|
1524 |
- enrichment_real_norm = (n_real / normFactor_real) / ((n_real / normFactor_real) + n_permuted)) |
|
1525 |
- |
|
1526 |
- |
|
1527 |
- stopifnot(identical(levels(d2$peak_gene.p.raw.class), levels(d3$peak_gene.p.raw.class))) |
|
1528 |
- # 2 enrichment bar plots but combined using facet_wrap |
|
1529 |
- d2$set = "r+ / r-"; d3$set = "real / permuted" |
|
1530 |
- d_merged <- tibble::tibble(peak_gene.p.raw.class = c(as.character(d2$peak_gene.p.raw.class), |
|
1531 |
- as.character(d3$peak_gene.p.raw.class)), |
|
1532 |
- ratio = c(d2$ratio_pos_raw, d3$ratio_real_norm), |
|
1533 |
- classAll = c(as.character(d2$class), d3$r_positive), |
|
1534 |
- set = c(d2$set, d3$set)) %>% |
|
1535 |
- dplyr::mutate(classAll = factor(classAll, levels=c(paste0("real_",range), paste0("random_",range), "TRUE", "FALSE")), |
|
1536 |
- peak_gene.p.raw.class = factor(peak_gene.p.raw.class, levels = levels(d2$peak_gene.p.raw.class))) |
|
1537 |
- |
|
1538 |
- d4 = tibble::tibble(peak_gene.p.raw.class = unique(d$peak_gene.p.raw.class), |
|
1539 |
- n_rpos_real = NA_integer_, n_rpos_random = NA_integer_, |
|
1540 |
- n_rneg_real = NA_integer_, n_rneg_random = NA_integer_, |
|
1541 |
- ratio_permuted_real_rpos_norm = NA_real_, |
|
1542 |
- ratio_permuted_real_rneg_norm = NA_real_) |
|
1543 |
- |
|
1544 |
- for (i in seq_len(nrow(d4))) { |
|
1545 |
- row_d2 = which(d2$class == networkType_details[1] & d2$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
1546 |
- stopifnot(length(row_d2) == 1) |
|
1547 |
- d4[i, "n_rpos_real"] = d2[row_d2, "sum_pos"] %>% unlist() |
|
1548 |
- d4[i, "n_rneg_real"] = d2[row_d2, "sum_neg"] %>% unlist() |
|
1549 |
- row_d2 = which(d2$class == paste0("random_",range) & d2$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
1550 |
- d4[i, "n_rpos_random"] = d2[row_d2, "sum_pos"] %>% unlist() |
|
1551 |
- d4[i, "n_rneg_random"] = d2[row_d2, "sum_neg"] %>% unlist() |
|
1552 |
- |
|
1553 |
- row_d3 = which(d3$r_positive == TRUE & d3$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
1554 |
- d4[i, "ratio_permuted_real_rpos_norm"] = 1- d3[row_d3, "ratio_real_norm"] %>% unlist() |
|
1555 |
- row_d3 = which(d3$r_positive == FALSE & d3$peak_gene.p.raw.class == d4$peak_gene.p.raw.class[i]) |
|
1556 |
- d4[i, "ratio_permuted_real_rneg_norm"] = 1- d3[row_d3, "ratio_real_norm"] %>% unlist() |
|
1557 |
- } |
|
1558 |
- |
|
1559 |
- d4 = d4 %>% |
|
1560 |
- dplyr::mutate(ratio_rneg_rpos_real = n_rneg_real / (n_rneg_real + n_rpos_real), |
|
1561 |
- ratio_rneg_rpos_random = n_rneg_random / (n_rneg_random + n_rpos_random), |
|
1562 |
- peak_gene.p.raw.class.bin = as.numeric(peak_gene.p.raw.class)) %>% |
|
1563 |
- dplyr::arrange(peak_gene.p.raw.class.bin) |
|
1564 |
- |
|
1565 |
- d4_melt = reshape2::melt(d4, id = c("peak_gene.p.raw.class.bin", "peak_gene.p.raw.class")) %>% |
|
1566 |
- dplyr::filter(grepl("ratio", variable)) |
|
1567 |
- |
|
1568 |
- |
|
1569 |
- list(d = d, d2 = d2, d3 = d3, d4 = d4, d4_melt = d4_melt, d_merged = d_merged) |
|
1570 |
- |
|
1571 |
-} |
|
1572 |
- |
|
1573 | 1543 |
|
1574 | 1544 |
|
1575 | 1545 |
.customLabeler <- function(tbl_freq) { |
... | ... |
@@ -1578,9 +1548,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1578 | 1548 |
ggplot2::as_labeller(tbl_freq_label) |
1579 | 1549 |
} |
1580 | 1550 |
|
1581 |
-.classFreq_label <- function(tbl_freq) { |
|
1582 |
- paste0(names(tbl_freq), " (", tbl_freq, ")") |
|
1583 |
-} |
|
1551 |
+ |
|
1584 | 1552 |
|
1585 | 1553 |
|
1586 | 1554 |
###### Connection summaries ######## |
... | ... |
@@ -1594,6 +1562,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1594 | 1562 |
#' @template plotAsPDF |
1595 | 1563 |
#' @template pdf_width |
1596 | 1564 |
#' @template pdf_height |
1565 |
+#' @template pages |
|
1597 | 1566 |
#' @template forceRerun |
1598 | 1567 |
#' @return The same \code{\linkS4class{GRN}} object, without modifications. In addition, for the specified \code{type}, a PDF file (default filename is \code{GRN.connectionSummary_{type}.pdf}) is produced with a connection summary. |
1599 | 1568 |
#' @examples |
... | ... |
@@ -1604,7 +1573,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1604 | 1573 |
#' @importFrom circlize colorRamp2 |
1605 | 1574 |
plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1606 | 1575 |
outputFolder = NULL, basenameOutput = NULL, |
1607 |
- plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, |
|
1576 |
+ plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, pages = NULL, |
|
1608 | 1577 |
forceRerun = FALSE) { |
1609 | 1578 |
|
1610 | 1579 |
GRN = .addFunctionLogToObject(GRN) |
... | ... |
@@ -1614,6 +1583,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1614 | 1583 |
checkmate::assertFlag(plotAsPDF) |
1615 | 1584 |
checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
1616 | 1585 |
checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
1586 |
+ checkmate::assert(checkmate::checkNull(pages), checkmate::checkIntegerish(pages, lower = 1)) |
|
1617 | 1587 |
checkmate::assertFlag(forceRerun) |
1618 | 1588 |
checkmate::assert(checkmate::checkNull(outputFolder), checkmate::checkCharacter(outputFolder, min.chars = 1)) |
1619 | 1589 |
checkmate::assert(checkmate::checkNull(basenameOutput), checkmate::checkCharacter(basenameOutput, len = 1, min.chars = 1, any.missing = FALSE)) |
... | ... |
@@ -1627,7 +1597,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1627 | 1597 |
file = NULL |
1628 | 1598 |
} |
1629 | 1599 |
|
1630 |
- .plot_stats_connectionSummaryHeatmap(GRN, file = file, pdf_width = pdf_width, pdf_height = pdf_height, forceRerun = forceRerun) |
|
1600 |
+ .plot_stats_connectionSummaryHeatmap(GRN, file = file, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, forceRerun = forceRerun) |
|
1631 | 1601 |
|
1632 | 1602 |
} else if (type == "boxplot") { |
1633 | 1603 |
|
... | ... |
@@ -1637,7 +1607,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1637 | 1607 |
file = NULL |
1638 | 1608 |
} |
1639 | 1609 |
|
1640 |
- .plot_stats_connectionSummaryBoxplot(GRN, file = file, pdf_width = pdf_width, pdf_height = pdf_height, forceRerun = forceRerun) |
|
1610 |
+ .plot_stats_connectionSummaryBoxplot(GRN, file = file, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, forceRerun = forceRerun) |
|
1641 | 1611 |
|
1642 | 1612 |
} else if (type == "density") { |
1643 | 1613 |
|
... | ... |
@@ -1647,7 +1617,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1647 | 1617 |
} else { |
1648 | 1618 |
file = NULL |
1649 | 1619 |
} |
1650 |
- .plot_stats_connectionSummaryDensity(GRN, file = file, pdf_width = pdf_width, pdf_height = pdf_height, forceRerun = forceRerun) |
|
1620 |
+ .plot_stats_connectionSummaryDensity(GRN, file = file, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, forceRerun = forceRerun) |
|
1651 | 1621 |
} |
1652 | 1622 |
|
1653 | 1623 |
GRN |
... | ... |
@@ -1655,13 +1625,13 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1655 | 1625 |
} |
1656 | 1626 |
|
1657 | 1627 |
|
1658 |
-.plot_stats_connectionSummaryHeatmap <- function(GRN, file = NULL, pdf_width = 12, pdf_height = 12, forceRerun = FALSE) { |
|
1628 |
+.plot_stats_connectionSummaryHeatmap <- function(GRN, file = NULL, pdf_width = 12, pdf_height = 12, pages = NULL, forceRerun = FALSE) { |
|
1659 | 1629 |
|
1660 | 1630 |
start = Sys.time() |
1661 | 1631 |
|
1662 | 1632 |
futile.logger::flog.info(paste0("Plotting connection summary", dplyr::if_else(is.null(file), "", paste0(" to file ", file)))) |
1663 | 1633 |
|
1664 |
- if ((!is.null(file) && !file.exists(file))| forceRerun) { |
|
1634 |
+ if ((!is.null(file) && !file.exists(file)) | is.null(file) | forceRerun) { |
|
1665 | 1635 |
|
1666 | 1636 |
if (nrow(GRN@stats$connections) == 0) { |
1667 | 1637 |
message = paste0("Statistics summary missing from object, please run the function generateStatsSummary first") |
... | ... |
@@ -1676,7 +1646,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1676 | 1646 |
grDevices::pdf(file = file, width = pdf_width, height = pdf_height) |
1677 | 1647 |
} |
1678 | 1648 |
|
1679 |
- |
|
1649 |
+ pageCounter = 1 |
|
1680 | 1650 |
for (allowMissingTFsCur in unique(GRN@stats$connections$allowMissingTFs)) { |
1681 | 1651 |
|
1682 | 1652 |
for (allowMissingGenesCur in unique(GRN@stats$connections$allowMissingGenes)) { |
... | ... |
@@ -1723,30 +1693,31 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1723 | 1693 |
permIndex = as.character(permCur) |
1724 | 1694 |
permSuffix = paste0(dplyr::if_else(permCur == 0, " (real)", " (permuted)")) |
1725 | 1695 |
|
1726 |
- titlePlot = paste0("Number of unique ", gsub("^n", "", elemCur), permSuffix, |
|
1727 |
- "\nallowMissingTFs: ", allowMissingTFsCur, |
|
1728 |
- ", allowMissingGenes: ", allowMissingGenesCur, |
|
1729 |
- ",\n TF_peak.connectionType: ", TF_peak.connectionTypeCur) |
|
1730 |
- |
|
1731 |
- |
|
1732 |
- colors = circlize::colorRamp2(c(0, maxDataRange), c("white", "red")) |
|
1733 |
- |
|
1734 |
- ComplexHeatmap::Heatmap( |
|
1735 |
- plotData.l[[permIndex]], |
|
1736 |
- name = "Number of\nconnections", |
|
1737 |
- col = colors, |
|
1738 |
- cluster_columns = FALSE, cluster_rows = FALSE, |
|
1739 |
- row_names_side = "right", row_names_gp = grid::gpar(fontsize = 10), |
|
1740 |
- column_title = titlePlot, |
|
1741 |
- column_names_gp = grid::gpar(fontsize = 10), |
|
1742 |
- row_title = "TF-peak FDR", |
|
1743 |
- cell_fun = function(j, i, x, y, width, height, fill) { |
|
1744 |
- grid::grid.text(sprintf("%.0f", plotData.l[[permIndex]][i, j]), x, y, gp = grid::gpar(fontsize = 10)) |
|
1745 |
- } |
|
1746 |
- ) %>% plot() |
|
1747 |
- |
|
1748 |
- |
|
1749 |
- |
|
1696 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1697 |
+ |
|
1698 |
+ titlePlot = paste0("Number of unique ", gsub("^n", "", elemCur), permSuffix, |
|
1699 |
+ "\nallowMissingTFs: ", allowMissingTFsCur, |
|
1700 |
+ ", allowMissingGenes: ", allowMissingGenesCur, |
|
1701 |
+ ",\n TF_peak.connectionType: ", TF_peak.connectionTypeCur) |
|
1702 |
+ |
|
1703 |
+ colors = circlize::colorRamp2(c(0, maxDataRange), c("white", "red")) |
|
1704 |
+ |
|
1705 |
+ ComplexHeatmap::Heatmap( |
|
1706 |
+ plotData.l[[permIndex]], |
|
1707 |
+ name = "Number of\nconnections", |
|
1708 |
+ col = colors, |
|
1709 |
+ cluster_columns = FALSE, cluster_rows = FALSE, |
|
1710 |
+ row_names_side = "right", row_names_gp = grid::gpar(fontsize = 10), |
|
1711 |
+ column_title = titlePlot, |
|
1712 |
+ column_names_gp = grid::gpar(fontsize = 10), |
|
1713 |
+ row_title = "TF-peak FDR", |
|
1714 |
+ cell_fun = function(j, i, x, y, width, height, fill) { |
|
1715 |
+ grid::grid.text(sprintf("%.0f", plotData.l[[permIndex]][i, j]), x, y, gp = grid::gpar(fontsize = 30)) |
|
1716 |
+ } |
|
1717 |
+ ) %>% plot() |
|
1718 |
+ |
|
1719 |
+ } |
|
1720 |
+ pageCounter = pageCounter + 1 |
|
1750 | 1721 |
|
1751 | 1722 |
} |
1752 | 1723 |
|
... | ... |
@@ -1757,6 +1728,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1757 | 1728 |
} |
1758 | 1729 |
} |
1759 | 1730 |
|
1731 |
+ .printWarningPageNumber(pages, pageCounter) |
|
1760 | 1732 |
if (!is.null(file)) { |
1761 | 1733 |
grDevices::dev.off() |
1762 | 1734 |
} |
... | ... |
@@ -1768,7 +1740,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1768 | 1740 |
|
1769 | 1741 |
} |
1770 | 1742 |
|
1771 |
-.plot_stats_connectionSummaryDensity <- function(GRN, file = file, TF_peak.connectionType, allowMissingTFs = FALSE, allowMissingGenes = FALSE, pdf_width = 12, pdf_height = 12, forceRerun = FALSE) { |
|
1743 |
+.plot_stats_connectionSummaryDensity <- function(GRN, file = file, TF_peak.connectionType, allowMissingTFs = FALSE, allowMissingGenes = FALSE, pdf_width = 12, pdf_height = 12, pages = NULL, forceRerun = FALSE) { |
|
1772 | 1744 |
|
1773 | 1745 |
# TODO |
1774 | 1746 |
# Same for both permutations, in one plot |
... | ... |
@@ -1788,11 +1760,11 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1788 | 1760 |
|
1789 | 1761 |
|
1790 | 1762 |
# Compare real and permuted one, replicate Darias plots |
1791 |
-.plot_stats_connectionSummaryBoxplot <- function(GRN, file = NULL, pdf_width = 12, pdf_height = 12, forceRerun = FALSE) { |
|
1763 |
+.plot_stats_connectionSummaryBoxplot <- function(GRN, file = NULL, pdf_width = 12, pdf_height = 12, pages = NULL, forceRerun = FALSE) { |
|
1792 | 1764 |
|
1793 | 1765 |
start = Sys.time() |
1794 | 1766 |
|
1795 |
- if ((!is.null(file) && !file.exists(file))| forceRerun) { |
|
1767 |
+ if ((!is.null(file) && !file.exists(file))| is.null(file) | forceRerun) { |
|
1796 | 1768 |
|
1797 | 1769 |
|
1798 | 1770 |
futile.logger::flog.info(paste0("Plotting diagnostic plots for network connections", dplyr::if_else(is.null(file), "", paste0(" to file ", file)))) |
... | ... |
@@ -1816,7 +1788,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1816 | 1788 |
pb <- progress::progress_bar$new(total = nElemsTotal) |
1817 | 1789 |
|
1818 | 1790 |
|
1819 |
- |
|
1791 |
+ pageCounter = 1 |
|
1820 | 1792 |
for (TF_peak.fdr_cur in rev(TF_peak.fdrs)) { |
1821 | 1793 |
|
1822 | 1794 |
for (peak_gene.fdr_cur in peak_gene.fdrs) { |
... | ... |
@@ -1828,64 +1800,70 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1828 | 1800 |
for (TF_peak.connectionTypeCur in .getAll_TF_peak_connectionTypes(GRN)) { |
1829 | 1801 |
|
1830 | 1802 |
pb$tick() |
1831 |
- elemCur = "TF" |
|
1832 |
- dataCur0.df = as.data.frame(stats_details.l[["0"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1833 |
- dataCur1.df = as.data.frame(stats_details.l[["1"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1834 |
- |
|
1835 |
- if (nrow(dataCur0.df) == 0) { |
|
1836 |
- dataCur0.df = data.frame(. = "MISSING", Freq = 0) |
|
1837 |
- } |
|
1838 |
- if (nrow(dataCur1.df) == 0) { |
|
1839 |
- dataCur1.df = data.frame(. = "MISSING", Freq = 0) |
|
1803 |
+ |
|
1804 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
1805 |
+ |
|
1806 |
+ elemCur = "TF" |
|
1807 |
+ dataCur0.df = as.data.frame(stats_details.l[["0"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1808 |
+ dataCur1.df = as.data.frame(stats_details.l[["1"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1809 |
+ |
|
1810 |
+ if (nrow(dataCur0.df) == 0) { |
|
1811 |
+ dataCur0.df = data.frame(. = "MISSING", Freq = 0) |
|
1812 |
+ } |
|
1813 |
+ if (nrow(dataCur1.df) == 0) { |
|
1814 |
+ dataCur1.df = data.frame(. = "MISSING", Freq = 0) |
|
1815 |
+ } |
|
1816 |
+ |
|
1817 |
+ dataCur.df = rbind(dplyr::mutate(dataCur0.df, networkType = "real", connectionType = elemCur), |
|
1818 |
+ dplyr::mutate(dataCur1.df, networkType = "permuted", connectionType = elemCur)) %>% |
|
1819 |
+ tibble::as_tibble() |
|
1820 |
+ |
|
1821 |
+ for (elemCur in c("gene","peak.gene","peak.TF")) { |
|
1822 |
+ |
|
1823 |
+ dataCur0.df = as.data.frame(stats_details.l[["0"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1824 |
+ dataCur1.df = as.data.frame(stats_details.l[["1"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1825 |
+ |
|
1826 |
+ if (nrow(dataCur0.df) == 0) { |
|
1827 |
+ dataCur0.df = data.frame(. = "MISSING", Freq = 0) |
|
1828 |
+ } |
|
1829 |
+ if (nrow(dataCur1.df) == 0) { |
|
1830 |
+ dataCur1.df = data.frame(. = "MISSING", Freq = 0) |
|
1831 |
+ } |
|
1832 |
+ |
|
1833 |
+ dataCur.df = rbind(dataCur.df, |
|
1834 |
+ dplyr::mutate(dataCur0.df, networkType = "real", connectionType = elemCur), |
|
1835 |
+ dplyr::mutate(dataCur1.df, networkType = "permuted", connectionType = elemCur)) |
|
1836 |
+ |
|
1837 |
+ } |
|
1838 |
+ |
|
1839 |
+ dataCur.df$networkType =as.factor(dataCur.df$networkType) |
|
1840 |
+ |
|
1841 |
+ titlePlot = paste0("TF_peak.fdr: ", TF_peak.fdr_cur, |
|
1842 |
+ ", peak_gene.fdr: ", peak_gene.fdr_cur, |
|
1843 |
+ "\nallowMissingTFs: ", allowMissingTFsCur, |
|
1844 |
+ ", allowMissingGenes: ", allowMissingGenesCur, |
|
1845 |
+ ",\n TF_peak.connectionType: ", TF_peak.connectionTypeCur) |
|
1846 |
+ |
|
1847 |
+ if (max(dataCur.df$Freq)> 0) { |
|
1848 |
+ |
|
1849 |
+ g = ggplot(dataCur.df, aes(networkType, log10(Freq), fill = networkType)) + geom_boxplot() +theme_bw() + |
|
1850 |
+ xlab("Network type") +ylab(paste0("log10(Number of connections per category", ", empty = 0)")) + |
|
1851 |
+ ggtitle(titlePlot) + |
|
1852 |
+ facet_wrap(~connectionType, scales = "free") + |
|
1853 |
+ scale_fill_manual("Network type", values = c("real" = "red", "permuted" = "gray"), |
|
1854 |
+ labels = c("real" = "real", "permuted" = "permuted"), drop = FALSE) |
|
1855 |
+ |
|
1856 |
+ suppressWarnings(plot(g)) |
|
1857 |
+ |
|
1858 |
+ } else { |
|
1859 |
+ |
|
1860 |
+ plot(c(0, 1), c(0, 1), ann = FALSE, bty = 'n', type = 'n', axes=FALSE, main = titlePlot) |
|
1861 |
+ message = paste0(titlePlot, "\nNo data to show") |
|
1862 |
+ text(x = 0.5, y = 0.5, message, cex = 1.2, col = "red") |
|
1863 |
+ } |
|
1840 | 1864 |
} |
1865 |
+ pageCounter = pageCounter + 1 |
|
1841 | 1866 |
|
1842 |
- dataCur.df = rbind(dplyr::mutate(dataCur0.df, networkType = "real", connectionType = elemCur), |
|
1843 |
- dplyr::mutate(dataCur1.df, networkType = "permuted", connectionType = elemCur)) %>% |
|
1844 |
- tibble::as_tibble() |
|
1845 |
- |
|
1846 |
- for (elemCur in c("gene","peak.gene","peak.TF")) { |
|
1847 |
- |
|
1848 |
- dataCur0.df = as.data.frame(stats_details.l[["0"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1849 |
- dataCur1.df = as.data.frame(stats_details.l[["1"]][[TF_peak.fdr_cur]][[peak_gene.fdr_cur]] [[allowMissingTFsCur]] [[allowMissingGenesCur]] [[TF_peak.connectionTypeCur]] [[elemCur]]) |
|
1850 |
- |
|
1851 |
- if (nrow(dataCur0.df) == 0) { |
|
1852 |
- dataCur0.df = data.frame(. = "MISSING", Freq = 0) |
|
1853 |
- } |
|
1854 |
- if (nrow(dataCur1.df) == 0) { |
|
1855 |
- dataCur1.df = data.frame(. = "MISSING", Freq = 0) |
|
1856 |
- } |
|
1857 |
- |
|
1858 |
- dataCur.df = rbind(dataCur.df, |
|
1859 |
- dplyr::mutate(dataCur0.df, networkType = "real", connectionType = elemCur), |
|
1860 |
- dplyr::mutate(dataCur1.df, networkType = "permuted", connectionType = elemCur)) |
|
1861 |
- |
|
1862 |
- } |
|
1863 |
- |
|
1864 |
- dataCur.df$networkType =as.factor(dataCur.df$networkType) |
|
1865 |
- |
|
1866 |
- titlePlot = paste0("TF_peak.fdr: ", TF_peak.fdr_cur, |
|
1867 |
- ", peak_gene.fdr: ", peak_gene.fdr_cur, |
|
1868 |
- "\nallowMissingTFs: ", allowMissingTFsCur, |
|
1869 |
- ", allowMissingGenes: ", allowMissingGenesCur, |
|
1870 |
- ",\n TF_peak.connectionType: ", TF_peak.connectionTypeCur) |
|
1871 |
- |
|
1872 |
- if (max(dataCur.df$Freq)> 0) { |
|
1873 |
- |
|
1874 |
- g = ggplot(dataCur.df, aes(networkType, log10(Freq), fill = networkType)) + geom_boxplot() +theme_bw() + |
|
1875 |
- xlab("Network type") +ylab(paste0("log10(Number of connections per category", ", empty = 0)")) + |
|
1876 |
- ggtitle(titlePlot) + |
|
1877 |
- facet_wrap(~connectionType, scales = "free") + |
|
1878 |
- scale_fill_manual("Network type", values = c("real" = "red", "permuted" = "gray"), |
|
1879 |
- labels = c("real" = "real", "permuted" = "permuted"), drop = FALSE) |
|
1880 |
- |
|
1881 |
- suppressWarnings(plot(g)) |
|
1882 |
- |
|
1883 |
- } else { |
|
1884 |
- |
|
1885 |
- plot(c(0, 1), c(0, 1), ann = FALSE, bty = 'n', type = 'n', axes=FALSE, main = titlePlot) |
|
1886 |
- message = paste0(titlePlot, "\nNo data to show") |
|
1887 |
- text(x = 0.5, y = 0.5, message, cex = 1.2, col = "red") |
|
1888 |
- } |
|
1889 | 1867 |
|
1890 | 1868 |
} #end for (TF_peak.connectionTypeCur in TF_peak.connectionTypesAll) |
1891 | 1869 |
|
... | ... |
@@ -1897,6 +1875,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1897 | 1875 |
|
1898 | 1876 |
} # end for TF_peak.fdr_cur |
1899 | 1877 |
|
1878 |
+ .printWarningPageNumber(pages, pageCounter) |
|
1900 | 1879 |
if (!is.null(file)) { |
1901 | 1880 |
grDevices::dev.off() |
1902 | 1881 |
} |
... | ... |
@@ -1908,9 +1887,6 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1908 | 1887 |
} |
1909 | 1888 |
|
1910 | 1889 |
|
1911 |
- |
|
1912 |
- |
|
1913 |
- |
|
1914 | 1890 |
######## General & communities Stats Functions ######## |
1915 | 1891 |
|
1916 | 1892 |
|
... | ... |
@@ -1924,6 +1900,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1924 | 1900 |
#' @template plotAsPDF |
1925 | 1901 |
#' @template pdf_width |
1926 | 1902 |
#' @template pdf_height |
1903 |
+#' @template pages |
|
1927 | 1904 |
#' @return The same \code{\linkS4class{GRN}} object with no changes. The results are output to a file. |
1928 | 1905 |
#' @seealso \code{\link{plotGeneralEnrichment}} |
1929 | 1906 |
#' @seealso \code{\link{plotCommunitiesStats}} |
... | ... |
@@ -1934,7 +1911,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1934 | 1911 |
#' GRN = plotGeneralGraphStats(GRN, plotAsPDF = FALSE) |
1935 | 1912 |
#' @export |
1936 | 1913 |
plotGeneralGraphStats <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
1937 |
- plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, |
|
1914 |
+ plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, pages = NULL, |
|
1938 | 1915 |
forceRerun = FALSE) { |
1939 | 1916 |
|
1940 | 1917 |
start = Sys.time() |
... | ... |
@@ -1944,6 +1921,7 @@ plotGeneralGraphStats <- function(GRN, outputFolder = NULL, basenameOutput = NUL |
1944 | 1921 |
checkmate::assertFlag(plotAsPDF) |
1945 | 1922 |
checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
1946 | 1923 |
checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
1924 |
+ checkmate::assert(checkmate::checkNull(pages), checkmate::checkIntegerish(pages, lower = 1)) |
|
1947 | 1925 |
checkmate::assertFlag(forceRerun) |
1948 | 1926 |
checkmate::assert(checkmate::checkNull(outputFolder), checkmate::checkCharacter(outputFolder, min.chars = 1)) |
1949 | 1927 |
checkmate::assert(checkmate::checkNull(basenameOutput), checkmate::checkCharacter(basenameOutput, len = 1, min.chars = 1, any.missing = FALSE)) |
... | ... |
@@ -1975,52 +1953,59 @@ plotGeneralGraphStats <- function(GRN, outputFolder = NULL, basenameOutput = NUL |
1975 | 1953 |
plot.title = element_text(hjust = 0.5)) |
1976 | 1954 |
|
1977 | 1955 |
|
1978 |
- gVertexDist = ggplot(totalVerteces, aes(x="", y=Count, fill=Class)) + geom_bar(stat="identity") + |
|
1979 |
- coord_polar("y", start=0) + |
|
1980 |
- scale_fill_manual(values = c("#3B9AB2", "#F21A00", "#E1AF00")) + |
|
1981 |
- geom_text(aes(label = paste0(round(Count/sum(Count) *100, digits = 1), "%")), |
|
1982 |
- position = position_stack(vjust = 0.5)) + |
|
1983 |
- theme_plots + |
|
1984 |
- ggtitle(paste0("Vertices (n=", sum(totalVerteces$Count), ")")) |
|
1985 |
- |
|
1986 |
- geneDist = as.data.frame(table(droplevels(GRN@connections$all.filtered$`0`$gene.type))) |
|
1987 |
- if (nrow(geneDist) == 0) { |
|
1988 |
- gGeneDist = |
|
1989 |
- ggplot() + |
|
1990 |
- annotate("text", x = 4, y = 25, size=5, color = "red", label = "Nothing to plot:\nNo genes found") + |
|
1991 |
- theme_void() |
|
1992 |
- |
|
1993 |
- |
|
1994 |
- message = "No genes found in the GRN object. Make sure the filtered connections contain also genes." |
|
1995 |