... | ... |
@@ -2580,11 +2580,11 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2580 | 2580 |
|
2581 | 2581 |
|
2582 | 2582 |
# Finally, do the actual overlaps |
2583 |
- overlapsAll = GenomicRanges::findOverlaps(query, subject, |
|
2583 |
+ overlapsAll = suppressWarnings(GenomicRanges::findOverlaps(query, subject, |
|
2584 | 2584 |
minoverlap = 1, |
2585 | 2585 |
type = "any", |
2586 | 2586 |
select = "all", |
2587 |
- ignore.strand = TRUE) |
|
2587 |
+ ignore.strand = TRUE)) |
|
2588 | 2588 |
|
2589 | 2589 |
|
2590 | 2590 |
query_row_ids = S4Vectors::queryHits(overlapsAll) |
... | ... |
@@ -4211,7 +4211,8 @@ getGRNConnections <- function(GRN, type = "all.filtered", permuted = FALSE, inc |
4211 | 4211 |
"plot_communityStats" = "GRN.community_stats", |
4212 | 4212 |
"plot_communityEnrichment" = "GRN.community_enrichment", |
4213 | 4213 |
"plot_generalNetworkStats" = "GRN.overall_stats", |
4214 |
- "plot_TFEnrichment" = "GRN.TF_enrichment" |
|
4214 |
+ "plot_TFEnrichment" = "GRN.TF_enrichment", |
|
4215 |
+ "plot_network" = "GRN.network_visualisation" |
|
4215 | 4216 |
|
4216 | 4217 |
) |
4217 | 4218 |
|
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
#' |
7 | 7 |
#' @template GRN |
8 | 8 |
#' @template outputFolder |
9 |
-#' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data). |
|
9 |
+#' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Default \code{c("rna", "peaks")}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data). |
|
10 | 10 |
#' @param topn Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default). |
11 | 11 |
#' @param type Character. One of or a combination of \code{"raw"}, \code{"normalized"}, \code{"all"}. Default \code{c("raw", "normalized")}. Should the PCA plots be done based on the raw or normalized data, respectively? |
12 | 12 |
#' @template forceRerun |
... | ... |
@@ -525,6 +525,9 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
525 | 525 |
if (!permutationCur %in% dataType2) { |
526 | 526 |
next |
527 | 527 |
} |
528 |
+ |
|
529 |
+ futile.logger::flog.info(paste0("\n Plotting for permutation ", permutationCur)) |
|
530 |
+ |
|
528 | 531 |
|
529 | 532 |
suffixFile = .getPermutationSuffixStr(permutationCur) |
530 | 533 |
|
... | ... |
@@ -677,8 +680,15 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
677 | 680 |
|
678 | 681 |
pageCounter = 1 |
679 | 682 |
|
680 |
- plots.l = list() |
|
681 |
- index = 1 |
|
683 |
+ if (!is.null(file)) { |
|
684 |
+ .checkOutputFile(file) |
|
685 |
+ grDevices::pdf(file, width = width, height = height) |
|
686 |
+ futile.logger::flog.info(paste0("Plotting to file ", file)) |
|
687 |
+ } else { |
|
688 |
+ futile.logger::flog.info(paste0("Plotting directly")) |
|
689 |
+ } |
|
690 |
+ |
|
691 |
+ |
|
682 | 692 |
|
683 | 693 |
# Dont take all TF, some might be missing. |
684 | 694 |
connections_TF_peak = GRN@connections$TF_peaks[[as.character(perm)]]$connectionStats |
... | ... |
@@ -692,10 +702,10 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
692 | 702 |
|
693 | 703 |
pb <- progress::progress_bar$new(total = nTF) |
694 | 704 |
|
695 |
- levels_pos<-unique(as.character(cut(GRN@config$parameters$internal$stepsFDR, breaks = GRN@config$parameters$internal$stepsFDR, right = FALSE, include.lowest = TRUE ))) |
|
696 |
- levels_neg<-unique(as.character(cut(GRN@config$parameters$internal$stepsFDR, breaks = rev(GRN@config$parameters$internal$stepsFDR), right = TRUE, include.lowest = TRUE ))) |
|
705 |
+ steps = GRN@config$parameters$internal$stepsFDR |
|
697 | 706 |
|
698 |
- |
|
707 |
+ levels_pos<-unique(as.character(cut(steps, breaks = steps, right = FALSE, include.lowest = TRUE ))) |
|
708 |
+ levels_neg<-unique(as.character(cut(steps, breaks = rev(steps), right = TRUE, include.lowest = TRUE ))) |
|
699 | 709 |
|
700 | 710 |
for (i in seq_len(nTF)) { |
701 | 711 |
pb$tick() |
... | ... |
@@ -704,6 +714,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
704 | 714 |
connections_TF_peakCur = dplyr::filter(connections_TF_peak, TF.name == TFCur) |
705 | 715 |
|
706 | 716 |
# Produce two FDR plots, coming from both directions |
717 |
+ plotsCur.l = list() |
|
707 | 718 |
for (typeCur in c("pos","neg")) { |
708 | 719 |
|
709 | 720 |
if (typeCur == "pos") { |
... | ... |
@@ -718,6 +729,8 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
718 | 729 |
dplyr::mutate(TF_peak.r_bin = factor(TF_peak.r_bin, levels = levelsCur)) %>% |
719 | 730 |
reshape2::melt(id = c("TF_peak.r_bin", "TF_peak.connectionType", "n")) |
720 | 731 |
|
732 |
+ plotTitle = paste0(TFCur, ": Direction ", typeCur, ifelse(perm == 0, "", "(permuted)")) |
|
733 |
+ |
|
721 | 734 |
if (!useGCCorrection) { |
722 | 735 |
|
723 | 736 |
g = suppressWarnings(connections_TF_peak.filt %>% |
... | ... |
@@ -725,7 +738,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
725 | 738 |
ggplot(aes(TF_peak.r_bin, value, color = .data$n, shape = variable)) + |
726 | 739 |
geom_point(na.rm = TRUE) + |
727 | 740 |
facet_wrap(~TF_peak.connectionType, ncol = 1) + |
728 |
- labs(x="Correlation bin", y="FDR TF-peak", title= paste0(TFCur, ": Direction ", typeCur)) + |
|
741 |
+ labs(x="Correlation bin", y="FDR TF-peak", title= plotTitle) + |
|
729 | 742 |
theme_bw() + |
730 | 743 |
theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) + |
731 | 744 |
scale_x_discrete(drop=FALSE) + |
... | ... |
@@ -735,6 +748,8 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
735 | 748 |
values = c("TF_peak.fdr" = 1)) + |
736 | 749 |
scale_color_viridis_c("No. of connections") |
737 | 750 |
) |
751 |
+ plotsCur.l[[typeCur]] = g |
|
752 |
+ |
|
738 | 753 |
|
739 | 754 |
if (plotDetails) { |
740 | 755 |
|
... | ... |
@@ -742,7 +757,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
742 | 757 |
ggplot(aes(TF_peak.r_bin, log10(value+1), color = variable)) + |
743 | 758 |
geom_point(size = 1) + |
744 | 759 |
facet_wrap(~TF_peak.connectionType, ncol = 1) + |
745 |
- labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= paste0(TFCur, ": Direction ", typeCur)) + |
|
760 |
+ labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= plotTitle) + |
|
746 | 761 |
theme_bw() + |
747 | 762 |
theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) + |
748 | 763 |
scale_x_discrete(drop=FALSE) + |
... | ... |
@@ -753,6 +768,8 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
753 | 768 |
labels = c("tpvalue" = "True positives\n(foreground, scaling ref.)\n", |
754 | 769 |
"fpvalue" = "False positives\n(background, unscaled)\n", |
755 | 770 |
"fpvalue_norm" = "False positives\n(background, scaled to ref.)")) |
771 |
+ |
|
772 |
+ plotsCur.l[[paste0(typeCur, "_details")]] = g |
|
756 | 773 |
} |
757 | 774 |
|
758 | 775 |
|
... | ... |
@@ -764,7 +781,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
764 | 781 |
ggplot(aes(TF_peak.r_bin, value, color = .data$n, shape = variable, alpha = variable)) + |
765 | 782 |
geom_point(na.rm = TRUE) + |
766 | 783 |
facet_wrap(~TF_peak.connectionType, ncol = 1) + |
767 |
- labs(x="Correlation bin", y="FDR TF-peak", title= paste0(TFCur, ": Direction ", typeCur)) + |
|
784 |
+ labs(x="Correlation bin", y="FDR TF-peak", title= plotTitle) + |
|
768 | 785 |
theme_bw() + |
769 | 786 |
theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) + |
770 | 787 |
scale_x_discrete(drop=FALSE) + |
... | ... |
@@ -774,13 +791,15 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
774 | 791 |
values = c("TF_peak.fdr" = 3,"TF_peak.fdr_orig" = 1)) + |
775 | 792 |
scale_color_viridis_c("No. of connections") + |
776 | 793 |
scale_alpha_discrete(range = c("TF_peak.fdr" = 0.5,"TF_peak.fdr_orig" = 1)) + |
777 |
- guides(alpha = FALSE)) |
|
794 |
+ guides(alpha = FALSE) |
|
795 |
+ ) |
|
796 |
+ plotsCur.l[[typeCur]] = g |
|
778 | 797 |
|
779 | 798 |
if (plotDetails) { |
780 | 799 |
g2 = ggplot(connections_TF_peak.filt %>% dplyr::filter(grepl("value", variable)), aes(TF_peak.r_bin, log10(value+1), color = variable)) + |
781 | 800 |
geom_point(size = 1) + |
782 | 801 |
facet_wrap(~TF_peak.connectionType, ncol = 1) + |
783 |
- labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= paste0(TFCur, ": Direction ", typeCur)) + |
|
802 |
+ labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= plotTitle) + |
|
784 | 803 |
theme_bw() + |
785 | 804 |
theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) + |
786 | 805 |
scale_x_discrete(drop=FALSE) + |
... | ... |
@@ -794,41 +813,36 @@ plotDiagnosticPlots_TFPeaks <- function(GRN, |
794 | 813 |
"fpvalue_norm_orig" = "False positives without GC\n(background, scaled to ref.)\n", |
795 | 814 |
"fpvalue" = "False positives with GC\n(background, unscaled)\n", |
796 | 815 |
"fpvalue_norm" = "False positives with GC\n(background, scaled to ref.)")) |
816 |
+ |
|
817 |
+ plotsCur.l[[paste0(typeCur, "_details")]] = g |
|
797 | 818 |
} |
798 | 819 |
|
799 | 820 |
} |
800 | 821 |
|
801 |
- # We now created either just one plot g or g + g2 for "one" direction |
|
802 |
- if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
803 |
- plots.l[[index]] = g |
|
804 |
- index = index + 1 |
|
805 |
- } |
|
806 |
- |
|
807 |
- |
|
808 |
- if (plotDetails) { |
|
809 |
- |
|
810 |
- if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
811 |
- plots.l[[index]] = g2 |
|
812 |
- index = index + 1 |
|
813 |
- } |
|
814 |
- |
|
815 |
- # Only here we already have a new page, as we produced 2 plots in this iteration alone |
|
816 |
- pageCounter = pageCounter + 1 |
|
817 |
- } |
|
818 |
- |
|
819 |
- |
|
820 | 822 |
} # end for both directions |
821 | 823 |
|
822 |
- if (!plotDetails) { |
|
824 |
+ # Lets create the plots |
|
825 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
826 |
+ plots_all = plotsCur.l$pos / plotsCur.l$neg |
|
827 |
+ plot(plots_all) |
|
828 |
+ } |
|
829 |
+ pageCounter = pageCounter + 1 |
|
830 |
+ |
|
831 |
+ if (plotDetails) { |
|
832 |
+ if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) { |
|
833 |
+ plots_all = plotsCur.l$pos_details / plotsCur.l$neg_details |
|
834 |
+ plot(plots_all) |
|
835 |
+ } |
|
823 | 836 |
pageCounter = pageCounter + 1 |
824 | 837 |
} |
825 |
- |
|
838 |
+ |
|
839 |
+ |
|
826 | 840 |
|
827 | 841 |
} # end allTF |
828 | 842 |
|
829 | 843 |
.printWarningPageNumber(pages, pageCounter) |
830 |
- |
|
831 |
- .printMultipleGraphsPerPage(plots.l, nCol = 1, nRow = 2, pdfFile = file, height = height, width = width) |
|
844 |
+ |
|
845 |
+ if (!is.null(file)) dev.off() |
|
832 | 846 |
|
833 | 847 |
.printExecutionTime(start) |
834 | 848 |
} |
... | ... |
@@ -1712,7 +1726,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap", |
1712 | 1726 |
column_names_gp = grid::gpar(fontsize = 10), |
1713 | 1727 |
row_title = "TF-peak FDR", |
1714 | 1728 |
cell_fun = function(j, i, x, y, width, height, fill) { |
1715 |
- grid::grid.text(sprintf("%.0f", plotData.l[[permIndex]][i, j]), x, y, gp = grid::gpar(fontsize = 30)) |
|
1729 |
+ grid::grid.text(sprintf("%.0f", plotData.l[[permIndex]][i, j]), x, y, gp = grid::gpar(fontsize = 20)) |
|
1716 | 1730 |
} |
1717 | 1731 |
) %>% plot() |
1718 | 1732 |
|
... | ... |
@@ -3240,55 +3254,69 @@ plotTFEnrichment <- function(GRN, rankType = "degree", n = NULL, TF.names = NULL |
3240 | 3254 |
|
3241 | 3255 |
############### GRN visualization ############### |
3242 | 3256 |
|
3243 |
-#' Visualize a filtered GRN (in development) |
|
3257 |
+#' Visualize a filtered GRN. |
|
3244 | 3258 |
#' |
3245 | 3259 |
#' @template GRN |
3246 | 3260 |
#' @template permuted |
3247 |
-#' @param file TODO |
|
3248 |
-#' @param title TODO |
|
3249 |
-#' @param maxRowsToPlot TODO |
|
3250 |
-#' @param useDefaultMetadata TODO |
|
3251 |
-#' @param vertice_color_TFs TODO |
|
3252 |
-#' @param vertice_color_peaks TODO |
|
3253 |
-#' @param vertice_color_genes TODO |
|
3261 |
+#' @template outputFolder |
|
3262 |
+#' @template basenameOutput |
|
3254 | 3263 |
#' @template plotAsPDF |
3255 | 3264 |
#' @template pdf_width |
3256 | 3265 |
#' @template pdf_height |
3266 |
+#' @param title NULL or Character. Default NULL. Title to be assigned to the plot. |
|
3267 |
+#' @param maxRowsToPlot Numeric. Default 500. Refers to the maximum number of connections to be plotted. |
|
3268 |
+#' @param graph Character. Default \code{TF-gene}. One of: \code{TF-gene}, \code{TF-peak-gene}. Whether to plot a graph with links from TFs to peaks to gene, or the graph with the inferred TF to gene connections. |
|
3269 |
+#' @param colorby Character. Default \code{type}. One of \code{type}, code \code{community}. Color the vertices by either type (TF/peak/gene) or community. See \code{\link{calculateCommunitiesStats}} |
|
3270 |
+#' @param layered Boolean. Default \code{FALSE}. Display the network in a layered format where each layer corresponds to a node type (TF/peak/gene). |
|
3271 |
+#' @param vertice_color_TFs Named list. Default \code{list(h = 10, c = 85, l = c(25, 95))}. The list must specify the color in hcl format (hue, chroma, luminence). See the \code{\link[colorspace]} package for more details and examples |
|
3272 |
+#' @param vertice_color_peaks Named list. Default \code{list(h = 135, c = 45, l = c(35, 95))}. |
|
3273 |
+#' @param vertice_color_genes Named list. Default \code{list(h = 260, c = 80, l = c(30, 90))}. |
|
3274 |
+#' @param vertexLabel_cex Numeric. Default \code{0.4}. Font size (multiplication factor, device-dependent) |
|
3275 |
+#' @param vertexLabel_dist Numeric. Default \code{0} vertex. Distance between the label and the vertex. |
|
3257 | 3276 |
#' @template forceRerun |
3258 |
- |
|
3259 |
-#' @return TODO |
|
3260 |
-# #' @export |
|
3261 |
-visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaultMetadata = TRUE, |
|
3262 |
- graph = "TF-gene" , colorby = "type", layered = FALSE, |
|
3263 |
- vertice_color_TFs = NULL, vertice_color_peaks = NULL, vertice_color_genes = NULL, |
|
3264 |
- plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, |
|
3277 |
+#' @example |
|
3278 |
+#' GRN = visualizeGRN(GRN, maxRowsToPlot = 700, graph = "TF-peak-gene", colorby = "type") |
|
3279 |
+#' @return the GRN object |
|
3280 |
+#' @export |
|
3281 |
+visualizeGRN <- function(GRN, outputFolder = NULL, basenameOutput = NULL, plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, |
|
3282 |
+ title = NULL, maxRowsToPlot = 500, graph = "TF-gene" , colorby = "type", layered = FALSE, |
|
3283 |
+ vertice_color_TFs = list(h = 10, c = 85, l = c(25, 95)), vertice_color_peaks = list(h = 135, c = 45, l = c(35, 95)), vertice_color_genes = list(h = 260, c = 80, l = c(30, 90)), |
|
3284 |
+ vertexLabel_cex = 0.4, vertexLabel_dist = 0, |
|
3285 |
+ |
|
3265 | 3286 |
forceRerun = FALSE |
3266 | 3287 |
) { |
3267 | 3288 |
|
3289 |
+ |
|
3268 | 3290 |
start = Sys.time() |
3269 | 3291 |
GRN = .addFunctionLogToObject(GRN) |
3270 | 3292 |
|
3271 |
- #checkmate::assertFlag(plotAsPDF) |
|
3293 |
+ checkmate::assertFlag(plotAsPDF) |
|
3294 |
+ checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
|
3295 |
+ checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
|
3272 | 3296 |
checkmate::assertNumeric(maxRowsToPlot) |
3273 |
- checkmate::assertFlag(useDefaultMetadata) |
|
3274 | 3297 |
checkmate::assertSubset(graph, c("TF-gene", "TF-peak-gene")) |
3275 | 3298 |
checkmate::assertSubset(colorby, c("type", "community")) |
3276 | 3299 |
checkmate::assertFlag(layered) |
3277 |
- checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
|
3278 |
- checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
|
3279 |
- |
|
3280 |
- |
|
3281 |
- futile.logger::flog.info(paste0("Plotting GRN network", dplyr::if_else(is.null(file), "", paste0(" to file ", file)))) |
|
3282 |
- |
|
3283 |
- if (useDefaultMetadata) { |
|
3284 |
- metadata_visualization.l = getBasic_metadata_visualization(GRN) |
|
3285 |
- vertice_color_TFs = list(metadata_visualization.l[["RNA_expression_TF"]], "HOCOID", "baseMean_log") |
|
3286 |
- vertice_color_genes = list(metadata_visualization.l[["RNA_expression_genes"]], "ENSEMBL_ID", "baseMean_log") |
|
3287 |
- vertice_color_peaks = list(metadata_visualization.l[["Peaks_accessibility"]], "peakID", "mean_log") |
|
3288 |
- } |
|
3289 |
- |
|
3290 |
- |
|
3291 |
- |
|
3300 |
+ checkmate::assertList(vertice_color_TFs) |
|
3301 |
+ checkmate::assertNames(names(vertice_color_TFs), must.include = c("h", "c", "l"), subset.of = c("h", "c", "l")) |
|
3302 |
+ checkmate::assertList(vertice_color_peaks) |
|
3303 |
+ checkmate::assertNames(names(vertice_color_peaks), must.include = c("h", "c", "l"), subset.of = c("h", "c", "l")) |
|
3304 |
+ checkmate::assertList(vertice_color_genes) |
|
3305 |
+ checkmate::assertNames(names(vertice_color_genes), must.include = c("h", "c", "l"), subset.of = c("h", "c", "l")) |
|
3306 |
+ checkmate::assertNumeric(vertexLabel_cex) |
|
3307 |
+ checkmate::assertNumeric(vertexLabel_dist) |
|
3308 |
+ checkmate::assertFlag(forceRerun) |
|
3309 |
+ |
|
3310 |
+ |
|
3311 |
+ outputFolder = .checkOutputFolder(GRN, outputFolder) |
|
3312 |
+ |
|
3313 |
+ # if (useDefaultMetadata) { |
|
3314 |
+ # metadata_visualization.l = getBasic_metadata_visualization(GRN) |
|
3315 |
+ # vertice_color_TFs = list(metadata_visualization.l[["RNA_expression_TF"]], "HOCOID", "baseMean_log") |
|
3316 |
+ # vertice_color_genes = list(metadata_visualization.l[["RNA_expression_genes"]], "ENSEMBL_ID", "baseMean_log") |
|
3317 |
+ # vertice_color_peaks = list(metadata_visualization.l[["Peaks_accessibility"]], "peakID", "mean_log") |
|
3318 |
+ # } |
|
3319 |
+ # |
|
3292 | 3320 |
#grn.merged = getGRNConnections(GRN, permuted = permuted, type = "all.filtered") |
3293 | 3321 |
# check that it's in sync with the @ graph |
3294 | 3322 |
if (graph == "TF-gene"){ |
... | ... |
@@ -3335,10 +3363,10 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3335 | 3363 |
|
3336 | 3364 |
|
3337 | 3365 |
if (plotAsPDF) { |
3338 |
- grDevices::pdf(file = file, width = pdf_width, height = pdf_height) |
|
3339 |
- futile.logger::flog.info(paste0("Plotting to file ", file)) |
|
3366 |
+ futile.logger::flog.info(paste0("Plotting GRN network to ", outputFolder, dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_network"), basenameOutput),".pdf")) |
|
3367 |
+ grDevices::pdf(file = paste0(outputFolder,"/", ifelse(is.null(basenameOutput), .getOutputFileName("plot_network"), basenameOutput),".pdf"),width = pdf_width, height = pdf_height ) |
|
3340 | 3368 |
} else { |
3341 |
- futile.logger::flog.info(paste0("Plotting directly")) |
|
3369 |
+ futile.logger::flog.info(paste0("Plotting GRN network")) |
|
3342 | 3370 |
} |
3343 | 3371 |
|
3344 | 3372 |
if (nRows > maxRowsToPlot) { |
... | ... |
@@ -3347,7 +3375,7 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3347 | 3375 |
message = paste0(title, "\n\nPlotting omitted.\n\nThe number of rows in the GRN (", nRows, ") exceeds the maximum of ", maxRowsToPlot, ".\nSee the maxRowsToPlot parameter to increase the limit") |
3348 | 3376 |
text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red") |
3349 | 3377 |
|
3350 |
- if (!is.null(file)) { |
|
3378 |
+ if (plotAsPDF) { |
|
3351 | 3379 |
grDevices::dev.off() |
3352 | 3380 |
} |
3353 | 3381 |
|
... | ... |
@@ -3394,30 +3422,31 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3394 | 3422 |
nBins_discard = 25 |
3395 | 3423 |
nBins_real = nBins_orig - nBins_discard |
3396 | 3424 |
|
3397 |
- if (!is.null(vertice_color_TFs)) { |
|
3398 |
- color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = 10, c = 85, l = c(25, 95)))[(nBins_discard + 1):nBins_orig] # red |
|
3399 |
- colors_categories.l[["TF"]] = c(color_gradient[1], color_gradient[nBins_real]) |
|
3400 |
- colors_categories.l[["TF"]] = color_gradient |
|
3401 |
- symbols_categories.l[["TF"]] = c(15,NA,15) |
|
3402 |
- } |
|
3425 |
+ #if (!is.null(vertice_color_TFs)) { |
|
3426 |
+ color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = vertice_color_TFs[["h"]], c = vertice_color_TFs[["c"]], l = vertice_color_TFs[["l"]]))[(nBins_discard + 1):nBins_orig] # red |
|
3427 |
+ colors_categories.l[["TF"]] = c(color_gradient[1], color_gradient[nBins_real]) |
|
3428 |
+ colors_categories.l[["TF"]] = color_gradient |
|
3429 |
+ symbols_categories.l[["TF"]] = c(15,NA,15) |
|
3430 |
+ vertice_color_TFs = append(list(metadata_visualization.l[["RNA_expression_TF"]], "HOCOID", "baseMean_log"), vertice_color_TFs) |
|
3431 |
+ #} |
|
3403 | 3432 |
|
3404 | 3433 |
if(graph == "TF-peak-gene"){ |
3405 |
- |
|
3406 |
- if (!is.null(vertice_color_peaks)) { |
|
3407 |
- color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = 135, c = 45, l = c(35, 95)))[(nBins_discard + 1):nBins_orig] # green |
|
3408 |
- colors_categories.l[["PEAK"]] = c(color_gradient[1], color_gradient[nBins_real]) |
|
3409 |
- colors_categories.l[["PEAK"]] = color_gradient |
|
3410 |
- symbols_categories.l[["PEAK"]] = c(21,NA,21) |
|
3411 |
- } |
|
3412 |
- |
|
3434 |
+ #if (!is.null(vertice_color_peaks)) { |
|
3435 |
+ color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = vertice_color_peaks[["h"]], c = vertice_color_peaks[["c"]], l = vertice_color_peaks[["l"]]))[(nBins_discard + 1):nBins_orig] # green |
|
3436 |
+ colors_categories.l[["PEAK"]] = c(color_gradient[1], color_gradient[nBins_real]) |
|
3437 |
+ colors_categories.l[["PEAK"]] = color_gradient |
|
3438 |
+ symbols_categories.l[["PEAK"]] = c(21,NA,21) |
|
3439 |
+ vertice_color_peaks = append(list(metadata_visualization.l[["Peaks_accessibility"]], "peakID", "mean_log"), vertice_color_peaks) |
|
3440 |
+ # } |
|
3413 | 3441 |
} |
3414 | 3442 |
|
3415 |
- if (!is.null(vertice_color_genes)) { |
|
3416 |
- color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = 260, c = 80, l = c(30, 90)))[(nBins_discard + 1):nBins_orig] # blue |
|
3417 |
- colors_categories.l[["GENE"]] = c(color_gradient[1], color_gradient[nBins_real]) |
|
3418 |
- colors_categories.l[["GENE"]] = color_gradient |
|
3419 |
- symbols_categories.l[["GENE"]] = c(21,NA,21) |
|
3420 |
- } |
|
3443 |
+ #if (!is.null(vertice_color_genes)) { |
|
3444 |
+ color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = vertice_color_genes[["h"]], c = vertice_color_genes[["c"]], l = vertice_color_genes[["l"]]))[(nBins_discard + 1):nBins_orig] # blue |
|
3445 |
+ colors_categories.l[["GENE"]] = c(color_gradient[1], color_gradient[nBins_real]) |
|
3446 |
+ colors_categories.l[["GENE"]] = color_gradient |
|
3447 |
+ symbols_categories.l[["GENE"]] = c(21,NA,21) |
|
3448 |
+ vertice_color_genes = append(list(metadata_visualization.l[["RNA_expression_genes"]], "ENSEMBL_ID", "baseMean_log"), vertice_color_genes) |
|
3449 |
+ #} |
|
3421 | 3450 |
|
3422 | 3451 |
## VERTICES ## |
3423 | 3452 |
|
... | ... |
@@ -3637,9 +3666,9 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3637 | 3666 |
ncommunities = length(unique(GRN@graph$TF_gene$clusterGraph$membership)) |
3638 | 3667 |
|
3639 | 3668 |
if (ncommunities >=8){ |
3640 |
- community_colors = data.frame(community = names(sort(table(GRN@graph$TF_gene$clusterGraph$membership), decreasing = T)[1:ncommunities]), |
|
3669 |
+ community_colors = data.frame(community = names(sort(table(GRN@graph$TF_gene$clusterGraph$membership), decreasing = T)[1:nCommunitiesMax]), |
|
3641 | 3670 |
color = rainbow(7)) |
3642 |
- fillercolors = data.frame(community = 8:ncommunities, color = "847E89") # only color the x largest communities |
|
3671 |
+ fillercolors = data.frame(community = nCommunitiesMax:ncommunities, color = "847E89") # only color the x largest communities |
|
3643 | 3672 |
community_colors = rbind(community_colors, fillercolors) |
3644 | 3673 |
|
3645 | 3674 |
}else{ |
... | ... |
@@ -3742,7 +3771,7 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3742 | 3771 |
|
3743 | 3772 |
|
3744 | 3773 |
# Calling plot.new() might be necessary here |
3745 |
- if(is.null(file)){ |
|
3774 |
+ if(!plotAsPDF){ |
|
3746 | 3775 |
plot.new() |
3747 | 3776 |
} |
3748 | 3777 |
par(mar=c(7,0,0,0) + 0.2) |
... | ... |
@@ -3759,12 +3788,12 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3759 | 3788 |
edge.width = igraph::E(net)$weight, |
3760 | 3789 |
vertex.label=igraph::V(net)$label, |
3761 | 3790 |
vertex.label.font=1, |
3762 |
- vertex.label.cex = 0.3, |
|
3791 |
+ vertex.label.cex = vertexLabel_cex, |
|
3763 | 3792 |
vertex.label.family="Helvetica", |
3764 | 3793 |
vertex.label.color = "black", |
3765 | 3794 |
vertex.label.degree= -pi/2, |
3766 | 3795 |
vertex.label=igraph::V(net)$label, |
3767 |
- vertex.label.dist= 0, |
|
3796 |
+ vertex.label.dist= vertexLabel_dist, |
|
3768 | 3797 |
vertex.shape = shape_vertex[igraph::V(net)$type], |
3769 | 3798 |
main = title |
3770 | 3799 |
) |
... | ... |
@@ -3834,22 +3863,20 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul |
3834 | 3863 |
|
3835 | 3864 |
} |
3836 | 3865 |
|
3837 |
- if (!is.null(file)) { |
|
3866 |
+ if (plotAsPDF) { |
|
3838 | 3867 |
grDevices::dev.off() |
3839 | 3868 |
} |
3840 | 3869 |
|
3841 | 3870 |
.printExecutionTime(start) |
3842 | 3871 |
|
3872 |
+ GRN |
|
3843 | 3873 |
} |
3844 | 3874 |
|
3845 | 3875 |
|
3846 | 3876 |
|
3847 |
- |
|
3848 |
- |
|
3849 |
- |
|
3850 | 3877 |
.verifyArgument_verticeType <- function(vertice_color_list) { |
3851 |
- |
|
3852 |
- checkmate::assertList(vertice_color_list, len = 3) |
|
3853 |
- checkmate::assertDataFrame(vertice_color_list[[1]]) |
|
3854 |
- checkmate::assertSubset(c(vertice_color_list[[2]], vertice_color_list[[3]]), colnames(vertice_color_list[[1]])) |
|
3878 |
+ |
|
3879 |
+ checkmate::assertList(vertice_color_list, len = 6) |
|
3880 |
+ checkmate::assertDataFrame(vertice_color_list[[1]]) |
|
3881 |
+ checkmate::assertSubset(c(vertice_color_list[[2]], vertice_color_list[[3]]), colnames(vertice_color_list[[1]])) |
|
3855 | 3882 |
} |
... | ... |
@@ -8,7 +8,7 @@ plotDiagnosticPlots_peakGene( |
8 | 8 |
GRN, |
9 | 9 |
outputFolder = NULL, |
10 | 10 |
basenameOutput = NULL, |
11 |
- gene.types = list(c("all"), c("protein_coding"), c("protein_coding", "lincRNA")), |
|
11 |
+ gene.types = list(c("protein_coding", "lincRNA")), |
|
12 | 12 |
useFiltered = FALSE, |
13 | 13 |
plotDetails = FALSE, |
14 | 14 |
plotPerTF = FALSE, |
... | ... |
@@ -26,7 +26,7 @@ plotDiagnosticPlots_peakGene( |
26 | 26 |
|
27 | 27 |
\item{basenameOutput}{\code{NULL} or character. Default \code{NULL}. Basename of the output files that are produced. If set to \code{NULL}, a default basename is chosen. If a custom basename is specified, all output files will have the chosen basename as file prefix, be careful with not overwriting already existing files (if \code{forceRerun} is set to \code{TRUE})} |
28 | 28 |
|
29 |
-\item{gene.types}{List of character vectors. Default list(c("all"), c("protein_coding"), c("protein_coding", "lincRNA")). Vectors of gene types to consider for the diagnostic plots. Multiple distinct combinations of gene types can be specified. For example, if set to \code{list(c("protein_coding", "lincRNA"), c("protein_coding"), c("all"))}, 3 distinct PDFs will be produced, one for each element of the list. The first file would only consider protein-coding and lincRNA genes, while the second plot only considers protein-coding ones. The special keyword "all" denotes all gene types found (usually, there are many gene types present, also more exotic and rare ones).} |
|
29 |
+\item{gene.types}{List of character vectors. Default list(c("protein_coding", "lincRNA")). Vectors of gene types to consider for the diagnostic plots. Multiple distinct combinations of gene types can be specified. For example, if set to \code{list(c("protein_coding", "lincRNA"), c("protein_coding"), c("all"))}, 3 distinct PDFs will be produced, one for each element of the list. The first file would only consider protein-coding and lincRNA genes, while the second plot only considers protein-coding ones. The special keyword "all" denotes all gene types found (usually, there are many gene types present, also more exotic and rare ones).} |
|
30 | 30 |
|
31 | 31 |
\item{useFiltered}{Logical. TRUE or FALSE. Default FALSE. If set to \code{FALSE}, the diagnostic plots will be produced based on all peak-gene connections. This is the default and will usually be best to judge whether the background behaves as expected. If set to TRUE, the diagnostic plots will be produced based on the filtered set of connections. For this, the function \code{link{filterGRNAndConnectGenes}} must have been run before.} |
32 | 32 |
|
... | ... |
@@ -25,7 +25,7 @@ plotPCA_all( |
25 | 25 |
|
26 | 26 |
\item{basenameOutput}{\code{NULL} or character. Default \code{NULL}. Basename of the output files that are produced. If set to \code{NULL}, a default basename is chosen. If a custom basename is specified, all output files will have the chosen basename as file prefix, be careful with not overwriting already existing files (if \code{forceRerun} is set to \code{TRUE})} |
27 | 27 |
|
28 |
-\item{data}{Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data).} |
|
28 |
+\item{data}{Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Default \code{c("rna", "peaks")}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data).} |
|
29 | 29 |
|
30 | 30 |
\item{topn}{Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default).} |
31 | 31 |
|
... | ... |
@@ -2,56 +2,68 @@ |
2 | 2 |
% Please edit documentation in R/plot.R |
3 | 3 |
\name{visualizeGRN} |
4 | 4 |
\alias{visualizeGRN} |
5 |
-\title{Visualize a filtered GRN (in development)} |
|
5 |
+\title{Visualize a filtered GRN.} |
|
6 | 6 |
\usage{ |
7 | 7 |
visualizeGRN( |
8 | 8 |
GRN, |
9 |
- file, |
|
9 |
+ outputFolder = NULL, |
|
10 |
+ basenameOutput = NULL, |
|
11 |
+ plotAsPDF = TRUE, |
|
12 |
+ pdf_width = 12, |
|
13 |
+ pdf_height = 12, |
|
10 | 14 |
title = NULL, |
11 | 15 |
maxRowsToPlot = 500, |
12 |
- useDefaultMetadata = TRUE, |
|
13 | 16 |
graph = "TF-gene", |
14 | 17 |
colorby = "type", |
15 | 18 |
layered = FALSE, |
16 |
- vertice_color_TFs = NULL, |
|
17 |
- vertice_color_peaks = NULL, |
|
18 |
- vertice_color_genes = NULL, |
|
19 |
- plotAsPDF = TRUE, |
|
20 |
- pdf_width = 12, |
|
21 |
- pdf_height = 12, |
|
19 |
+ vertice_color_TFs = list(h = 10, c = 85, l = c(25, 95)), |
|
20 |
+ vertice_color_peaks = list(h = 135, c = 45, l = c(35, 95)), |
|
21 |
+ vertice_color_genes = list(h = 260, c = 80, l = c(30, 90)), |
|
22 |
+ vertexLabel_cex = 0.4, |
|
23 |
+ vertexLabel_dist = 0, |
|
22 | 24 |
forceRerun = FALSE |
23 | 25 |
) |
24 | 26 |
} |
25 | 27 |
\arguments{ |
26 | 28 |
\item{GRN}{Object of class \code{\linkS4class{GRN}}} |
27 | 29 |
|
28 |
-\item{file}{TODO} |
|
30 |
+\item{outputFolder}{Character or \code{NULL}. Default \code{NULL}. If set to \code{NULL}, the default output folder as specified when initiating the object in \code{link{initializeGRN}} will be used. Otherwise, all output from this function will be put into the specified folder. We recommend specifying an absolute path.} |
|
29 | 31 |
|
30 |
-\item{title}{TODO} |
|
32 |
+\item{basenameOutput}{\code{NULL} or character. Default \code{NULL}. Basename of the output files that are produced. If set to \code{NULL}, a default basename is chosen. If a custom basename is specified, all output files will have the chosen basename as file prefix, be careful with not overwriting already existing files (if \code{forceRerun} is set to \code{TRUE})} |
|
31 | 33 |
|
32 |
-\item{maxRowsToPlot}{TODO} |
|
34 |
+\item{plotAsPDF}{\code{TRUE} or \code{FALSE}. Default \code{TRUE}.Should the plots be printed to a PDF file? If set to \code{TRUE}, a PDF file is generated, the name of which depends on the value of \code{basenameOutput}. If set to \code{FALSE}, all plots are printed to the currently active device. Note that most functions print more than one plot, which means you may only see the last plot depending on your active graphics device.} |
|
33 | 35 |
|
34 |
-\item{useDefaultMetadata}{TODO} |
|
36 |
+\item{pdf_width}{Number. Default 12. Width of the PDF, in cm.} |
|
35 | 37 |
|
36 |
-\item{vertice_color_TFs}{TODO} |
|
38 |
+\item{pdf_height}{Number. Default 12. Height of the PDF, in cm.} |
|
37 | 39 |
|
38 |
-\item{vertice_color_peaks}{TODO} |
|
40 |
+\item{title}{NULL or Character. Default NULL. Title to be assigned to the plot.} |
|
39 | 41 |
|
40 |
-\item{vertice_color_genes}{TODO} |
|
42 |
+\item{maxRowsToPlot}{Numeric. Default 500. Refers to the maximum number of connections to be plotted.} |
|
41 | 43 |
|
42 |
-\item{plotAsPDF}{\code{TRUE} or \code{FALSE}. Default \code{TRUE}.Should the plots be printed to a PDF file? If set to \code{TRUE}, a PDF file is generated, the name of which depends on the value of \code{basenameOutput}. If set to \code{FALSE}, all plots are printed to the currently active device. Note that most functions print more than one plot, which means you may only see the last plot depending on your active graphics device.} |
|
44 |
+\item{graph}{Character. Default \code{TF-gene}. One of: \code{TF-gene}, \code{TF-peak-gene}. Whether to plot a graph with links from TFs to peaks to gene, or the graph with the inferred TF to gene connections.} |
|
43 | 45 |
|
44 |
-\item{pdf_width}{Number. Default 12. Width of the PDF, in cm.} |
|
46 |
+\item{colorby}{Character. Default \code{type}. One of \code{type}, code \code{community}. Color the vertices by either type (TF/peak/gene) or community. See \code{\link{calculateCommunitiesStats}}} |
|
45 | 47 |
|
46 |
-\item{pdf_height}{Number. Default 12. Height of the PDF, in cm.} |
|
48 |
+\item{layered}{Boolean. Default \code{FALSE}. Display the network in a layered format where each layer corresponds to a node type (TF/peak/gene).} |
|
49 |
+ |
|
50 |
+\item{vertice_color_TFs}{Named list. Default \code{list(h = 10, c = 85, l = c(25, 95))}. The list must specify the color in hcl format (hue, chroma, luminence). See the \code{\link[colorspace]} package for more details and examples} |
|
51 |
+ |
|
52 |
+\item{vertice_color_peaks}{Named list. Default \code{list(h = 135, c = 45, l = c(35, 95))}.} |
|
53 |
+ |
|
54 |
+\item{vertice_color_genes}{Named list. Default \code{list(h = 260, c = 80, l = c(30, 90))}.} |
|
55 |
+ |
|
56 |
+\item{vertexLabel_cex}{Numeric. Default \code{0.4}. Font size (multiplication factor, device-dependent)} |
|
57 |
+ |
|
58 |
+\item{vertexLabel_dist}{Numeric. Default \code{0} vertex. Distance between the label and the vertex.} |
|
47 | 59 |
|
48 | 60 |
\item{forceRerun}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Force execution, even if the GRN object already contains the result. Overwrites the old results.} |
49 | 61 |
|
50 | 62 |
\item{permuted}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the permuted data be taken (\code{TRUE}) or the non-permuted, original one (\code{FALSE})?} |
51 | 63 |
} |
52 | 64 |
\value{ |
53 |
-TODO |
|
65 |
+the GRN object |
|
54 | 66 |
} |
55 | 67 |
\description{ |
56 |
-Visualize a filtered GRN (in development) |
|
68 |
+Visualize a filtered GRN. |
|
57 | 69 |
} |
... | ... |
@@ -4,7 +4,7 @@ author: "Christian Arnold, Judith Zaugg, Rim Moussa" |
4 | 4 |
date: "`r BiocStyle::doc_date()`" |
5 | 5 |
package: "`r BiocStyle::pkg_ver('GRaNIE')`" |
6 | 6 |
abstract: > |
7 |
- This workflow vignette shows how to use the `GRaNIE` package in a real-world example. For this purpose, you will use real data that is soon available in the `GRaNIEData` package for a more complex analysis to illustrate most of its features. Importantly, you will also learn in detail how to work with a `GRaNIE` object and what its main functions and properties are. The vignette will be continuously updated whenever new functionality becomes available or when we receive user feedback. |
|
7 |
+ This workflow vignette shows how to use the `GRaNIE` package in a real-world example. For this purpose, you will use real data for a more complex analysis to illustrate most of its features. Importantly, you will also learn in detail how to work with a `GRaNIE` object and what its main functions and properties are. The vignette will be continuously updated whenever new functionality becomes available or when we receive user feedback. |
|
8 | 8 |
vignette: > |
9 | 9 |
%\VignetteIndexEntry{Workflow example} |
10 | 10 |
%\VignetteEngine{knitr::rmarkdown} |
... | ... |
@@ -53,10 +53,25 @@ pre[class] { |
53 | 53 |
} |
54 | 54 |
``` |
55 | 55 |
|
56 |
+# Example data |
|
57 |
+ |
|
58 |
+The data is taken from [here](https://zenodo.org/record/1188300#.X370PXUzaSN) [1], has been processed to meet the requirements of the `GRaNIE` package and consists of the following files: |
|
59 |
+ |
|
60 |
+* ATAC-seq peaks, raw counts (originally around 75,000, genome-wide, here filtered to around 60,500) |
|
61 |
+* RNA-Seq data, raw counts (originally for around 35,000 genes, here filtered to around 19,000) |
|
62 |
+* sample metadata with additional sample-specific information |
|
63 |
+ |
|
64 |
+In general, the dataset is from human macrophages (both naive and IFNg primed) of healthy individuals and various stimulations / infections (naive vs primed and infected with SL1344 vs not), with 4 groups in total: control/infected(SL1344) and naive/primed(IFNg) |
|
65 |
+ |
|
66 |
+ |
|
67 |
+Furthermore, the example dataset is accompanied by the following files: |
|
68 |
+ |
|
69 |
+* genome-wide transcription factor binding site predictions for 75 selected TFs, along with a translation table to link TF names to their corresponding Ensembl IDs. For each TF, a gzipped BED file has been created with predicted TF binding sites. The files have been generated with `PWMScan` and the `HOCOMOCO` database, see [2] for details. |
|
70 |
+ |
|
56 | 71 |
# Example Workflow |
57 | 72 |
<a name="section1"></a> |
58 | 73 |
|
59 |
-In the following example, you will use data from the `GRaNIEData` package (soon available) to construct a *eGRN* from ATAC-seq, RNA-seq data as well transcription factor data. |
|
74 |
+In the following example, you will use the example data to construct a *eGRN* from ATAC-seq, RNA-seq data as well transcription factor data. |
|
60 | 75 |
|
61 | 76 |
|
62 | 77 |
```{r <knitr, echo=FALSE, message=FALSE, results="hide", class.output="scroll-200"} |
... | ... |
@@ -75,14 +90,13 @@ For reasons of brevity, we omit the output of this code chunk. |
75 | 90 |
|
76 | 91 |
```{r loadLibaries2, echo=TRUE, message=FALSE, results="hide", warning=FALSE, class.output="scroll-200"} |
77 | 92 |
library(tidyverse) |
78 |
-#library(GRaNIEData) |
|
79 | 93 |
library(GRaNIE) |
80 | 94 |
``` |
81 | 95 |
|
82 | 96 |
|
83 | 97 |
## General notes |
84 | 98 |
|
85 |
-Each of the `GRaNIE` functions we mention here in this Vignette comes with sensible default parameters that we found to work well for most of the datasets we tested it with so far. However, **always check the validity and usefulness of the parameters before starting an analysis** to avoid unreasonable results. |
|
99 |
+Each of the `GRaNIE` functions we mention here in this Vignette comes with sensible default parameters that we found to work well for most of the datasets we tested it with so far. For the purpose of this Vignette, however, and the resulting running times, we here try to provide a good compromise between biological necessity and computational efficiacy. However, **always check the validity and usefulness of the parameters before starting an analysis** to avoid unreasonable results. |
|
86 | 100 |
|
87 | 101 |
## Reading the data required for the `GRaNIE` package |
88 | 102 |
|
... | ... |
@@ -101,8 +115,8 @@ So, let's import the enhancer and RNA-seq data as a data frame as well as some s |
101 | 115 |
```{r importData, echo=TRUE, include=TRUE, class.output="scroll-200"} |
102 | 116 |
|
103 | 117 |
# We load the example data directly from the web: |
104 |
-file_peaks = "https://www.embl.de/download/zaugg/GRaNIE/peaks.tsv.gz" |
|
105 |
-file_RNA = "https://www.embl.de/download/zaugg/GRaNIE/rna.tsv.gz" |
|
118 |
+file_peaks = "https://www.embl.de/download/zaugg/GRaNIE/countsATAC.filtered.tsv.gz" |
|
119 |
+file_RNA = "https://www.embl.de/download/zaugg/GRaNIE/countsRNA.filtered.tsv.gz" |
|
106 | 120 |
file_sampleMetadata = "https://www.embl.de/download/zaugg/GRaNIE/sampleMetadata.tsv.gz" |
107 | 121 |
|
108 | 122 |
countsRNA.df = read_tsv(file_RNA, col_types = cols()) |
... | ... |
@@ -119,7 +133,6 @@ sampleMetadata.df |
119 | 133 |
idColumn_peaks = "peakID" |
120 | 134 |
idColumn_RNA = "ENSEMBL" |
121 | 135 |
|
122 |
- |
|
123 | 136 |
``` |
124 | 137 |
|
125 | 138 |
While we recommend raw counts for both enhancers and RNA-Seq as input and offer several normalization choices in the pipeline, it is also possible to provide pre-normalized data. Note that the normalization method may have a large influence on the resulting *eGRN* network, so make sure the choice of normalization is reasonable. For more details, see the next sections. |
... | ... |
@@ -145,7 +158,7 @@ objectMetadata.l = list(name = paste0("Macrophages_infected_prime |
145 | 158 |
file_sampleMetadata = file_sampleMetadata, |
146 | 159 |
genomeAssembly = genomeAssembly) |
147 | 160 |
|
148 |
-dir_output = "output" |
|
161 |
+dir_output = "." |
|
149 | 162 |
|
150 | 163 |
GRN = initializeGRN(objectMetadata = objectMetadata.l, |
151 | 164 |
outputFolder = dir_output, |
... | ... |
@@ -190,34 +203,34 @@ Note that while this step is recommended to do, it is fully optional from a work |
190 | 203 |
|
191 | 204 |
|
192 | 205 |
```{r runPCA, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200", collapse=FALSE, results="hold"} |
193 |
-GRN = plotPCA_all(GRN, data = c("rna"), topn = 500, type = "raw", plotAsPDF = FALSE, pages = c(3,4), forceRerun = TRUE) |
|
206 |
+GRN = plotPCA_all(GRN, data = c("rna"), topn = 500, type = "raw", plotAsPDF = FALSE, pages = c(2,3,14), forceRerun = TRUE) |
|
194 | 207 |
``` |
195 | 208 |
|
196 |
-Depending on the parameters, multiple output plots may be produced, with up to two files for each of the specified `data` modalities (that is, RNA-Seq counts, as specified with `rna` here, as well as the peak counts, `peaks`, not done here for reasons of brevity). For each of them, PCA plots can be produced for both `raw` and `normalized` data (here: only `raw`). With `raw`, we here denote the original counts as given as input with the `addData()` function, irrespective of whether this was already pre-normalized or not. For more details, see the [Package Details Vignette](packageDetails.html). The `topn` argument specifies the number of top variable features to do PCA for - here 500. |
|
209 |
+Depending on the parameters, multiple output files (and plots) may be produced, with up to two files for each of the specified `data` modalities (that is, RNA-Seq counts, as specified with `rna` here, as well as the peak counts, `peaks`, not done here for reasons of brevity). For each of them, PCA plots can be produced for both `raw` and `normalized` data (here: only `raw`). With `raw`, we here denote the original counts as given as input with the `addData()` function, irrespective of whether this was already pre-normalized or not. The `topn` argument specifies the number of top variable features to do PCA for - here 500. |
|
197 | 210 |
|
198 |
-We do not show any of the PCA plots here for reasons of brevity, but make sure to examine these plots closely! For all details, which plota are produced and further comments on how to understand and interpret them, see the [Package Details Vignette](packageDetails.html). |
|
211 |
+There are more plots that are generated, make sure to examine these plots closely! For all details, which plots are produced and further comments on how to understand and interpret them, see the [Package Website](https://grp-zaugg.embl-community.io/GRaNIE/). |
|
199 | 212 |
|
200 | 213 |
|
201 | 214 |
## Add TFs and TFBS and overlap with peak |
202 | 215 |
|
203 |
-Now it is time to add data for TFs and predicted TF binding sites (TFBS)! Our `GRaNIE` package requires pre-computed TFBS that need to be in a specific format (see the [Package Details Vignette](packageDetails.html) for details). In brief, a 6-column bed file must be present for each TF, with a specific file name that starts with the name of the TF, an arbitrary and optional suffix (here: "_TFBS") and a particular file ending (supported are `bed` or `bed.gz`; here, we specify the latter). All these files must be located in a particular folder that the `addTFBS()` functions then searches in order to identify those files that match the specified patterns. We provide example TFBS for the 3 genome assemblies we support, see the comment below and the [Package Details Vignette](packageDetails.html) for details. After setting this up, we are ready to overlap the TFBS and the peaks by calling the function `overlapPeaksAndTFBS()`. |
|
216 |
+Now it is time to add data for TFs and predicted TF binding sites (TFBS)! Our `GRaNIE` package requires pre-computed TFBS that need to be in a specific format. In brief, a 6-column bed file must be present for each TF, with a specific file name that starts with the name of the TF, an arbitrary and optional suffix (here: `_TFBS`) and a particular file ending (supported are `bed` or `bed.gz`; here, we specify the latter). All these files must be located in a particular folder that the `addTFBS()` functions then searches in order to identify those files that match the specified patterns. We provide example TFBS for the 3 genome assemblies we support. After setting this up, we are ready to overlap the TFBS and the peaks by calling the function `overlapPeaksAndTFBS()`. |
|
204 | 217 |
|
205 | 218 |
For more parameter details, see the R help (`?addTFBS` and `?overlapPeaksAndTFBS`). |
206 | 219 |
|
207 | 220 |
```{r addTFBS, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200", results='hold'} |
208 | 221 |
|
209 |
-folder_TFBS_first50 = "TFBS_selected" |
|
222 |
+folder_TFBS_first50 = "https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip" |
|
210 | 223 |
# Download the zip of all TFBS files. Takes too long here, not executed therefore |
211 |
-# filename <- "TFBS_selected.zip" |
|
212 |
-# download.file("https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip", file.path(filename), quiet = FALSE) |
|
213 |
-# unzip(file.path(filename), overwrite = TRUE, exdir = file.path("TFBS_selected")) |
|
224 |
+# download.file(folder_TFBS_first50, file.path("TFBS_selected.zip"), quiet = FALSE) |
|
225 |
+# unzip(file.path("TFBS_selected.zip"), overwrite = TRUE) |
|
226 |
+# motifFolder = tools::file_path_as_absolute("TFBS_selected") |
|
214 | 227 |
|
215 |
-GRN = addTFBS(GRN, motifFolder = folder_TFBS_first50, TFs = "all", filesTFBSPattern = "_TFBS", fileEnding = ".bed.gz", forceRerun = TRUE) |
|
228 |
+GRN = addTFBS(GRN, motifFolder = motifFolder, TFs = "all", filesTFBSPattern = "_TFBS", fileEnding = ".bed.gz", forceRerun = TRUE) |
|
216 | 229 |
|
217 | 230 |
GRN = overlapPeaksAndTFBS(GRN, nCores = 1, forceRerun = TRUE) |
218 | 231 |
``` |
219 | 232 |
|
220 |
-We see from the output (omitted here for brevity) that 75 TFs have been found in the specified input folder, and the number of TFBS that overlap our peaks for each of them. We now successfully added our TFs and TFBS to the `GRaNIE` object. |
|
233 |
+We see from the output (omitted here for brevity) that 6 TFs have been found in the specified input folder, and the number of TFBS that overlap our peaks for each of them. We successfully added our TFs and TFBS to the `GRaNIE` object" |
|
221 | 234 |
|
222 | 235 |
|
223 | 236 |
|
... | ... |
@@ -247,7 +260,7 @@ We can see from the output that no peaks have been filtered due to their size an |
247 | 260 |
|
248 | 261 |
## Add TF-enhancer connections |
249 | 262 |
|
250 |
-We now have all necessary data in the object to start constructing our network. As explained in the [Package Details Vignette](packageDetails.html), we currently support two types of links for our `GRaNIE` approach: |
|
263 |
+We now have all necessary data in the object to start constructing our network. As explained elsewhere, we currently support two types of links for our `GRaNIE` approach: |
|
251 | 264 |
|
252 | 265 |
1. TF - enhancer |
253 | 266 |
2. enhancer - gene |
... | ... |
@@ -260,7 +273,7 @@ In addition to creating TF-enhancer links based on TF expression, we can also co |
260 | 273 |
|
261 | 274 |
Lastly, we offer a so called GC-correction that uses a GC-matching background to compare it with the foreground instead of using the full background as comparison. We are still investigating the plausibility and effects of this and therefore mark this feature as experimental as of now. |
262 | 275 |
|
263 |
-This function may run a while, and each time-consuming step has a built-in progress bar so the remaining time can be estimated. Note that the TF-enhancer links are constructed for both the original, non-permuted data (in the corresponding output plots that are produced, this is labeled as `original`) and permuted data (`permuted`). |
|
276 |
+Note that the TF-enhancer links are constructed for both the original, non-permuted data (in the corresponding output plots that are produced, this is labeled as `original`) and permuted data (`permuted`). |
|
264 | 277 |
For more parameter options and parameter details, see the R help (`?addConnections_TF_peak`). |
265 | 278 |
|
266 | 279 |
```{r addTFPeakConnections, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200", results='hold'} |
... | ... |
@@ -268,40 +281,41 @@ GRN = addConnections_TF_peak(GRN, plotDiagnosticPlots = FALSE, |
268 | 281 |
connectionTypes = c("expression"), |
269 | 282 |
corMethod = "pearson", forceRerun = TRUE) |
270 | 283 |
``` |
271 |
-From the output, we see that a total of 65 TFs have RNA-seq data available and consequently will be included and correlated with the enhancer accessibility. |
|
284 |
+From the output, we see that all of the 6 TFs also have RNA-seq data available and consequently will be included and correlated with the enhancer accessibility. |
|
272 | 285 |
|
273 | 286 |
## Quality control 2: Diagnostic plots for TF-enhancer connections |
274 | 287 |
|
275 |
-After adding the TF-enhancer links to our `GRaNIE` object, let's look at some diagnostic plots. Depending on the user parameters, the plots are either directly plotted to the currently active graphics device or to PDF files as specified in the object or via the function parameters. If plotted to a PDF, within the specified or default output folder (when initializing the `GRaNIE` object) should contain two new files that are named `TF_peak.fdrCurves_original.pdf` and `TF_peak.fdrCurves_permuted.pdf`, for example. For reasons of brevity and organization, we fully describe their interpretation and meaning in detail in the [Package Details Vignette](packageDetails.html#output_TF_peak) and not here, however. |
|
288 |
+After adding the TF-enhancer links to our `GRaNIE` object, let's look at some diagnostic plots. Depending on the user parameters, the plots are either directly plotted to the currently active graphics device or to PDF files as specified in the object or via the function parameters. If plotted to a PDF, within the specified or default output folder (when initializing the `GRaNIE` object) should contain two new files that are named `TF_peak.fdrCurves_original.pdf` and `TF_peak.fdrCurves_permuted.pdf`, for example. |
|
289 |
+ |
|
290 |
+This function may run a while, and each time-consuming step has a built-in progress bar for the plot-related parts so the remaining time can be estimated. |
|
276 | 291 |
|
277 |
-In summary, TF-enhancer diagnostic plots are available for each TF, and each page summarizes the QC for each TF in two plots: |
|
292 |
+For reasons of brevity and organization, we fully describe their interpretation and meaning in detail elsewhere, however. In summary, TF-enhancer diagnostic plots are available for each TF, and each page summarizes the QC for each TF in two plots: |
|
278 | 293 |
|
279 | 294 |
|
280 | 295 |
```{r, echo=FALSE} |
281 |
-GRN = plotDiagnosticPlots_TFPeaks(GRN, dataType = c("real", "permuted"), plotAsPDF = FALSE, pages = c(6)) |
|
296 |
+GRN = plotDiagnosticPlots_TFPeaks(GRN, dataType = c("real", "permuted"), plotAsPDF = FALSE, pages = 4) |
|
282 | 297 |
``` |
283 | 298 |
|
284 |
-TODO: correct |
|
285 |
-For this example, we can see a quite typical case: the TF-enhancer FDR for the various *ETV5.0.C* - enhancer pairs are above 0.2 for the wide majority correlation bins in both directions (that is, positive and negative), while a few bins towards more extreme correlation bins have a low FDR. The former indicate little statistical signal and confidence, while the latter are those connections we are looking for! Typically, however, only few connections are in the more extreme bins, as indicated by the color. Here, correlation bin refers to the correlation of a particular *ETV5.0.C* - enhancer pair that has been discretized accordingly (i.e., a correlation of 0.07 would go into (0.05-0.10] correlation bin). Usually, depending on the mode of action of a TF, none or either one of the two directions may show a low FDR in particular areas of the plots, often close to more extreme correlation bins, but rarely both. For a better judgement and interpretation, we can also check how this looks like for permuted data: |
|
299 |
+For this example, we can see a quite typical case: the TF-enhancer FDR for the various *EGR1.0.A* - enhancer pairs are above 0.2 for the wide majority correlation bins in both directions (that is, positive and negative), while a few bins towards more extreme correlation bins have a low FDR. The former indicate little statistical signal and confidence, while the latter are those connections we are looking for! Typically, however, only few connections are in the more extreme bins, as indicated by the color. Here, correlation bin refers to the correlation of a particular *EGR1.0.A* - enhancer pair that has been discretized accordingly (i.e., a correlation of 0.07 would go into (0.05-0.10] correlation bin). Usually, depending on the mode of action of a TF, none or either one of the two directions may show a low FDR in particular areas of the plots, often close to more extreme correlation bins, but rarely both. For a better judgement and interpretation, we can also check how this looks like for permuted data: |
|
286 | 300 |
|
287 | 301 |
|
288 | 302 |
We see that for permuted data, there is much less significant bins, as expected. |
289 | 303 |
|
290 |
-Here, in summary, a few positively correlated *ETV5.0.C* - enhancer pairs (with a correlation of at least 0.65 or so) are statistically significant and may be retained for the final *eGRN* network (if corresponding genes connecting to the respective enhancers are found). |
|
304 |
+Here, in summary, a few positively correlated *EGR1.0.A* - enhancer pairs (with a correlation of above 0.5 or so) are statistically significant and may be retained for the final *eGRN* network (if corresponding genes connecting to the respective enhancers are found). |
|
291 | 305 |
|
292 | 306 |
|
293 | 307 |
## Run the AR classification and QC (optional) {#output_AR} |
294 | 308 |
|
295 | 309 |
Transcription factors (TFs) regulate many cellular processes and can therefore serve as readouts of the signaling and regulatory state. Yet for many TFs, the mode of action—repressing or activating transcription of target genes—is unclear. In analogy to our *diffTF* approach that we recently published to calculate differential TF activity,the classification of TFs into putative transcriptional activators or repressors can also be done from within the `GRaNIE` framework in an identical fashion. This can be achieved with the function `AR_classification_wrapper()`. |
296 | 310 |
|
297 |
-**Note that this step is fully optional and can be skipped. The output of the function is not used for subsequent steps.**. To keep the memory footprint of the `GRaNIE` object low, we recommend to keep the function parameter default `deleteIntermediateData = TRUE`. Here, we specify to put all plots within `output/plots`. However, we do not actually run the code here. |
|
311 |
+**Note that this step is fully optional and can be skipped. The output of the function is not used for subsequent steps.**. To keep the memory footprint of the `GRaNIE` object low, we recommend to keep the function parameter default `deleteIntermediateData = TRUE`. Here, we specify to put all plots within `output/plots`. However, for reasons of brevity, we do not actually run the code here. |
|
298 | 312 |
|
299 | 313 |
|
300 | 314 |
```{r runARClassification, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200"} |
301 | 315 |
GRN = AR_classification_wrapper(GRN, significanceThreshold_Wilcoxon = 0.05, |
302 | 316 |
outputFolder = "output/plots", |
303 | 317 |
plot_minNoTFBS_heatmap = 100, plotDiagnosticPlots = TRUE, |
304 |
- deleteIntermediateData = TRUE, forceRerun = TRUE) |
|
318 |
+ forceRerun = TRUE) |
|
305 | 319 |
``` |
306 | 320 |
The classification runs for both real and permuted data, as before. The contents of these plots are identical to and uses in fact practically the same code as our `diffTF` software, and we therefore do not include them here. We refer to the following links for more details as well as the [Package Details Vignette](packageDetails.html#output_AR): |
307 | 321 |
|
... | ... |
@@ -340,7 +354,7 @@ GRN = addConnections_peak_gene(GRN, |
340 | 354 |
nCores = 1, plotDiagnosticPlots = FALSE, plotGeneTypes = list(c("all")), forceRerun = TRUE) |
341 | 355 |
``` |
342 | 356 |
|
343 |
-We see from the output that almost 38,000 enhancer-gene links have been identified that match our parameters. From these 38,000, however, only around 16,351 actually had corresponding RNA-seq data available, while RNA-seq data was missing or has been filtered for the other. This is a rather typical case, as not all known and annotated genes are included in the RNA-seq data in the first place. Similar to before, the correlations have also been performed for the permuted data. |
|
357 |
+We see from the output that almost 38,000 enhancer-gene links have been identified that match our parameters. However, only around 16,351 actually had corresponding RNA-seq data available, while RNA-seq data was missing or has been filtered for the other. This is a rather typical case, as not all known and annotated genes are included in the RNA-seq data in the first place. Similar to before, the correlations have also been performed for the permuted data. |
|
344 | 358 |
|
345 | 359 |
|
346 | 360 |
## Quality control 3: Diagnostic plots for enhancer-gene connections |
... | ... |
@@ -406,9 +420,15 @@ GRN_connections.all |
406 | 420 |
The table contains many columns, and the prefix of each column name indicates the part of the *eGRN* network that the column refers to (e.g., TFs, TF-enhancers, enhancers, enhancer-genes or genes, or TF-gene if the function `add_TF_gene_correlation()` has been run before). Data are stored in a format that minimizes the memory footprint (e.g., each character column is stored as a factor). This table can now be used for any downstream analysis, as it is just a normal data frame. |
407 | 421 |
|
408 | 422 |
|
409 |
-## Visualize the filtered *eGRN* connections |
|
423 |
+## Visualize the filtered *eGRN* |
|
424 |
+ |
|
425 |
+The `GRaNIE` package also offers a function to visualize a filtered *eGRN* network! It is very easy to invoke, but provides many options to customize the output and the way the graph is drawn. We recommend to explore the options in the R help (`?getGRNConnections`). |
|
426 |
+ |
|
427 |
+ |
|
428 |
+```{r visualizeGRN, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200"} |
|
429 |
+GRN = visualizeGRN(GRN) |
|
430 |
+``` |
|
410 | 431 |
|
411 |
-The `GRaNIE` package will soon also offer some rudimentary functions to visualize a filtered *eGRN* network. Stay tuned! Meanwhile, you can use the `igraph` package to construct a graph out of the filtered TF-enhancer-gene connection table (see above). |
|
412 | 432 |
|
413 | 433 |
## Generate a connection summary for filtered connections |
414 | 434 |
|
... | ... |
@@ -416,10 +436,10 @@ It is often useful to get a grasp of the general connectivity of a network and t |
416 | 436 |
|
417 | 437 |
```{r connectionSummary, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200"} |
418 | 438 |
GRN = generateStatsSummary(GRN, |
419 |
- TF_peak.fdr = c(0.01, 0.05, 0.1, 0.2), |
|
439 |
+ TF_peak.fdr = c(0.05, 0.1, 0.2), |
|
420 | 440 |
TF_peak.connectionTypes = "all", |
421 | 441 |
peak_gene.p_raw = NULL, |
422 |
- peak_gene.fdr = c(0.01, 0.05, 0.1, 0.2), |
|
442 |
+ peak_gene.fdr = c(0.1, 0.2), |
|
423 | 443 |
peak_gene.r_range = c(0,1), |
424 | 444 |
allowMissingGenes = c(FALSE, TRUE), |
425 | 445 |
allowMissingTFs = c(FALSE), |