Browse code

Major changes related to Vignette and package size reduction, part 2

Christian Arnold authored on 26/03/2022 00:49:08
Showing7 changed files

... ...
@@ -35,6 +35,7 @@ export(plotGeneralGraphStats)
35 35
 export(plotPCA_all)
36 36
 export(plotTFEnrichment)
37 37
 export(plot_stats_connectionSummary)
38
+export(visualizeGRN)
38 39
 import(BiocFileCache)
39 40
 import(BiocManager)
40 41
 import(GenomicRanges)
... ...
@@ -2580,11 +2580,11 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2580 2580
     
2581 2581
     
2582 2582
     # Finally, do the actual overlaps
2583
-    overlapsAll = GenomicRanges::findOverlaps(query, subject, 
2583
+    overlapsAll = suppressWarnings(GenomicRanges::findOverlaps(query, subject, 
2584 2584
                                               minoverlap = 1,
2585 2585
                                               type = "any",
2586 2586
                                               select = "all",
2587
-                                              ignore.strand = TRUE)
2587
+                                              ignore.strand = TRUE))
2588 2588
     
2589 2589
     
2590 2590
     query_row_ids  = S4Vectors::queryHits(overlapsAll)
... ...
@@ -4211,7 +4211,8 @@ getGRNConnections <- function(GRN, type = "all.filtered",  permuted = FALSE, inc
4211 4211
     "plot_communityStats"            = "GRN.community_stats",
4212 4212
     "plot_communityEnrichment"       = "GRN.community_enrichment",
4213 4213
     "plot_generalNetworkStats"       = "GRN.overall_stats",
4214
-    "plot_TFEnrichment"              = "GRN.TF_enrichment"
4214
+    "plot_TFEnrichment"              = "GRN.TF_enrichment",
4215
+    "plot_network"                   = "GRN.network_visualisation"
4215 4216
     
4216 4217
   )
4217 4218
   
... ...
@@ -6,7 +6,7 @@
6 6
 #'
7 7
 #' @template GRN 
8 8
 #' @template outputFolder
9
-#' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data).
9
+#' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Default \code{c("rna", "peaks")}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data).
10 10
 #' @param topn Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default).
11 11
 #' @param type Character. One of or a combination of \code{"raw"}, \code{"normalized"}, \code{"all"}. Default \code{c("raw", "normalized")}. Should the PCA plots be done based on the raw or normalized data, respectively?
12 12
 #' @template forceRerun
... ...
@@ -525,6 +525,9 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
525 525
     if (!permutationCur %in% dataType2) {
526 526
       next
527 527
     }
528
+      
529
+    futile.logger::flog.info(paste0("\n Plotting for permutation ", permutationCur))
530
+      
528 531
     
529 532
     suffixFile = .getPermutationSuffixStr(permutationCur)
530 533
     
... ...
@@ -677,8 +680,15 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
677 680
   
678 681
   pageCounter = 1  
679 682
   
680
-  plots.l = list()
681
-  index = 1
683
+  if (!is.null(file)) {
684
+      .checkOutputFile(file)
685
+      grDevices::pdf(file, width = width, height = height)
686
+      futile.logger::flog.info(paste0("Plotting to file ", file))
687
+  } else {
688
+      futile.logger::flog.info(paste0("Plotting directly"))   
689
+  }
690
+  
691
+ 
682 692
   
683 693
   # Dont take all TF, some might be missing.
684 694
   connections_TF_peak = GRN@connections$TF_peaks[[as.character(perm)]]$connectionStats
... ...
@@ -692,10 +702,10 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
692 702
 
693 703
   pb <- progress::progress_bar$new(total = nTF)
694 704
   
695
-  levels_pos<-unique(as.character(cut(GRN@config$parameters$internal$stepsFDR, breaks = GRN@config$parameters$internal$stepsFDR, right = FALSE, include.lowest = TRUE )))
696
-  levels_neg<-unique(as.character(cut(GRN@config$parameters$internal$stepsFDR, breaks = rev(GRN@config$parameters$internal$stepsFDR), right = TRUE,  include.lowest = TRUE )))
705
+  steps = GRN@config$parameters$internal$stepsFDR
697 706
   
698
-
707
+  levels_pos<-unique(as.character(cut(steps, breaks = steps, right = FALSE, include.lowest = TRUE )))
708
+  levels_neg<-unique(as.character(cut(steps, breaks = rev(steps), right = TRUE, include.lowest = TRUE )))
699 709
   
700 710
   for (i in seq_len(nTF)) {
701 711
     pb$tick()
... ...
@@ -704,6 +714,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
704 714
     connections_TF_peakCur = dplyr::filter(connections_TF_peak, TF.name == TFCur)
705 715
     
706 716
     # Produce two FDR plots, coming from both directions
717
+    plotsCur.l = list()
707 718
     for (typeCur in c("pos","neg")) {
708 719
       
709 720
       if (typeCur == "pos") {
... ...
@@ -718,6 +729,8 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
718 729
         dplyr::mutate(TF_peak.r_bin = factor(TF_peak.r_bin, levels = levelsCur))  %>%
719 730
         reshape2::melt(id = c("TF_peak.r_bin", "TF_peak.connectionType", "n")) 
720 731
       
732
+      plotTitle = paste0(TFCur, ": Direction ", typeCur, ifelse(perm == 0, "", "(permuted)"))
733
+      
721 734
       if (!useGCCorrection) {
722 735
         
723 736
         g = suppressWarnings(connections_TF_peak.filt %>% 
... ...
@@ -725,7 +738,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
725 738
                                ggplot(aes(TF_peak.r_bin, value, color = .data$n, shape = variable)) +
726 739
                                geom_point(na.rm = TRUE) +
727 740
                                facet_wrap(~TF_peak.connectionType, ncol = 1) + 
728
-                               labs(x="Correlation bin", y="FDR TF-peak", title= paste0(TFCur, ": Direction ", typeCur)) +
741
+                               labs(x="Correlation bin", y="FDR TF-peak", title= plotTitle) +
729 742
                                theme_bw() +
730 743
                                theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) +
731 744
                                scale_x_discrete(drop=FALSE) + 
... ...
@@ -735,6 +748,8 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
735 748
                                                   values = c("TF_peak.fdr" = 1)) +
736 749
                                scale_color_viridis_c("No. of connections") 
737 750
         )
751
+        plotsCur.l[[typeCur]] = g
752
+        
738 753
         
739 754
         if (plotDetails) {
740 755
           
... ...
@@ -742,7 +757,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
742 757
             ggplot(aes(TF_peak.r_bin, log10(value+1), color = variable)) + 
743 758
             geom_point(size = 1) +
744 759
             facet_wrap(~TF_peak.connectionType, ncol = 1) + 
745
-            labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= paste0(TFCur, ": Direction ", typeCur)) +
760
+            labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= plotTitle) +
746 761
             theme_bw() +
747 762
             theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) +
748 763
             scale_x_discrete(drop=FALSE) + 
... ...
@@ -753,6 +768,8 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
753 768
                                labels = c("tpvalue" = "True positives\n(foreground, scaling ref.)\n", 
754 769
                                           "fpvalue" = "False positives\n(background, unscaled)\n", 
755 770
                                           "fpvalue_norm" = "False positives\n(background, scaled to ref.)"))
771
+          
772
+          plotsCur.l[[paste0(typeCur, "_details")]] = g
756 773
         }
757 774
         
758 775
         
... ...
@@ -764,7 +781,7 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
764 781
                                ggplot(aes(TF_peak.r_bin, value, color = .data$n, shape = variable, alpha = variable)) + 
765 782
                                geom_point(na.rm = TRUE) +
766 783
                                facet_wrap(~TF_peak.connectionType, ncol = 1) + 
767
-                               labs(x="Correlation bin", y="FDR TF-peak", title= paste0(TFCur, ": Direction ", typeCur)) +
784
+                               labs(x="Correlation bin", y="FDR TF-peak", title= plotTitle) +
768 785
                                theme_bw() +
769 786
                                theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) +
770 787
                                scale_x_discrete(drop=FALSE) + 
... ...
@@ -774,13 +791,15 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
774 791
                                                   values = c("TF_peak.fdr" = 3,"TF_peak.fdr_orig" = 1)) +
775 792
                                scale_color_viridis_c("No. of connections") + 
776 793
                                scale_alpha_discrete(range = c("TF_peak.fdr" = 0.5,"TF_peak.fdr_orig" = 1))  +
777
-                               guides(alpha = FALSE))
794
+                               guides(alpha = FALSE)
795
+        )
796
+        plotsCur.l[[typeCur]] = g
778 797
         
779 798
         if (plotDetails) {
780 799
           g2 = ggplot(connections_TF_peak.filt %>% dplyr::filter(grepl("value", variable)), aes(TF_peak.r_bin, log10(value+1), color = variable)) + 
781 800
             geom_point(size = 1) +
782 801
             facet_wrap(~TF_peak.connectionType, ncol = 1) + 
783
-            labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= paste0(TFCur, ": Direction ", typeCur)) +
802
+            labs(x="Correlation bin", y="log10(Observed frequencies +1)", title= plotTitle) +
784 803
             theme_bw() +
785 804
             theme(axis.text.x=element_text(angle=60, hjust=1, size= 8)) +
786 805
             scale_x_discrete(drop=FALSE) + 
... ...
@@ -794,41 +813,36 @@ plotDiagnosticPlots_TFPeaks <- function(GRN,
794 813
                                           "fpvalue_norm_orig" = "False positives without GC\n(background, scaled to ref.)\n",
795 814
                                           "fpvalue" = "False positives with GC\n(background, unscaled)\n", 
796 815
                                           "fpvalue_norm" = "False positives with GC\n(background, scaled to ref.)"))
816
+          
817
+          plotsCur.l[[paste0(typeCur, "_details")]] = g
797 818
         }
798 819
         
799 820
       }
800 821
       
801
-      # We now created either just one plot g or g + g2 for "one" direction
802
-      if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) {
803
-          plots.l[[index]] = g
804
-          index = index + 1
805
-      }
806
-
807
-      
808
-      if (plotDetails) {
809
-          
810
-          if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) {
811
-              plots.l[[index]] = g2
812
-              index = index + 1
813
-          }
814
-
815
-          # Only here we already have a new page, as we produced 2 plots in this iteration alone
816
-          pageCounter = pageCounter + 1 
817
-       }
818
-
819
-      
820 822
     } # end for both directions
821 823
     
822
-    if (!plotDetails) {
824
+    # Lets create the plots
825
+    if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) {
826
+        plots_all = plotsCur.l$pos / plotsCur.l$neg
827
+        plot(plots_all)
828
+    }
829
+    pageCounter = pageCounter + 1 
830
+    
831
+    if (plotDetails) {
832
+        if (is.null(pages) | (!is.null(pages) && pageCounter %in% pages)) {
833
+            plots_all = plotsCur.l$pos_details / plotsCur.l$neg_details
834
+            plot(plots_all)
835
+        }
823 836
         pageCounter = pageCounter + 1 
824 837
     }
825
-
838
+    
839
+  
826 840
     
827 841
   } # end allTF
828 842
   
829 843
   .printWarningPageNumber(pages, pageCounter)
830
-  
831
-  .printMultipleGraphsPerPage(plots.l, nCol = 1, nRow = 2, pdfFile = file, height = height, width = width)
844
+ 
845
+  if (!is.null(file)) dev.off()
832 846
   
833 847
   .printExecutionTime(start)
834 848
 }
... ...
@@ -1712,7 +1726,7 @@ plot_stats_connectionSummary <- function(GRN, type = "heatmap",
1712 1726
                       column_names_gp = grid::gpar(fontsize = 10),
1713 1727
                       row_title = "TF-peak FDR",
1714 1728
                       cell_fun = function(j, i, x, y, width, height, fill) {
1715
-                          grid::grid.text(sprintf("%.0f", plotData.l[[permIndex]][i, j]), x, y, gp = grid::gpar(fontsize = 30))
1729
+                          grid::grid.text(sprintf("%.0f", plotData.l[[permIndex]][i, j]), x, y, gp = grid::gpar(fontsize = 20))
1716 1730
                       }
1717 1731
                   ) %>% plot()
1718 1732
                   
... ...
@@ -3240,55 +3254,69 @@ plotTFEnrichment <- function(GRN, rankType = "degree", n = NULL, TF.names = NULL
3240 3254
 
3241 3255
 ############### GRN visualization ###############
3242 3256
 
3243
-#' Visualize a filtered GRN (in development)
3257
+#' Visualize a filtered GRN. 
3244 3258
 #'
3245 3259
 #' @template GRN 
3246 3260
 #' @template permuted
3247
-#' @param file TODO
3248
-#' @param title TODO
3249
-#' @param maxRowsToPlot TODO
3250
-#' @param useDefaultMetadata TODO
3251
-#' @param vertice_color_TFs TODO
3252
-#' @param vertice_color_peaks TODO
3253
-#' @param vertice_color_genes TODO
3261
+#' @template outputFolder
3262
+#' @template basenameOutput
3254 3263
 #' @template plotAsPDF
3255 3264
 #' @template pdf_width
3256 3265
 #' @template pdf_height
3266
+#' @param title NULL or Character. Default NULL. Title to be assigned to the plot.
3267
+#' @param maxRowsToPlot Numeric. Default 500. Refers to the maximum number of connections to be plotted.
3268
+#' @param graph Character. Default \code{TF-gene}. One of: \code{TF-gene}, \code{TF-peak-gene}. Whether to plot a graph with links from TFs to peaks to gene, or the graph with the inferred TF to gene connections.
3269
+#' @param colorby Character. Default \code{type}. One of \code{type}, code \code{community}. Color the vertices by either type (TF/peak/gene) or community. See \code{\link{calculateCommunitiesStats}}
3270
+#' @param layered Boolean. Default \code{FALSE}. Display the network in a layered format where each layer corresponds to a node type (TF/peak/gene).
3271
+#' @param vertice_color_TFs Named list. Default \code{list(h = 10, c = 85, l = c(25, 95))}. The list must specify the color in hcl format (hue, chroma, luminence). See the \code{\link[colorspace]} package for more details and examples
3272
+#' @param vertice_color_peaks Named list. Default \code{list(h = 135, c = 45, l = c(35, 95))}.
3273
+#' @param vertice_color_genes Named list. Default \code{list(h = 260, c = 80, l = c(30, 90))}.
3274
+#' @param vertexLabel_cex Numeric. Default \code{0.4}. Font size (multiplication factor, device-dependent)
3275
+#' @param vertexLabel_dist Numeric. Default \code{0} vertex. Distance between the label and the vertex.
3257 3276
 #' @template forceRerun
3258
-
3259
-#' @return TODO
3260
-# #' @export
3261
-visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaultMetadata = TRUE,
3262
-                         graph = "TF-gene" , colorby = "type", layered = FALSE,
3263
-                         vertice_color_TFs = NULL, vertice_color_peaks = NULL, vertice_color_genes = NULL,
3264
-                         plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12,
3277
+#' @example 
3278
+#' GRN = visualizeGRN(GRN, maxRowsToPlot = 700, graph = "TF-peak-gene", colorby = "type")
3279
+#' @return the GRN object
3280
+#' @export
3281
+visualizeGRN <- function(GRN, outputFolder = NULL,  basenameOutput = NULL, plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12,
3282
+                         title = NULL, maxRowsToPlot = 500, graph = "TF-gene" , colorby = "type", layered = FALSE,
3283
+                         vertice_color_TFs = list(h = 10, c = 85, l = c(25, 95)), vertice_color_peaks = list(h = 135, c = 45, l = c(35, 95)), vertice_color_genes = list(h = 260, c = 80, l = c(30, 90)),
3284
+                         vertexLabel_cex = 0.4, vertexLabel_dist = 0,
3285
+                         
3265 3286
                          forceRerun = FALSE
3266 3287
 ) {
3267 3288
     
3289
+    
3268 3290
     start = Sys.time()
3269 3291
     GRN = .addFunctionLogToObject(GRN)
3270 3292
     
3271
-    #checkmate::assertFlag(plotAsPDF)
3293
+    checkmate::assertFlag(plotAsPDF)
3294
+    checkmate::assertNumeric(pdf_width, lower = 5, upper = 99)
3295
+    checkmate::assertNumeric(pdf_height, lower = 5, upper = 99)
3272 3296
     checkmate::assertNumeric(maxRowsToPlot)
3273
-    checkmate::assertFlag(useDefaultMetadata)
3274 3297
     checkmate::assertSubset(graph, c("TF-gene", "TF-peak-gene"))
3275 3298
     checkmate::assertSubset(colorby, c("type", "community"))
3276 3299
     checkmate::assertFlag(layered)
3277
-    checkmate::assertNumeric(pdf_width, lower = 5, upper = 99)
3278
-    checkmate::assertNumeric(pdf_height, lower = 5, upper = 99)
3279
-    
3280
-    
3281
-    futile.logger::flog.info(paste0("Plotting GRN network", dplyr::if_else(is.null(file), "", paste0(" to file ", file))))
3282
-    
3283
-    if (useDefaultMetadata) {
3284
-        metadata_visualization.l = getBasic_metadata_visualization(GRN)
3285
-        vertice_color_TFs   = list(metadata_visualization.l[["RNA_expression_TF"]],    "HOCOID",     "baseMean_log")
3286
-        vertice_color_genes = list(metadata_visualization.l[["RNA_expression_genes"]], "ENSEMBL_ID", "baseMean_log")
3287
-        vertice_color_peaks = list(metadata_visualization.l[["Peaks_accessibility"]],   "peakID",     "mean_log")
3288
-    }
3289
-    
3290
-    
3291
-    
3300
+    checkmate::assertList(vertice_color_TFs)
3301
+    checkmate::assertNames(names(vertice_color_TFs), must.include = c("h", "c", "l"), subset.of = c("h", "c", "l"))
3302
+    checkmate::assertList(vertice_color_peaks)
3303
+    checkmate::assertNames(names(vertice_color_peaks), must.include = c("h", "c", "l"), subset.of = c("h", "c", "l"))
3304
+    checkmate::assertList(vertice_color_genes)
3305
+    checkmate::assertNames(names(vertice_color_genes), must.include = c("h", "c", "l"), subset.of = c("h", "c", "l"))
3306
+    checkmate::assertNumeric(vertexLabel_cex)
3307
+    checkmate::assertNumeric(vertexLabel_dist)
3308
+    checkmate::assertFlag(forceRerun)
3309
+    
3310
+    
3311
+    outputFolder = .checkOutputFolder(GRN, outputFolder)
3312
+    
3313
+    # if (useDefaultMetadata) {
3314
+    #   metadata_visualization.l = getBasic_metadata_visualization(GRN)
3315
+    #   vertice_color_TFs   = list(metadata_visualization.l[["RNA_expression_TF"]],    "HOCOID",     "baseMean_log")
3316
+    #   vertice_color_genes = list(metadata_visualization.l[["RNA_expression_genes"]], "ENSEMBL_ID", "baseMean_log")
3317
+    #   vertice_color_peaks = list(metadata_visualization.l[["Peaks_accessibility"]],   "peakID",     "mean_log")
3318
+    # }
3319
+    # 
3292 3320
     #grn.merged = getGRNConnections(GRN, permuted = permuted, type = "all.filtered")
3293 3321
     # check that it's in sync with the @ graph
3294 3322
     if (graph == "TF-gene"){
... ...
@@ -3335,10 +3363,10 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3335 3363
     
3336 3364
     
3337 3365
     if (plotAsPDF) {
3338
-        grDevices::pdf(file = file, width = pdf_width, height = pdf_height)
3339
-        futile.logger::flog.info(paste0("Plotting to file ", file))
3366
+        futile.logger::flog.info(paste0("Plotting GRN network to ", outputFolder, dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_network"), basenameOutput),".pdf"))
3367
+        grDevices::pdf(file = paste0(outputFolder,"/", ifelse(is.null(basenameOutput), .getOutputFileName("plot_network"), basenameOutput),".pdf"),width = pdf_width, height = pdf_height )
3340 3368
     } else {
3341
-        futile.logger::flog.info(paste0("Plotting directly"))
3369
+        futile.logger::flog.info(paste0("Plotting GRN network"))
3342 3370
     }
3343 3371
     
3344 3372
     if (nRows > maxRowsToPlot) { 
... ...
@@ -3347,7 +3375,7 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3347 3375
         message = paste0(title, "\n\nPlotting omitted.\n\nThe number of rows in the GRN (", nRows, ") exceeds the maximum of ", maxRowsToPlot, ".\nSee the maxRowsToPlot parameter to increase the limit")
3348 3376
         text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red")
3349 3377
         
3350
-        if (!is.null(file)) {
3378
+        if (plotAsPDF) {
3351 3379
             grDevices::dev.off()
3352 3380
         }
3353 3381
         
... ...
@@ -3394,30 +3422,31 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3394 3422
         nBins_discard = 25
3395 3423
         nBins_real = nBins_orig - nBins_discard
3396 3424
         
3397
-        if (!is.null(vertice_color_TFs)) {
3398
-            color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = 10, c = 85, l = c(25, 95)))[(nBins_discard + 1):nBins_orig] # red
3399
-            colors_categories.l[["TF"]]  = c(color_gradient[1], color_gradient[nBins_real]) 
3400
-            colors_categories.l[["TF"]]  = color_gradient 
3401
-            symbols_categories.l[["TF"]] = c(15,NA,15)
3402
-        }
3425
+        #if (!is.null(vertice_color_TFs)) {
3426
+        color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = vertice_color_TFs[["h"]], c = vertice_color_TFs[["c"]], l = vertice_color_TFs[["l"]]))[(nBins_discard + 1):nBins_orig] # red
3427
+        colors_categories.l[["TF"]]  = c(color_gradient[1], color_gradient[nBins_real]) 
3428
+        colors_categories.l[["TF"]]  = color_gradient 
3429
+        symbols_categories.l[["TF"]] = c(15,NA,15)
3430
+        vertice_color_TFs   = append(list(metadata_visualization.l[["RNA_expression_TF"]],    "HOCOID",     "baseMean_log"), vertice_color_TFs)
3431
+        #}
3403 3432
         
3404 3433
         if(graph == "TF-peak-gene"){
3405
-            
3406
-            if (!is.null(vertice_color_peaks)) {
3407
-                color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = 135, c = 45, l = c(35, 95)))[(nBins_discard + 1):nBins_orig]  # green
3408
-                colors_categories.l[["PEAK"]] = c(color_gradient[1], color_gradient[nBins_real])
3409
-                colors_categories.l[["PEAK"]] = color_gradient
3410
-                symbols_categories.l[["PEAK"]] = c(21,NA,21)
3411
-            }
3412
-            
3434
+            #if (!is.null(vertice_color_peaks)) {
3435
+            color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = vertice_color_peaks[["h"]], c = vertice_color_peaks[["c"]], l = vertice_color_peaks[["l"]]))[(nBins_discard + 1):nBins_orig]  # green
3436
+            colors_categories.l[["PEAK"]] = c(color_gradient[1], color_gradient[nBins_real])
3437
+            colors_categories.l[["PEAK"]] = color_gradient
3438
+            symbols_categories.l[["PEAK"]] = c(21,NA,21)
3439
+            vertice_color_peaks = append(list(metadata_visualization.l[["Peaks_accessibility"]],   "peakID",     "mean_log"), vertice_color_peaks)
3440
+            # }
3413 3441
         }
3414 3442
         
3415
-        if (!is.null(vertice_color_genes)) {
3416
-            color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = 260, c = 80, l = c(30, 90)))[(nBins_discard + 1):nBins_orig] # blue
3417
-            colors_categories.l[["GENE"]] = c(color_gradient[1], color_gradient[nBins_real]) 
3418
-            colors_categories.l[["GENE"]] = color_gradient
3419
-            symbols_categories.l[["GENE"]] = c(21,NA,21)
3420
-        }
3443
+        #if (!is.null(vertice_color_genes)) {
3444
+        color_gradient = rev(colorspace::sequential_hcl(nBins_orig, h = vertice_color_genes[["h"]], c = vertice_color_genes[["c"]], l = vertice_color_genes[["l"]]))[(nBins_discard + 1):nBins_orig] # blue
3445
+        colors_categories.l[["GENE"]] = c(color_gradient[1], color_gradient[nBins_real]) 
3446
+        colors_categories.l[["GENE"]] = color_gradient
3447
+        symbols_categories.l[["GENE"]] = c(21,NA,21)
3448
+        vertice_color_genes = append(list(metadata_visualization.l[["RNA_expression_genes"]], "ENSEMBL_ID", "baseMean_log"), vertice_color_genes)
3449
+        #}
3421 3450
         
3422 3451
         ## VERTICES ##
3423 3452
         
... ...
@@ -3637,9 +3666,9 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3637 3666
             ncommunities = length(unique(GRN@graph$TF_gene$clusterGraph$membership))
3638 3667
             
3639 3668
             if (ncommunities >=8){
3640
-                community_colors = data.frame(community = names(sort(table(GRN@graph$TF_gene$clusterGraph$membership), decreasing = T)[1:ncommunities]),
3669
+                community_colors = data.frame(community = names(sort(table(GRN@graph$TF_gene$clusterGraph$membership), decreasing = T)[1:nCommunitiesMax]),
3641 3670
                                               color = rainbow(7))
3642
-                fillercolors = data.frame(community = 8:ncommunities, color = "847E89") # only color the x largest communities
3671
+                fillercolors = data.frame(community = nCommunitiesMax:ncommunities, color = "847E89") # only color the x largest communities
3643 3672
                 community_colors = rbind(community_colors, fillercolors)
3644 3673
                 
3645 3674
             }else{
... ...
@@ -3742,7 +3771,7 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3742 3771
         
3743 3772
         
3744 3773
         # Calling plot.new() might be necessary here
3745
-        if(is.null(file)){
3774
+        if(!plotAsPDF){
3746 3775
             plot.new()
3747 3776
         }
3748 3777
         par(mar=c(7,0,0,0) + 0.2)
... ...
@@ -3759,12 +3788,12 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3759 3788
             edge.width = igraph::E(net)$weight,
3760 3789
             vertex.label=igraph::V(net)$label,
3761 3790
             vertex.label.font=1, 
3762
-            vertex.label.cex = 0.3, 
3791
+            vertex.label.cex = vertexLabel_cex, 
3763 3792
             vertex.label.family="Helvetica", 
3764 3793
             vertex.label.color = "black",
3765 3794
             vertex.label.degree= -pi/2,
3766 3795
             vertex.label=igraph::V(net)$label,
3767
-            vertex.label.dist= 0,
3796
+            vertex.label.dist= vertexLabel_dist,
3768 3797
             vertex.shape = shape_vertex[igraph::V(net)$type],
3769 3798
             main = title
3770 3799
         )
... ...
@@ -3834,22 +3863,20 @@ visualizeGRN <- function(GRN, file, title = NULL, maxRowsToPlot = 500, useDefaul
3834 3863
         
3835 3864
     }
3836 3865
     
3837
-    if (!is.null(file)) {
3866
+    if (plotAsPDF) {
3838 3867
         grDevices::dev.off()
3839 3868
     }
3840 3869
     
3841 3870
     .printExecutionTime(start)
3842 3871
     
3872
+    GRN
3843 3873
 }
3844 3874
 
3845 3875
 
3846 3876
 
3847
-
3848
-
3849
-
3850 3877
 .verifyArgument_verticeType <- function(vertice_color_list) {
3851
-  
3852
-  checkmate::assertList(vertice_color_list, len = 3)
3853
-  checkmate::assertDataFrame(vertice_color_list[[1]])
3854
-  checkmate::assertSubset(c(vertice_color_list[[2]], vertice_color_list[[3]]), colnames(vertice_color_list[[1]]))
3878
+    
3879
+    checkmate::assertList(vertice_color_list, len = 6)
3880
+    checkmate::assertDataFrame(vertice_color_list[[1]])
3881
+    checkmate::assertSubset(c(vertice_color_list[[2]], vertice_color_list[[3]]), colnames(vertice_color_list[[1]]))
3855 3882
 }
... ...
@@ -8,7 +8,7 @@ plotDiagnosticPlots_peakGene(
8 8
   GRN,
9 9
   outputFolder = NULL,
10 10
   basenameOutput = NULL,
11
-  gene.types = list(c("all"), c("protein_coding"), c("protein_coding", "lincRNA")),
11
+  gene.types = list(c("protein_coding", "lincRNA")),
12 12
   useFiltered = FALSE,
13 13
   plotDetails = FALSE,
14 14
   plotPerTF = FALSE,
... ...
@@ -26,7 +26,7 @@ plotDiagnosticPlots_peakGene(
26 26
 
27 27
 \item{basenameOutput}{\code{NULL} or character. Default \code{NULL}. Basename of the output files that are produced. If set to \code{NULL}, a default basename is chosen. If a custom basename is specified, all output files will have the chosen basename as file prefix, be careful with not overwriting already existing files (if \code{forceRerun} is set to \code{TRUE})}
28 28
 
29
-\item{gene.types}{List of character vectors. Default list(c("all"), c("protein_coding"), c("protein_coding", "lincRNA")). Vectors of gene types to consider for the diagnostic plots. Multiple distinct combinations of gene types can be specified. For example, if set to \code{list(c("protein_coding", "lincRNA"), c("protein_coding"), c("all"))}, 3 distinct PDFs will be produced, one for each element of the list. The first file would only consider protein-coding and lincRNA genes, while the second plot only considers protein-coding ones. The special keyword "all" denotes all gene types found (usually, there are many gene types present, also more exotic and rare ones).}
29
+\item{gene.types}{List of character vectors. Default list(c("protein_coding", "lincRNA")). Vectors of gene types to consider for the diagnostic plots. Multiple distinct combinations of gene types can be specified. For example, if set to \code{list(c("protein_coding", "lincRNA"), c("protein_coding"), c("all"))}, 3 distinct PDFs will be produced, one for each element of the list. The first file would only consider protein-coding and lincRNA genes, while the second plot only considers protein-coding ones. The special keyword "all" denotes all gene types found (usually, there are many gene types present, also more exotic and rare ones).}
30 30
 
31 31
 \item{useFiltered}{Logical. TRUE or FALSE. Default FALSE. If set to \code{FALSE}, the diagnostic plots will be produced based on all peak-gene connections. This is the default and will usually be best to judge whether the background behaves as expected. If set to TRUE, the diagnostic plots will be produced based on the filtered set of connections. For this, the function \code{link{filterGRNAndConnectGenes}} must have been run before.}
32 32
 
... ...
@@ -25,7 +25,7 @@ plotPCA_all(
25 25
 
26 26
 \item{basenameOutput}{\code{NULL} or character. Default \code{NULL}. Basename of the output files that are produced. If set to \code{NULL}, a default basename is chosen. If a custom basename is specified, all output files will have the chosen basename as file prefix, be careful with not overwriting already existing files (if \code{forceRerun} is set to \code{TRUE})}
27 27
 
28
-\item{data}{Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data).}
28
+\item{data}{Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Default \code{c("rna", "peaks")}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data).}
29 29
 
30 30
 \item{topn}{Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default).}
31 31
 
... ...
@@ -2,56 +2,68 @@
2 2
 % Please edit documentation in R/plot.R
3 3
 \name{visualizeGRN}
4 4
 \alias{visualizeGRN}
5
-\title{Visualize a filtered GRN (in development)}
5
+\title{Visualize a filtered GRN.}
6 6
 \usage{
7 7
 visualizeGRN(
8 8
   GRN,
9
-  file,
9
+  outputFolder = NULL,
10
+  basenameOutput = NULL,
11
+  plotAsPDF = TRUE,
12
+  pdf_width = 12,
13
+  pdf_height = 12,
10 14
   title = NULL,
11 15
   maxRowsToPlot = 500,
12
-  useDefaultMetadata = TRUE,
13 16
   graph = "TF-gene",
14 17
   colorby = "type",
15 18
   layered = FALSE,
16
-  vertice_color_TFs = NULL,
17
-  vertice_color_peaks = NULL,
18
-  vertice_color_genes = NULL,
19
-  plotAsPDF = TRUE,
20
-  pdf_width = 12,
21
-  pdf_height = 12,
19
+  vertice_color_TFs = list(h = 10, c = 85, l = c(25, 95)),
20
+  vertice_color_peaks = list(h = 135, c = 45, l = c(35, 95)),
21
+  vertice_color_genes = list(h = 260, c = 80, l = c(30, 90)),
22
+  vertexLabel_cex = 0.4,
23
+  vertexLabel_dist = 0,
22 24
   forceRerun = FALSE
23 25
 )
24 26
 }
25 27
 \arguments{
26 28
 \item{GRN}{Object of class \code{\linkS4class{GRN}}}
27 29
 
28
-\item{file}{TODO}
30
+\item{outputFolder}{Character or \code{NULL}. Default \code{NULL}. If set to \code{NULL}, the default output folder as specified when initiating the object in \code{link{initializeGRN}} will be used. Otherwise, all output from this function will be put into the specified folder. We recommend specifying an absolute path.}
29 31
 
30
-\item{title}{TODO}
32
+\item{basenameOutput}{\code{NULL} or character. Default \code{NULL}. Basename of the output files that are produced. If set to \code{NULL}, a default basename is chosen. If a custom basename is specified, all output files will have the chosen basename as file prefix, be careful with not overwriting already existing files (if \code{forceRerun} is set to \code{TRUE})}
31 33
 
32
-\item{maxRowsToPlot}{TODO}
34
+\item{plotAsPDF}{\code{TRUE} or \code{FALSE}. Default \code{TRUE}.Should the plots be printed to a PDF file? If set to \code{TRUE}, a PDF file is generated, the name of which depends on the value of \code{basenameOutput}. If set to \code{FALSE}, all plots are printed to the currently active device. Note that most functions print more than one plot, which means you may only see the last plot depending on your active graphics device.}
33 35
 
34
-\item{useDefaultMetadata}{TODO}
36
+\item{pdf_width}{Number. Default 12. Width of the PDF, in cm.}
35 37
 
36
-\item{vertice_color_TFs}{TODO}
38
+\item{pdf_height}{Number. Default 12. Height of the PDF, in cm.}
37 39
 
38
-\item{vertice_color_peaks}{TODO}
40
+\item{title}{NULL or Character. Default NULL. Title to be assigned to the plot.}
39 41
 
40
-\item{vertice_color_genes}{TODO}
42
+\item{maxRowsToPlot}{Numeric. Default 500. Refers to the maximum number of connections to be plotted.}
41 43
 
42
-\item{plotAsPDF}{\code{TRUE} or \code{FALSE}. Default \code{TRUE}.Should the plots be printed to a PDF file? If set to \code{TRUE}, a PDF file is generated, the name of which depends on the value of \code{basenameOutput}. If set to \code{FALSE}, all plots are printed to the currently active device. Note that most functions print more than one plot, which means you may only see the last plot depending on your active graphics device.}
44
+\item{graph}{Character. Default \code{TF-gene}. One of: \code{TF-gene}, \code{TF-peak-gene}. Whether to plot a graph with links from TFs to peaks to gene, or the graph with the inferred TF to gene connections.}
43 45
 
44
-\item{pdf_width}{Number. Default 12. Width of the PDF, in cm.}
46
+\item{colorby}{Character. Default \code{type}. One of \code{type}, code \code{community}. Color the vertices by either type (TF/peak/gene) or community. See \code{\link{calculateCommunitiesStats}}}
45 47
 
46
-\item{pdf_height}{Number. Default 12. Height of the PDF, in cm.}
48
+\item{layered}{Boolean. Default \code{FALSE}. Display the network in a layered format where each layer corresponds to a node type (TF/peak/gene).}
49
+
50
+\item{vertice_color_TFs}{Named list. Default \code{list(h = 10, c = 85, l = c(25, 95))}. The list must specify the color in hcl format (hue, chroma, luminence). See the \code{\link[colorspace]} package for more details and examples}
51
+
52
+\item{vertice_color_peaks}{Named list. Default \code{list(h = 135, c = 45, l = c(35, 95))}.}
53
+
54
+\item{vertice_color_genes}{Named list. Default \code{list(h = 260, c = 80, l = c(30, 90))}.}
55
+
56
+\item{vertexLabel_cex}{Numeric. Default \code{0.4}. Font size (multiplication factor, device-dependent)}
57
+
58
+\item{vertexLabel_dist}{Numeric. Default \code{0} vertex. Distance between the label and the vertex.}
47 59
 
48 60
 \item{forceRerun}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Force execution, even if the GRN object already contains the result. Overwrites the old results.}
49 61
 
50 62
 \item{permuted}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the permuted data be taken (\code{TRUE}) or the non-permuted, original one (\code{FALSE})?}
51 63
 }
52 64
 \value{
53
-TODO
65
+the GRN object
54 66
 }
55 67
 \description{
56
-Visualize a filtered GRN (in development)
68
+Visualize a filtered GRN.
57 69
 }
... ...
@@ -4,7 +4,7 @@ author: "Christian Arnold, Judith Zaugg, Rim Moussa"
4 4
 date: "`r BiocStyle::doc_date()`"
5 5
 package: "`r BiocStyle::pkg_ver('GRaNIE')`"
6 6
 abstract: >
7
-  This workflow vignette shows how to use the `GRaNIE` package in a real-world example. For this purpose, you will use real data that is soon available in the `GRaNIEData` package for a more complex analysis to illustrate most of its features. Importantly, you will also learn in detail how to work with a `GRaNIE` object and what its main functions and properties are. The vignette will be continuously updated whenever new functionality becomes available or when we receive user feedback.
7
+  This workflow vignette shows how to use the `GRaNIE` package in a real-world example. For this purpose, you will use real data for a more complex analysis to illustrate most of its features. Importantly, you will also learn in detail how to work with a `GRaNIE` object and what its main functions and properties are. The vignette will be continuously updated whenever new functionality becomes available or when we receive user feedback.
8 8
 vignette: >
9 9
   %\VignetteIndexEntry{Workflow example}
10 10
   %\VignetteEngine{knitr::rmarkdown}
... ...
@@ -53,10 +53,25 @@ pre[class] {
53 53
 }
54 54
 ```
55 55
 
56
+# Example data
57
+
58
+The data is taken from [here](https://zenodo.org/record/1188300#.X370PXUzaSN) [1], has been processed to meet the requirements of the `GRaNIE` package and consists of the following files:
59
+
60
+* ATAC-seq peaks, raw counts (originally around 75,000, genome-wide, here filtered to around 60,500)
61
+* RNA-Seq data, raw counts (originally for around 35,000 genes, here filtered to around 19,000)
62
+* sample metadata with additional sample-specific information
63
+
64
+In general, the dataset is from human macrophages (both naive and IFNg primed) of healthy individuals and various stimulations / infections (naive vs primed and infected with SL1344 vs not), with 4 groups in total: control/infected(SL1344) and naive/primed(IFNg)
65
+
66
+
67
+Furthermore, the example dataset is accompanied by the following files:
68
+
69
+* genome-wide transcription factor binding site predictions for 75 selected TFs, along with a translation table to link TF names to their corresponding Ensembl IDs. For each TF, a gzipped BED file has been created with predicted TF binding sites. The files have been generated with `PWMScan` and the `HOCOMOCO` database, see [2] for details.
70
+
56 71
 # Example Workflow
57 72
 <a name="section1"></a>
58 73
 
59
-In the following example, you will use data from the `GRaNIEData` package (soon available) to construct a *eGRN* from ATAC-seq, RNA-seq data as well transcription factor data.
74
+In the following example, you will use the example data to construct a *eGRN* from ATAC-seq, RNA-seq data as well transcription factor data.
60 75
 
61 76
 
62 77
 ```{r <knitr, echo=FALSE, message=FALSE, results="hide", class.output="scroll-200"}
... ...
@@ -75,14 +90,13 @@ For reasons of brevity, we omit the output of this code chunk.
75 90
 
76 91
 ```{r loadLibaries2, echo=TRUE, message=FALSE, results="hide", warning=FALSE, class.output="scroll-200"}
77 92
 library(tidyverse)
78
-#library(GRaNIEData)
79 93
 library(GRaNIE)
80 94
 ```
81 95
 
82 96
 
83 97
 ## General notes
84 98
 
85
-Each of the `GRaNIE` functions we mention here in this Vignette comes with sensible default parameters that we found to work well for most of the datasets we tested it with so far.  However, **always check the validity and usefulness of the parameters before starting an analysis** to avoid unreasonable results.
99
+Each of the `GRaNIE` functions we mention here in this Vignette comes with sensible default parameters that we found to work well for most of the datasets we tested it with so far. For the purpose of this Vignette, however, and the resulting running times, we here try to provide a good compromise between biological necessity and computational efficiacy.  However, **always check the validity and usefulness of the parameters before starting an analysis** to avoid unreasonable results.
86 100
 
87 101
 ## Reading the data required for the `GRaNIE` package
88 102
 
... ...
@@ -101,8 +115,8 @@ So, let's import the enhancer and RNA-seq data as a data frame as well as some s
101 115
 ```{r importData, echo=TRUE, include=TRUE, class.output="scroll-200"}
102 116
 
103 117
 # We load the example data directly from the web:
104
-file_peaks = "https://www.embl.de/download/zaugg/GRaNIE/peaks.tsv.gz"
105
-file_RNA   = "https://www.embl.de/download/zaugg/GRaNIE/rna.tsv.gz"
118
+file_peaks = "https://www.embl.de/download/zaugg/GRaNIE/countsATAC.filtered.tsv.gz"
119
+file_RNA   = "https://www.embl.de/download/zaugg/GRaNIE/countsRNA.filtered.tsv.gz"
106 120
 file_sampleMetadata = "https://www.embl.de/download/zaugg/GRaNIE/sampleMetadata.tsv.gz"
107 121
 
108 122
 countsRNA.df      = read_tsv(file_RNA, col_types = cols())
... ...
@@ -119,7 +133,6 @@ sampleMetadata.df
119 133
 idColumn_peaks = "peakID"
120 134
 idColumn_RNA = "ENSEMBL"
121 135
 
122
-
123 136
 ```
124 137
 
125 138
 While we recommend raw counts for both enhancers and RNA-Seq as input and offer several normalization choices in the pipeline, it is also possible to provide pre-normalized data. Note that the normalization method may have a large influence on the resulting *eGRN* network, so make sure the choice of normalization is reasonable. For more details, see the next sections.
... ...
@@ -145,7 +158,7 @@ objectMetadata.l = list(name                = paste0("Macrophages_infected_prime
145 158
                         file_sampleMetadata = file_sampleMetadata,
146 159
                         genomeAssembly      = genomeAssembly)
147 160
 
148
-dir_output = "output"
161
+dir_output = "."
149 162
 
150 163
 GRN = initializeGRN(objectMetadata = objectMetadata.l,
151 164
                     outputFolder = dir_output,
... ...
@@ -190,34 +203,34 @@ Note that while this step is recommended to do, it is fully optional from a work
190 203
 
191 204
 
192 205
 ```{r runPCA, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200", collapse=FALSE, results="hold"}
193
-GRN = plotPCA_all(GRN, data = c("rna"), topn = 500, type = "raw", plotAsPDF = FALSE, pages = c(3,4), forceRerun = TRUE)
206
+GRN = plotPCA_all(GRN, data = c("rna"), topn = 500, type = "raw", plotAsPDF = FALSE, pages = c(2,3,14), forceRerun = TRUE)
194 207
 ```
195 208
 
196
-Depending on the parameters, multiple output plots may be produced, with up to two files for each of the specified `data` modalities (that is, RNA-Seq counts, as specified with `rna` here, as well as the peak counts, `peaks`, not done here for reasons of brevity). For each of them, PCA plots can be produced for both `raw` and `normalized` data (here: only `raw`). With `raw`, we here denote the original counts as given as input with the `addData()` function, irrespective of whether this was already pre-normalized or not. For more details, see the [Package Details Vignette](packageDetails.html). The `topn` argument specifies the number of top variable features to do PCA for - here 500.
209
+Depending on the parameters, multiple output files (and plots) may be produced, with up to two files for each of the specified `data` modalities (that is, RNA-Seq counts, as specified with `rna` here, as well as the peak counts, `peaks`, not done here for reasons of brevity). For each of them, PCA plots can be produced for both `raw` and `normalized` data (here: only `raw`). With `raw`, we here denote the original counts as given as input with the `addData()` function, irrespective of whether this was already pre-normalized or not. The `topn` argument specifies the number of top variable features to do PCA for - here 500.
197 210
 
198
-We do not show any of the PCA plots here for reasons of brevity, but make sure to examine these plots closely! For all details, which plota are produced and further comments on how to understand and interpret them, see the [Package Details Vignette](packageDetails.html).
211
+There are more plots that are generated, make sure to examine these plots closely! For all details, which plots are produced and further comments on how to understand and interpret them, see the [Package Website](https://grp-zaugg.embl-community.io/GRaNIE/).
199 212
 
200 213
 
201 214
 ## Add TFs and TFBS and overlap with peak
202 215
 
203
-Now it is time to add data for TFs and predicted TF binding sites (TFBS)! Our `GRaNIE` package requires pre-computed TFBS that need to be in a specific format (see the [Package Details Vignette](packageDetails.html) for details). In brief, a 6-column bed file must be present for each TF, with a specific file name that starts with the name of the TF, an arbitrary and optional suffix (here: "_TFBS") and a particular file ending (supported are `bed` or `bed.gz`; here, we specify the latter). All these files must be located in a particular folder that the `addTFBS()` functions then searches in order to identify those files that match the specified patterns. We provide example TFBS for the 3 genome assemblies we support, see the comment below and the [Package Details Vignette](packageDetails.html) for details. After setting this up, we are ready to overlap the TFBS and the peaks by calling the function `overlapPeaksAndTFBS()`.
216
+Now it is time to add data for TFs and predicted TF binding sites (TFBS)! Our `GRaNIE` package requires pre-computed TFBS that need to be in a specific format. In brief, a 6-column bed file must be present for each TF, with a specific file name that starts with the name of the TF, an arbitrary and optional suffix (here: `_TFBS`) and a particular file ending (supported are `bed` or `bed.gz`; here, we specify the latter). All these files must be located in a particular folder that the `addTFBS()` functions then searches in order to identify those files that match the specified patterns. We provide example TFBS for the 3 genome assemblies we support. After setting this up, we are ready to overlap the TFBS and the peaks by calling the function `overlapPeaksAndTFBS()`.
204 217
 
205 218
 For more parameter details, see the R help (`?addTFBS` and `?overlapPeaksAndTFBS`). 
206 219
 
207 220
 ```{r addTFBS, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200", results='hold'}
208 221
 
209
-folder_TFBS_first50 = "TFBS_selected"
222
+folder_TFBS_first50 = "https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip"
210 223
 # Download the zip of all TFBS files. Takes too long here, not executed therefore
211
-# filename <- "TFBS_selected.zip"
212
-# download.file("https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip", file.path(filename), quiet = FALSE)
213
-# unzip(file.path(filename), overwrite = TRUE, exdir = file.path("TFBS_selected"))
224
+# download.file(folder_TFBS_first50, file.path("TFBS_selected.zip"), quiet = FALSE)
225
+# unzip(file.path("TFBS_selected.zip"), overwrite = TRUE)
226
+# motifFolder = tools::file_path_as_absolute("TFBS_selected")
214 227
 
215
-GRN = addTFBS(GRN, motifFolder = folder_TFBS_first50, TFs = "all", filesTFBSPattern = "_TFBS", fileEnding = ".bed.gz", forceRerun = TRUE)
228
+GRN = addTFBS(GRN, motifFolder = motifFolder, TFs = "all", filesTFBSPattern = "_TFBS", fileEnding = ".bed.gz", forceRerun = TRUE)
216 229
 
217 230
 GRN = overlapPeaksAndTFBS(GRN, nCores = 1, forceRerun = TRUE)
218 231
 ```
219 232
 
220
-We see from the output (omitted here for brevity) that 75 TFs have been found in the specified input folder, and the number of TFBS that overlap our peaks for each of them. We now successfully added our TFs and TFBS to the `GRaNIE` object.
233
+We see from the output (omitted here for brevity) that 6 TFs have been found in the specified input folder, and the number of TFBS that overlap our peaks for each of them. We successfully added our TFs and TFBS to the `GRaNIE` object"
221 234
 
222 235
 
223 236
 
... ...
@@ -247,7 +260,7 @@ We can see from the output that no peaks have been filtered due to their size an
247 260
 
248 261
 ## Add TF-enhancer connections
249 262
 
250
-We now have all necessary data in the object to start constructing our network. As explained in the [Package Details Vignette](packageDetails.html), we currently support two types of links for our `GRaNIE` approach:
263
+We now have all necessary data in the object to start constructing our network. As explained elsewhere, we currently support two types of links for our `GRaNIE` approach:
251 264
 
252 265
 1. TF - enhancer
253 266
 2. enhancer - gene
... ...
@@ -260,7 +273,7 @@ In addition to creating TF-enhancer links based on TF expression, we can also co
260 273
 
261 274
 Lastly, we offer a so called GC-correction that uses a GC-matching background to compare it with the foreground instead of using the full background as comparison. We are still investigating the plausibility and effects of this and therefore mark this feature as experimental as of now.
262 275
 
263
-This function may run a while, and each time-consuming step has a built-in progress bar so the remaining time can be estimated. Note that the TF-enhancer links are constructed for both the original, non-permuted data (in the corresponding output plots that are produced, this is labeled as `original`) and permuted data (`permuted`). 
276
+Note that the TF-enhancer links are constructed for both the original, non-permuted data (in the corresponding output plots that are produced, this is labeled as `original`) and permuted data (`permuted`). 
264 277
 For more parameter options and parameter details, see the R help (`?addConnections_TF_peak`). 
265 278
 
266 279
 ```{r addTFPeakConnections, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200", results='hold'}
... ...
@@ -268,40 +281,41 @@ GRN = addConnections_TF_peak(GRN, plotDiagnosticPlots = FALSE,
268 281
                                      connectionTypes = c("expression"),
269 282
                                      corMethod = "pearson", forceRerun = TRUE)
270 283
 ```
271
-From the output, we see that a total of 65 TFs have RNA-seq data available and consequently will be included and correlated with the enhancer accessibility.
284
+From the output, we see that all of the 6 TFs also have RNA-seq data available and consequently will be included and correlated with the enhancer accessibility.
272 285
 
273 286
 ## Quality control 2: Diagnostic plots for TF-enhancer connections
274 287
 
275
-After adding the TF-enhancer links to our `GRaNIE` object, let's look at some diagnostic plots. Depending on the user parameters, the plots are either directly plotted to the currently active graphics device or to PDF files as specified in the object or via the function parameters. If plotted to a PDF, within the specified or default output folder (when initializing the `GRaNIE` object) should contain two new files that are named `TF_peak.fdrCurves_original.pdf` and `TF_peak.fdrCurves_permuted.pdf`, for example. For reasons of brevity and organization, we fully describe their interpretation and meaning in detail in the [Package Details Vignette](packageDetails.html#output_TF_peak) and not here, however.
288
+After adding the TF-enhancer links to our `GRaNIE` object, let's look at some diagnostic plots. Depending on the user parameters, the plots are either directly plotted to the currently active graphics device or to PDF files as specified in the object or via the function parameters. If plotted to a PDF, within the specified or default output folder (when initializing the `GRaNIE` object) should contain two new files that are named `TF_peak.fdrCurves_original.pdf` and `TF_peak.fdrCurves_permuted.pdf`, for example. 
289
+
290
+This function may run a while, and each time-consuming step has a built-in progress bar for the plot-related parts so the remaining time can be estimated. 
276 291
 
277
-In summary, TF-enhancer diagnostic plots are available for each TF, and each page summarizes the QC for each TF in two plots:  
292
+For reasons of brevity and organization, we fully describe their interpretation and meaning in detail elsewhere, however. In summary, TF-enhancer diagnostic plots are available for each TF, and each page summarizes the QC for each TF in two plots:  
278 293
 
279 294
 
280 295
 ```{r, echo=FALSE}
281
-GRN = plotDiagnosticPlots_TFPeaks(GRN, dataType = c("real", "permuted"), plotAsPDF = FALSE, pages = c(6))
296
+GRN = plotDiagnosticPlots_TFPeaks(GRN, dataType = c("real", "permuted"), plotAsPDF = FALSE, pages = 4)
282 297
 ```
283 298
 
284
-TODO: correct
285
-For this example, we can see a quite typical case: the TF-enhancer FDR for the various *ETV5.0.C* - enhancer pairs are above 0.2 for the wide majority correlation bins in both directions (that is, positive and negative), while a few bins towards more extreme correlation bins have a low FDR. The former indicate little statistical signal and confidence, while the latter are those connections we are looking for! Typically, however, only  few connections are in the more extreme bins, as indicated by the color. Here, correlation bin refers to the correlation of a particular *ETV5.0.C* - enhancer pair that has been discretized accordingly (i.e., a correlation of 0.07 would go into (0.05-0.10] correlation bin). Usually, depending on the mode of action of a TF, none or either one of the two directions may show a low FDR in particular areas of the plots, often close to more extreme correlation bins, but rarely both. For a better judgement and interpretation, we can also check how this looks like for permuted data:
299
+For this example, we can see a quite typical case: the TF-enhancer FDR for the various *EGR1.0.A* - enhancer pairs are above 0.2 for the wide majority correlation bins in both directions (that is, positive and negative), while a few bins towards more extreme correlation bins have a low FDR. The former indicate little statistical signal and confidence, while the latter are those connections we are looking for! Typically, however, only  few connections are in the more extreme bins, as indicated by the color. Here, correlation bin refers to the correlation of a particular *EGR1.0.A* - enhancer pair that has been discretized accordingly (i.e., a correlation of 0.07 would go into (0.05-0.10] correlation bin). Usually, depending on the mode of action of a TF, none or either one of the two directions may show a low FDR in particular areas of the plots, often close to more extreme correlation bins, but rarely both. For a better judgement and interpretation, we can also check how this looks like for permuted data:
286 300
 
287 301
 
288 302
 We see that for permuted data, there is much less significant bins, as expected.
289 303
 
290
-Here, in summary, a few positively correlated *ETV5.0.C* - enhancer pairs (with a correlation of at least 0.65 or so) are statistically significant and may be retained for the final *eGRN* network (if corresponding genes connecting to the respective enhancers are found).
304
+Here, in summary, a few positively correlated *EGR1.0.A* - enhancer pairs (with a correlation of above 0.5 or so) are statistically significant and may be retained for the final *eGRN* network (if corresponding genes connecting to the respective enhancers are found).
291 305
 
292 306
 
293 307
 ## Run the AR classification and QC (optional) {#output_AR}
294 308
 
295 309
 Transcription factors (TFs) regulate many cellular processes and can therefore serve as readouts of the signaling and regulatory state. Yet for many TFs, the mode of action—repressing or activating transcription of target genes—is unclear. In analogy to our *diffTF* approach that we recently published to calculate differential TF activity,the classification of TFs into putative transcriptional activators or repressors can also be done from within the `GRaNIE` framework in an identical fashion. This can be achieved with the function `AR_classification_wrapper()`.  
296 310
 
297
-**Note that this step is fully optional and can be skipped. The output of the function is not used for subsequent steps.**. To keep the memory footprint of the `GRaNIE` object low, we recommend to keep the function parameter default `deleteIntermediateData = TRUE`. Here, we specify to put all plots within `output/plots`. However, we do not actually run the code here.
311
+**Note that this step is fully optional and can be skipped. The output of the function is not used for subsequent steps.**. To keep the memory footprint of the `GRaNIE` object low, we recommend to keep the function parameter default `deleteIntermediateData = TRUE`. Here, we specify to put all plots within `output/plots`. However, for reasons of brevity, we do not actually run the code here.
298 312
 
299 313
 
300 314
 ```{r runARClassification, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200"}
301 315
 GRN = AR_classification_wrapper(GRN, significanceThreshold_Wilcoxon = 0.05,
302 316
                                 outputFolder = "output/plots", 
303 317
                                 plot_minNoTFBS_heatmap = 100, plotDiagnosticPlots = TRUE,
304
-                                deleteIntermediateData = TRUE, forceRerun = TRUE)
318
+                                forceRerun = TRUE)
305 319
 ```
306 320
 The classification runs for both real and permuted data, as before. The contents of these plots are identical to and uses in fact practically the same code as our `diffTF` software, and we therefore do not include them here. We refer to the following links for more details as well as the [Package Details Vignette](packageDetails.html#output_AR):
307 321
 
... ...
@@ -340,7 +354,7 @@ GRN = addConnections_peak_gene(GRN,
340 354
                                nCores = 1, plotDiagnosticPlots = FALSE, plotGeneTypes = list(c("all")), forceRerun = TRUE)
341 355
 ```
342 356
 
343
-We see from the output that almost 38,000 enhancer-gene links have been identified that match our parameters. From these 38,000, however, only around 16,351 actually had corresponding RNA-seq data available, while RNA-seq data was missing or has been filtered for the other. This is a rather typical case, as not all known and annotated genes are included in the RNA-seq data in the first place. Similar to before, the correlations have also been performed for the permuted data.
357
+We see from the output that almost 38,000 enhancer-gene links have been identified that match our parameters. However, only around 16,351 actually had corresponding RNA-seq data available, while RNA-seq data was missing or has been filtered for the other. This is a rather typical case, as not all known and annotated genes are included in the RNA-seq data in the first place. Similar to before, the correlations have also been performed for the permuted data.
344 358
 
345 359
 
346 360
 ## Quality control 3: Diagnostic plots for enhancer-gene connections
... ...
@@ -406,9 +420,15 @@ GRN_connections.all
406 420
 The table contains many columns, and the prefix of each column name indicates the part of the *eGRN* network that the column refers to (e.g., TFs, TF-enhancers, enhancers, enhancer-genes or genes, or TF-gene if the function `add_TF_gene_correlation()` has been run before). Data are stored in a format that minimizes the memory footprint (e.g., each character column is stored as a factor). This table can now be used for any downstream analysis, as it is just a normal data frame.
407 421
 
408 422
 
409
-## Visualize the filtered *eGRN* connections
423
+## Visualize the filtered *eGRN*
424
+
425
+The `GRaNIE` package also offers a function to visualize a filtered *eGRN* network! It is very easy to invoke, but provides many options to customize the output and the way the graph is drawn. We recommend to explore the options in the R help (`?getGRNConnections`).
426
+
427
+
428
+```{r visualizeGRN, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200"}
429
+GRN = visualizeGRN(GRN)
430
+```
410 431
 
411
-The `GRaNIE` package will soon also offer some rudimentary functions to visualize a filtered *eGRN* network. Stay tuned! Meanwhile, you can use the `igraph` package to construct a graph out of the filtered TF-enhancer-gene connection table (see above).
412 432
 
413 433
 ## Generate a connection summary for filtered connections
414 434
 
... ...
@@ -416,10 +436,10 @@ It is often useful to get a grasp of the general connectivity of a network and t
416 436
 
417 437
 ```{r connectionSummary, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200"}
418 438
 GRN = generateStatsSummary(GRN,
419
-                           TF_peak.fdr = c(0.01, 0.05, 0.1, 0.2), 
439
+                           TF_peak.fdr = c(0.05, 0.1, 0.2), 
420 440
                            TF_peak.connectionTypes = "all",
421 441
                            peak_gene.p_raw = NULL,
422
-                           peak_gene.fdr = c(0.01, 0.05, 0.1, 0.2),
442
+                           peak_gene.fdr = c(0.1, 0.2),
423 443
                            peak_gene.r_range = c(0,1),
424 444
                            allowMissingGenes = c(FALSE, TRUE),
425 445
                            allowMissingTFs = c(FALSE),