
New version

Christian Arnold authored on 12/10/2022 22:47:16
Showing 15 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: GRaNIE
2 2
 Title: GRaNIE: Reconstruction cell type specific gene regulatory networks including enhancers using chromatin accessibility and RNA-seq data
3
-Version: 1.1.17
3
+Version: 1.1.19
4 4
 Encoding: UTF-8
5 5
 Authors@R: c(person("Christian", "Arnold", email =
6 6
         "chrarnold@web.de", role = c("cre","aut")),
... ...
@@ -1,4 +1,15 @@
1
+# GRaNIE 1.1.14 to 1.1.19 (2022-12-13)
1 2
 
3
+## Major changes
4
+- major object changes and optimizations, particularly related to storing the count matrices in an optimized and simpler format. In short, the count matrices are now stored either as regular or sparse matrices, depending on the fraction of zeros they contain (see the sketch below). In addition, only the counts after normalization are saved; the raw counts before normalization are no longer stored. As before, if the user requests no normalization, the "normalized" counts equal the raw counts. `GRaNIE` is now more readily applicable to larger analyses and single-cell data, although we have only just started actively optimizing for the latter, so we cannot yet recommend applying the framework in a single-cell manner. Older GRN objects are automatically converted internally when one of the major functions is executed for the first time.
5
+- various documentation and R help updates
6
+- the function `generateStatsSummary` no longer alters the stored filtered connections in the object. This makes its usage more intuitive, and it can be used anywhere in the workflow.
7
+- removed redundant `biomaRt` calls in the code. This saves time and makes the code less vulnerable to timeout issues caused by remote services.
8
+- due to the changes described above, the function `plotPCA_all` can now plot only the normalized counts and no longer the raw counts (except when no normalization is requested)
9
+- the GO enrichments now also store, for each GO term, the ENSEMBL IDs of the genes found in the foreground. This facilitates further exploration of the enrichment results.
10
+
11
+## Minor changes
12
+- many small changes in the code
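
To illustrate the new storage scheme described in the first major change above, here is a minimal R sketch of the dense-versus-sparse decision. It mirrors the internal helper `.storeAsMatrixOrSparseMatrix` shown further down in this commit; the toy matrix `counts` and the 0.1 threshold are illustrative only.

# Minimal sketch: keep a count matrix dense unless its fraction of zeros exceeds a threshold
library(Matrix)

counts = matrix(rpois(1000 * 20, lambda = 0.5), nrow = 1000)  # toy count matrix with many zeros

threshold    = 0.1
fractionZero = (length(counts) - Matrix::nnzero(counts)) / length(counts)

if (fractionZero > threshold) {
  counts = Matrix::Matrix(counts, sparse = TRUE)  # convert to a sparse matrix
}

Sparse storage only pays off when a substantial fraction of entries is zero, which is typically the case for single-cell count matrices.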
2 13
 
3 14
 # GRaNIE 1.1.12 and 1.1.13 (2022-09-13)
4 15
 
... ...
@@ -9,27 +9,26 @@
9 9
 #' @section Accessors:
10 10
 #' In the following code snippets, \code{GRN} is a \code{\linkS4class{GRN}} object.
11 11
 #' 
12
-#'# Get general annotation of a GRaNIE object
12
+#' # Get general annotation of a GRN object from the GRaNIE package
13 13
 #' 
14
-#' \code{nPeaks(GRN))} and  \code{nGenes(GRN))}: Retrieve the number of peaks and genes, respectively, that have been added to the object (both before and after filtering)
14
+#' \code{nPeaks(GRN)}, \code{nTFs(GRN)} and \code{nGenes(GRN)}: Retrieve the number of peaks, TFs and genes, respectively, that have been added to the object (both before and after filtering)
15 15
 #'
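
A brief usage sketch of these accessors, assuming `GRN` is an already populated GRaNIE object (each call returns a count, or NA when the corresponding data has not been added yet):

# Number of peaks, TFs and genes currently stored in the object
nPeaks(GRN, filter = TRUE)    # peaks that pass the current filters
nPeaks(GRN, filter = FALSE)   # all peaks that were added
nGenes(GRN, filter = TRUE)
nTFs(GRN)                     # no filter argument for TFs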
16 16
 #' @slot data Currently stores 4 different types of data:\cr 
17 17
 #' \itemize{
18 18
 #' \item \code{peaks}:
19 19
 #'  \itemize{
20
-#'  \item \code{counts_norm}:
21
-#'  \item \code{counts_orig}:
22
-#'  \item \code{consensusPeaks}:
20
+#'  \item \code{counts}:
21
+#'  \item \code{counts_metadata}:
23 22
 #'  }
24 23
 #' \item \code{RNA}: 
25 24
 #'  \itemize{
26
-#'  \item \code{counts_norm.l}:
27
-#'  \item \code{counts_orig}:
25
+#'  \item \code{counts}:
26
+#'  \item \code{counts_metadata}:
27
+#'  \item \code{counts_permuted_index}:
28 28
 #'  }
29
-#' \item \code{metadata}: 
30 29
 #' \item \code{TFs}: 
31 30
 #'  \itemize{
32
-#'  \item \code{translationTable}:
31
+#'  \item \code{TF_activity}:
33 32
 #'  \item \code{TF_peak_overlap}:
34 33
 #'  \item \code{classification}:
35 34
 #'  }
... ...
@@ -43,9 +42,8 @@
43 42
 #' 
44 43
 #' @slot stats Stores statistical and summary information for a \code{GRN} network. Currently, connection details are stored here.
45 44
 #' 
46
-#' @slot visualization Stores visualization results, currently always empty. Feature in development.
47
-#' @slot isDev Flag whether this is an object from the development version of the package
48
-
45
+#' @slot graph Stores the eGRN graph-related information and data structures
46
+#' 
49 47
 #' @keywords GRN-class, GRN
50 48
 #' @name GRN-class
51 49
 #' @aliases GRN-class
... ...
@@ -56,9 +54,8 @@ setClass("GRN", representation = representation(
56 54
   connections   = "list",
57 55
   data          = "list",
58 56
   stats         = "list",
59
-  visualization = "list",
60 57
   graph         = "list",
61
-  isDev         = "logical"
58
+  visualization = "list"
62 59
   )
63 60
 )
64 61
 
... ...
@@ -116,7 +113,7 @@ setMethod("show",
116 113
             #nGenes       = nGenes(GRN)
117 114
             
118 115
             cat("Data summary:\n")
119
-            if (!is.null(GRN@data$peaks$consensusPeaks)) {
116
+            if (!is.null(GRN@data$peaks$counts_metadata)) {
120 117
               nPeaks_filt  = nPeaks(GRN, filter = TRUE)
121 118
               nPeaks_all   = nPeaks(GRN, filter = FALSE)
122 119
               cat(" # peaks (filtered, all): ", nPeaks_filt, ", ",  nPeaks_all, "\n", sep = "")
... ...
@@ -124,25 +121,20 @@ setMethod("show",
124 121
               cat (" # peaks: No peak data found.\n")
125 122
             }
126 123
             
127
-            if (!is.null(GRN@data$RNA$counts_orig)) {
124
+            if (!is.null(GRN@data$RNA$counts_metadata)) {
128 125
                
129 126
               nGenes_filt = nGenes(GRN, filter = TRUE)
130 127
               nGenes_all  = nGenes(GRN, filter = FALSE)
131
-              
132
-              if (checkmate::testClass(GRN@data$RNA$counts_orig, "DESeqDataSet")) {
133
-                
134
-              } else {
135
-                
136
-              }
137
-             
128
+
138 129
               cat(" # genes (filtered, all): ", nGenes_filt, ", ",  nGenes_all, "\n", sep = "")
130
+              
139 131
             } else {
140 132
               cat (" # genes: No RNA-seq data found.\n")
141 133
             }
142 134
             
143
-            if (!is.null(GRN@data$RNA$counts_orig) & !is.null(GRN@data$peaks$consensusPeaks)) {
135
+            if (!is.null(GRN@data$RNA$counts_metadata) & !is.null(GRN@data$peaks$counts_metadata)) {
144 136
                 
145
-                cat(" # shared samples: ", nrow(GRN@data$metadata), "\n", sep = "")
137
+                cat(" # shared samples: ", length(GRN@config$sharedSamples), "\n", sep = "")
146 138
             } 
147 139
             
148 140
             if (!is.null(GRN@annotation$TFs)) {
... ...
@@ -223,14 +215,14 @@ setMethod("show",
223 215
                 }
224 216
                 
225 217
                 cat(" Enrichment (TF-gene):\n")
226
-                if (!is.null(GRN@stats$Enrichment$byTF)) {
218
+                if (!is.null(GRN@stats$Enrichment$general)) {
227 219
                     ontologies = names(GRN@stats$Enrichment$general)
228 220
                     cat("  Overall network: ", paste0(ontologies, collapse = " & "), "\n",sep = "")
229 221
                 } else {
230 222
                     cat("  Overall network: no enrichment results found\n")
231 223
                 }
232 224
                 
233
-                if (!is.null(GRN@stats$Enrichment$byTF)) {
225
+                if (!is.null(GRN@stats$Enrichment$byCommunity)) {
234 226
                     ontologies = names(GRN@stats$Enrichment$byCommunity[[1]])
235 227
                     cat("  Communities: ", paste0(ontologies, collapse = " & "), "\n",sep = "")
236 228
                 } else {
... ...
@@ -274,14 +266,14 @@ nPeaks <- function(GRN, filter = TRUE) {
274 266
   checkmate::assertClass(GRN, "GRN")
275 267
   checkmate::assertFlag(filter)
276 268
   
277
-  if (is.null(GRN@data$peaks$consensusPeaks)) {
269
+  if (is.null(GRN@data$peaks$counts_metadata)) {
278 270
     return(NA)
279 271
   }
280 272
   
281 273
   if (filter) {
282
-    nPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered) %>% nrow()
274
+    nPeaks = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered) %>% nrow()
283 275
   } else {
284
-    nPeaks = GRN@data$peaks$consensusPeaks %>% nrow()
276
+    nPeaks = GRN@data$peaks$counts_metadata %>% nrow()
285 277
   }
286 278
   
287 279
   nPeaks
... ...
@@ -310,14 +302,14 @@ nGenes <- function(GRN, filter = TRUE) {
310 302
   checkmate::assertClass(GRN, "GRN")
311 303
   checkmate::assertFlag(filter)
312 304
   
313
-  if (is.null(GRN@data$RNA$counts_norm.l)) {
305
+  if (is.null(GRN@data$RNA$counts_metadata)) {
314 306
     return(NA)
315 307
   }
316 308
   
317 309
   if (filter) {
318
-    nGenes = GRN@data$RNA$counts_norm.l[["0"]] %>% dplyr::filter(!.data$isFiltered) %>% nrow()
310
+    nGenes = GRN@data$RNA$counts_metadata %>% dplyr::filter(!.data$isFiltered) %>% nrow()
319 311
   } else {
320
-    nGenes = GRN@data$RNA$counts_norm.l[["0"]] %>% nrow()
312
+    nGenes = GRN@data$RNA$counts_metadata %>% nrow()
321 313
   }
322 314
   
323 315
   nGenes
... ...
@@ -344,10 +336,10 @@ nTFs <- function(GRN) {
344 336
     
345 337
     checkmate::assertClass(GRN, "GRN")
346 338
     
347
-    if (is.null(GRN@data$TFs$translationTable)) {
339
+    if (is.null(GRN@annotation$TFs)) {
348 340
         return(NA)
349 341
     }
350 342
     
351
-    nrow(GRN@data$TFs$translationTable)
343
+    nrow(GRN@annotation$TFs)
352 344
     
353 345
 }
354 346
\ No newline at end of file
... ...
@@ -166,6 +166,8 @@ initializeGRN <- function(objectMetadata = list(),
166 166
 #' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise
167 167
 #' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE)
168 168
 
169
+# TODO: add an isSingleCell argument or something similar
170
+
169 171
 addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", idColumn_peaks = "peakID", 
170 172
                     counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
171 173
                     allowOverlappingPeaks= FALSE,
... ...
@@ -185,14 +187,12 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
185 187
   checkmate::assertFlag(allowOverlappingPeaks)
186 188
   checkmate::assertFlag(forceRerun)
187 189
   
188
-  if (is.null(GRN@data$peaks$counts_norm) |
189
-      is.null(GRN@data$RNA$counts_norm.l) | 
190
+  if (is.null(GRN@data$peaks$counts) |
191
+      is.null(GRN@data$peaks$counts_metadata) | 
192
+      is.null(GRN@data$RNA$counts) |
193
+      is.null(GRN@data$RNA$counts_metadata) |
190 194
       forceRerun) {
191
-    
192
-    # Store raw peaks counts as DESeq object
193
-    
194
-    #GRN@data$peaks$counts_raw = counts_peaks %>% dplyr::mutate(isFiltered = FALSE)
195
-    
195
+  
196 196
     # Normalize ID column names
197 197
     if (idColumn_peaks != "peakID") {
198 198
       counts_peaks = dplyr::rename(counts_peaks, peakID = !!(idColumn_peaks))
... ...
@@ -269,16 +269,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
269 269
     
270 270
     # Subset data to retain only samples that appear in both RNA and peaks
271 271
     data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df)
272
-    GRN@data$peaks$counts_norm = data.l[["peaks"]] %>% dplyr::mutate(isFiltered = FALSE)
273
-    countsRNA.norm.df = data.l[["RNA"]]  %>% dplyr::mutate(isFiltered = FALSE)
274
-    
275
-    futile.logger::flog.info(paste0( "Final dimensions of data:"))
276
-    futile.logger::flog.info(paste0( " RNA  : ", nrow(countsRNA.norm.df)  , " x ", ncol(countsRNA.norm.df)   - 2, " (rows x columns)"))
277
-    futile.logger::flog.info(paste0( " peaks: ", nrow(countsPeaks.norm.df), " x ", ncol(countsPeaks.norm.df) - 1, " (rows x columns)"))
278
-    # Create permutations for RNA
279
-    futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts"))
280
-    GRN@data$RNA$counts_norm.l = .shuffleColumns(countsRNA.norm.df, .getMaxPermutation(GRN), returnUnshuffled = TRUE, returnAsList = TRUE)
281 272
     
273
+    # Generate metadata first to determine the number of shared samples etc.
282 274
     if (!is.null(sampleMetadata)) {
283 275
       
284 276
       futile.logger::flog.info("Parsing provided metadata...")
... ...
@@ -317,72 +309,35 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
317 309
     
318 310
     GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, .data$has_both) %>% dplyr::pull(.data$sampleID) %>% as.character()
319 311
     
320
-    counts_peaks = as.data.frame(counts_peaks)
321
-    counts_rna = as.data.frame(counts_rna)
322
-    rownames(counts_peaks) = counts_peaks$peakID
323
-    rownames(counts_rna)  = counts_rna$ENSEMBL
324
-    
312
+    ### COUNT MATRICES
313
+    # Store the matrices either as normal or sparse matrix
325 314
     
326
-    futile.logger::flog.info("Storing count matrices...")
327
-    # Store the raw peaks data efficiently as DESeq object only if it contains only integers, otherwise store as normal matrix
315
+    GRN@data$peaks$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["peaks"]], ID_column = "peakID", slotName = "GRN@data$peaks$counts")
316
+
317
+    # includeFiltered = TRUE here as it doesn't make a difference and because getCounts requires counts_metadata$isFiltered to be set already
318
+    GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = FALSE, includeFiltered = TRUE)) 
319
+    stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
328 320
     
329
-    if (isIntegerMatrix(counts_peaks[, GRN@config$sharedSamples])) {
330
-      GRN@data$peaks$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_peaks[, GRN@config$sharedSamples], 
331
-                                                                  colData = dplyr::filter(GRN@data$metadata, .data$has_both) %>% tibble::column_to_rownames("sampleID"),
332
-                                                                  design = ~1)
333
-    } else {
334
-        
335
-      # Store as sparse matrix if enough 0s
336
-      rowSample = min(100, nrow(counts_peaks))
337
-      zeroEntries = length(which(counts_peaks[1:rowSample, GRN@config$sharedSamples] %>% unlist(use.names = FALSE)  == 0))
338
-      fractionZero = zeroEntries / (rowSample * length(GRN@config$sharedSamples)) 
339
-      if (fractionZero> 0.1) {
340
-          futile.logger::flog.info(paste0("Storing GRN@data$peaks$counts_orig matrix as sparse matrix because fraction of 0s is > 0.1 (", fractionZero, ")"))
341
-          GRN@data$peaks$counts_orig = .asSparseMatrix(as.matrix(counts_peaks %>% dplyr::select(-.data$peakID)), 
342
-                                                       convertNA_to_zero = FALSE, 
343
-                                                       dimnames = list(counts_peaks$peakID, colnames(GRN@config$sharedSamples)))
344
-      } else {
345
-          GRN@data$peaks$counts_orig = as.matrix(counts_peaks[, GRN@config$sharedSamples])
346
-      }
321
+    GRN@data$RNA$counts   = .storeAsMatrixOrSparseMatrix(GRN, df =  data.l[["RNA"]], ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts")
347 322
       
348
-     
349
-    }
323
+    GRN@data$RNA$counts_metadata = tibble::tibble(ID = data.l[["RNA"]]$ENSEMBL, isFiltered = FALSE)
350 324
     
351
-    if (isIntegerMatrix(counts_rna[, GRN@config$sharedSamples])) {
352
-      GRN@data$RNA$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_rna[, GRN@config$sharedSamples], 
353
-                                                                colData = dplyr::filter(GRN@data$metadata, .data$has_both) %>% tibble::column_to_rownames("sampleID"),
354
-                                                                design = ~1)   
355
-    } else {
356
-      
357
-      
358
-      # Store as sparse matrix if enough 0s
359
-      rowSample = min(100, nrow(counts_rna))
360
-      zeroEntries = length(which(counts_rna[1:rowSample, GRN@config$sharedSamples] %>% unlist(use.names = FALSE)  == 0))
361
-      fractionZero = zeroEntries / (rowSample * length(GRN@config$sharedSamples)) 
362
-      if (fractionZero> 0.1) {
363
-          futile.logger::flog.info(paste0("Storing GRN@data$RNA$counts_orig matrix as sparse matrix because fraction of 0s is > 0.1 (", fractionZero, ")"))
364
-          GRN@data$RNA$counts_orig = .asSparseMatrix(as.matrix(counts_rna %>% dplyr::select(-.data$ENSEMBL)), 
365
-                                                       convertNA_to_zero = FALSE, 
366
-                                                       dimnames = list(counts_rna$ENSEMBL, colnames(GRN@config$sharedSamples)))
367
-      } else {
368
-          GRN@data$RNA$counts_orig = as.matrix(counts_rna[, GRN@config$sharedSamples])
369
-      }
370
-    }
325
+    GRN@data$RNA$counts_permuted_index = .shuffleColumns(countsRNA.norm.df %>% dplyr::select(-.data$ENSEMBL), .getMaxPermutation(GRN), returnUnshuffled = FALSE, returnAsList = FALSE)
371 326
     
372
-    futile.logger::flog.info(paste0("Creating consensus peaks and check for overlapping peaks..."))
373
-    
374
-    # Consensus peaks
375
-    GRN@data$peaks$consensusPeaks = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)) 
376
-    GRN@data$peaks$consensusPeaks = GRN@data$peaks$consensusPeaks[match(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID, GRN@data$peaks$consensusPeaks$peakID), ]
377
-    stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$consensusPeaks))
327
+    futile.logger::flog.info(paste0( "Final dimensions of data:"))
328
+    futile.logger::flog.info(paste0( " RNA  : ", nrow(countsRNA.norm.df)  , " x ", ncol(countsRNA.norm.df)   - 2, " (rows x columns)"))
329
+    futile.logger::flog.info(paste0( " peaks: ", nrow(countsPeaks.norm.df), " x ", ncol(countsPeaks.norm.df) - 1, " (rows x columns)"))
330
+    # Create permutations for RNA
331
+    futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts"))
378 332
     
379 333
     
334
+    futile.logger::flog.info(paste0("Creating consensus peaks and check for overlapping peaks..."))
380 335
     
336
+    # Consensus peaks
337
+
381 338
     # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details
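    # (Example: the 0-based, end-exclusive interval chr1 0 100 corresponds to chr1:1-100 in 1-based inclusive coordinates and spans 100 bp.)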
382
-    consensus.gr   = .constructGRanges(GRN@data$peaks$consensusPeaks, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE)
383
-    
384
-    
385
-    
339
+    consensus.gr   = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE)
340
+
386 341
     overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1)
387 342
     
388 343
     if (length(overlappingPeaks) > 0) {
... ...
@@ -421,7 +376,31 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
421 376
   
422 377
 }
423 378
 
379
+.storeAsMatrixOrSparseMatrix <- function (GRN, df, ID_column, slotName, threshold = 0.1) {
380
+    
381
+    stopifnot(identical(GRN@config$sharedSamples, colnames(df)[-1]))
382
+    
383
+    # Store as sparse matrix if enough 0s
384
+    checkmate::assertIntegerish(length(GRN@config$sharedSamples), lower = 1)
424 385
 
386
+    df.m = df %>% 
387
+        dplyr::select(tidyselect::one_of(ID_column, GRN@config$sharedSamples)) %>% 
388
+        tibble::column_to_rownames(ID_column) %>%
389
+        as.matrix()
390
+    
391
+    # Determine sparsity
392
+    fractionZero = (length(df.m) - Matrix::nnzero(df.m)) / length(df.m)
393
+    
394
+    
395
+    if (fractionZero > threshold) {
396
+        futile.logger::flog.info(paste0("Storing ", slotName, " matrix as sparse matrix because fraction of 0s is > ", threshold, " (", fractionZero, ")"))
397
+        df.m = .asSparseMatrix(df.m, convertNA_to_zero = FALSE, 
398
+                               dimnames = list(df[, ID_column] %>% unlist(use.names = FALSE), GRN@config$sharedSamples))
399
+    } 
400
+    
401
+    df.m
402
+    
403
+}
425 404
 
426 405
 .createConsensusPeaksDF <- function(countsPeaks, idColumn = "peakID") {
427 406
   
... ...
@@ -439,11 +418,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
439 418
                                 start = as.numeric(sapply(ids.split, "[[", 2)), 
440 419
                                 end   = as.numeric(sapply(ids.split, "[[", 3)),
441 420
                                 peakID = paste0(.data$chr, ":", .data$start, "-", .data$end),
442
-                                isFiltered = FALSE) %>%
443
-    dplyr::arrange(.data$chr, .data$start)
444
-  
445
-  # The sorting is necessary here for subsequent bedtools to run faster or at all
446
-  
421
+                                isFiltered = FALSE)
447 422
   consensus.df
448 423
 }
449 424
 
... ...
@@ -503,52 +478,23 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
503 478
     
504 479
 }
505 480
 
506
-.getAllGeneTypesAndFrequencies <- function(genomeAssembly, verbose = TRUE) {
507
-  
508
-  geneAnnotation.df = .retrieveAnnotationData(genomeAssembly)
509
-  return(table(geneAnnotation.df$gene.type))
510
-  
511
-}
512
-
513
-.getKnownGeneAnnotationNew <- function(GRN, gene.types, extendRegions = NULL) {
514
-  
515
-  checkmate::assertCharacter(GRN@config$parameters$genomeAssembly, len = 1)
516
-  
517
-  if (!is.null(extendRegions)) {
518
-    stop("Not yet implemented")
519
-  }
520
-  
521
-  genes.filt.df = .retrieveAnnotationData(GRN@config$parameters$genomeAssembly)
522
-  
523
-  if (!is.null(gene.types)) {
524
-    if (! "all" %in% gene.types) {
525
-      genes.filt.df = dplyr::filter(genes.filt.df, .data$gene.type %in% gene.types)
526
-    }
527
-  }
528
-  
529
-  
530
-  genes.filt.df
531
-  
532
-}
533 481
 
534 482
 .populatePeakAnnotation <- function (GRN) {
535 483
   
536
-  countsPeaks.clean = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
484
+  countsPeaks.clean = getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = TRUE, includeFiltered = TRUE)
537 485
   
538 486
   futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)"))
539 487
   
540
-  countsPeaks.m = as.matrix(dplyr::select(countsPeaks.clean, -.data$peakID))
541
-  
542
-  rowMeans_peaks   = rowMeans(countsPeaks.m)
543
-  rowMedians_peaks = matrixStats::rowMedians(countsPeaks.m)
544
-  CV_peaks = matrixStats::rowSds(countsPeaks.m) /  rowMeans_peaks
488
+  rowMeans_peaks   = rowMeans(countsPeaks.clean)
489
+  rowMedians_peaks = matrixStats::rowMedians(countsPeaks.clean)
490
+  CV_peaks = matrixStats::rowSds(countsPeaks.clean) /  rowMeans_peaks
545 491
   
546
-  metadata_peaks = tibble::tibble(peak.ID = countsPeaks.clean$peakID, 
492
+  metadata_peaks = tibble::tibble(peak.ID = rownames(countsPeaks.clean), 
547 493
                                   peak.mean = rowMeans_peaks, 
548 494
                                   peak.median = rowMedians_peaks, 
549 495
                                   peak.CV = CV_peaks)
550 496
   
551
-  GRN@annotation$consensusPeaks = metadata_peaks
497
+  GRN@annotation$peaks = metadata_peaks
552 498
 
553 499
   if (!is.installed("ChIPseeker")) {
554 500
       packageMessage = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", 
... ...
@@ -558,8 +504,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
558 504
     
559 505
     futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))
560 506
     genomeAssembly = GRN@config$parameters$genomeAssembly
561
-    consensusPeaks     = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered)
562
-    consensusPeaks.gr  = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
507
+    consensusPeaks     = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered)
508
+    consensusPeaks.gr  = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
563 509
     
564 510
    # Add ChIPseeker annotation
565 511
     peaks.annotated = suppressMessages(ChIPseeker::annotatePeak(
... ...
@@ -579,13 +525,13 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
579 525
       verbose = TRUE # the default
580 526
     ))
581 527
     
582
-    GRN@annotation$consensusPeaks_obj = peaks.annotated
528
+    GRN@annotation$peaks_obj = peaks.annotated
583 529
     
584 530
     peaks.annotated.df = as.data.frame(peaks.annotated)
585 531
     peaks.annotated.df$annotation[grepl("Exon", peaks.annotated.df$annotation)] = "Exon"
586 532
     peaks.annotated.df$annotation[grepl("Intron", peaks.annotated.df$annotation)] = "Intron"
587 533
     
588
-    GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, 
534
+    GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, 
589 535
                                                      peaks.annotated.df  %>% 
590 536
                                                        dplyr::select(.data$peakID, .data$annotation, tidyselect::starts_with("gene"), -.data$geneId, .data$distanceToTSS, .data$ENSEMBL, .data$SYMBOL, .data$GENENAME) %>%
591 537
                                                        dplyr::mutate(annotation  = as.factor(.data$annotation), 
... ...
@@ -621,11 +567,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
621 567
 .populateGeneAnnotation <- function (GRN) {
622 568
 
623 569
 
624
-  countsRNA.clean  = getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE)
570
+  countsRNA.m  = getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE, includeFiltered = TRUE)
625 571
   
626
-  futile.logger::flog.info(paste0(" Calculate statistics for each of the ", nrow(countsRNA.clean), " genes that were provided with the RNA-seq data (mean and CV)"))
572
+  futile.logger::flog.info(paste0(" Calculate statistics for each of the ", nrow(countsRNA.m), " genes that were provided with the RNA-seq data (mean and CV)"))
627 573
   
628
-  countsRNA.m = as.matrix(dplyr::select(countsRNA.clean, -.data$ENSEMBL))
629 574
   
630 575
   rowMeans_rna = rowMeans(countsRNA.m)
631 576
   rowMedians_rna = matrixStats::rowMedians(countsRNA.m)
... ...
@@ -633,7 +578,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
633 578
   
634 579
   genomeAnnotation.df = .retrieveAnnotationData(GRN@config$parameters$genomeAssembly)
635 580
   
636
-  metadata_rna = tibble::tibble(gene.ENSEMBL = countsRNA.clean$ENSEMBL, 
581
+  metadata_rna = tibble::tibble(gene.ENSEMBL = rownames(countsRNA.m), 
637 582
                                 gene.mean = rowMeans_rna, 
638 583
                                 gene.median = rowMedians_rna, 
639 584
                                 gene.CV = CV_rna) %>%
... ...
@@ -663,7 +608,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
663 608
   genome = .getGenomeObject(genomeAssembly, type = "BSgenome")
664 609
   
665 610
   # Get peaks as GRanges object
666
-  query   = .constructGRanges(GRN@data$peaks$consensusPeaks, 
611
+  query   = .constructGRanges(GRN@data$peaks$counts_metadata, 
667 612
                               seqlengths = .getChrLengths(genomeAssembly), 
668 613
                               genomeAssembly)
669 614
   
... ...
@@ -690,7 +635,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
690 635
   
691 636
   #ggplot2::ggplot(GC_classes.df , ggplot2::aes(GC.class, n_rel)) + geom_bar(stat = "identity") + ggplot2::theme_bw()
692 637
   
693
-  GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, GC_content.df, by = "peak.ID") %>%
638
+  GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC_content.df, by = "peak.ID") %>%
694 639
     dplyr::rename( peak.GC.perc    = .data$`G|C`,
695 640
                    peak.width      = .data$length,
696 641
                    peak.GC.class   = .data$GC_class)
... ...
@@ -708,7 +653,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
708 653
 #' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria. Filtered genes / peaks will then be 
709 654
 #' disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and  \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.}
710 655
 #' 
711
+#' All this function does is set (or modify) the filtering flag in \code{GRN@data$peaks$counts_metadata} and \code{GRN@data$RNA$counts_metadata}, respectively.
656
+#' All this function does is setting (or modifying) the filtering flag in \code{GRN@data$peaks$counts_metadata} and \code{GRN@data$RNA$counts_metadata}, respectively.
712 657
 #' 
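
A minimal usage sketch of `filterData`; the argument values below are illustrative only (only `minNormalizedMean_peaks = 5` reflects the documented default), and the call assumes a fully populated `GRN` object:

# Flag (rather than remove) peaks and genes that fail the filters; downstream steps respect these flags
GRN = filterData(GRN,
                 minNormalizedMean_peaks = 5,
                 minNormalizedMeanRNA    = 1,
                 chrToKeep_peaks         = paste0("chr", c(1:22, "X")),
                 minSize_peaks           = 20,
                 maxSize_peaks           = 10000,
                 forceRerun              = TRUE)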
713 658
 #' @template GRN 
714 659
 #' @param minNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default 5. Minimum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
... ...
@@ -750,7 +695,7 @@ filterData <- function (GRN,
750 695
   checkmate::assertNumber(maxNormalizedMean_peaks, lower = minNormalizedMean_peaks , null.ok = TRUE)
751 696
   checkmate::assertNumber(maxNormalizedMeanRNA, lower = minNormalizedMeanRNA, null.ok = TRUE)
752 697
   checkmate::assertCharacter(chrToKeep_peaks, min.len = 1, any.missing = FALSE, null.ok = TRUE)
753
-  checkmate::assertSubset(chrToKeep_peaks, GRN@data$peaks$consensusPeaks %>% dplyr::pull(.data$chr) %>% unique() %>% as.character())
698
+  checkmate::assertSubset(chrToKeep_peaks, GRN@data$peaks$counts_metadata %>% dplyr::pull(.data$chr) %>% unique() %>% as.character())
754 699
   
755 700
   checkmate::assertIntegerish(minSize_peaks, lower = 1, null.ok = TRUE)
756 701
   checkmate::assertIntegerish(maxSize_peaks, lower = dplyr::if_else(is.null(minSize_peaks), 1, minSize_peaks), null.ok = TRUE)
... ...
@@ -762,8 +707,7 @@ filterData <- function (GRN,
762 707
   
763 708
   if (!GRN@config$isFiltered | forceRerun) {
764 709
     
765
-    GRN@data$peaks$consensusPeaks$isFiltered = FALSE
766
-    GRN@data$peaks$counts_norm$isFiltered = FALSE
710
+    GRN@data$peaks$counts_metadata$isFiltered = FALSE
767 711
     
768 712
     if(!is.null(GRN@data$TFs$TF_peak_overlap)) {
769 713
       GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = 0
... ...
@@ -782,7 +726,7 @@ filterData <- function (GRN,
782 726
                                                   chrToKeep_peaks, 
783 727
                                                   minSize_peaks = minSize_peaks, maxSize_peaks = maxSize_peaks)
784 728
     
785
-    nPeaksBefore = nrow(GRN@data$peaks$consensusPeaks)
729
+    nPeaksBefore = nrow(GRN@data$peaks$counts_metadata)
786 730
     peaks_toKeep = intersect(peakIDs.chr, peakIDs.CV)
787 731
     futile.logger::flog.info(paste0("Collectively, filter ", nPeaksBefore -length(peaks_toKeep), " out of ", nPeaksBefore, " peaks."))
788 732
     futile.logger::flog.info(paste0("Number of remaining peaks: ", length(peaks_toKeep)))
... ...
@@ -792,9 +736,9 @@ filterData <- function (GRN,
792 736
         .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
793 737
     }
794 738
     
795
-    GRN@data$peaks$consensusPeaks$isFiltered  = ! GRN@data$peaks$consensusPeaks$peakID  %in% peaks_toKeep
739
+    GRN@data$peaks$counts_metadata$isFiltered  = ! GRN@data$peaks$counts_metadata$peakID  %in% peaks_toKeep
796 740
     #GRN@data$peaks$counts_raw$isFiltered = ! GRN@data$peaks$counts_raw$peakID  %in% peaks_toKeep
797
-    GRN@data$peaks$counts_norm$isFiltered = ! GRN@data$peaks$counts_norm$peakID  %in% peaks_toKeep
741
+    GRN@data$peaks$counts_metadata$isFiltered = ! GRN@data$peaks$counts_metadata$peakID  %in% peaks_toKeep
798 742
     
799 743
     
800 744
     if(!is.null(GRN@data$TFs$TF_peak_overlap)) {
... ...
@@ -816,12 +760,10 @@ filterData <- function (GRN,
816 760
         .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
817 761
     }
818 762
     
819
-    for (permutationCur in c(0)) {
820
-      permIndex = as.character(permutationCur)
821
-      rowMeans = rowMeans(dplyr::select(GRN@data$RNA$counts_norm.l[[permIndex]], -.data$ENSEMBL))
822
-      GRN@data$RNA$counts_norm.l[[permIndex]]$isFiltered = rowMeans < minNormalizedMeanRNA 
823
-    }
824
-    nRowsFlagged = length(which(GRN@data$RNA$counts_norm.l[["0"]]$isFiltered))
763
+    rowMeans = rowMeans(getCounts(GRN, type = "rna", asMatrix = TRUE, includeFiltered = TRUE))
764
+    GRN@data$RNA$counts_metadata$isFiltered = rowMeans < minNormalizedMeanRNA 
765
+
766
+    nRowsFlagged = length(which(GRN@data$RNA$counts_metadata$isFiltered))
825 767
     
826 768
     # Raw counts are left untouched and filtered where needed only
827 769
     futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows because the row mean was smaller than ", minNormalizedMeanRNA))
... ...
@@ -844,26 +786,17 @@ filterData <- function (GRN,
844 786
   }
845 787
   
846 788
   if (is.null(chrToKeep)) {
847
-    chrToKeep = GRN@data$peaks$consensusPeaks %>% dplyr::pull(.data$chr) %>% unique()
789
+    chrToKeep = GRN@data$peaks$counts_metadata %>% dplyr::pull(.data$chr) %>% unique()
848 790
   } else {
849 791
     futile.logger::flog.info(paste0("Filter and sort peaks and remain only those on the following chromosomes: ", paste0(chrToKeep, collapse = ",")))
850 792
   }
851 793
   
852 794
 
853 795
   futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those smaller than : ", maxSize_peaks))
854
-  futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$consensusPeaks)))
855
-  ids = strsplit(GRN@data$peaks$consensusPeaks %>% dplyr::pull(!!(idColumn)), split = ":", fixed = TRUE)
856
-  ids_chr = sapply(ids, "[[", 1)
857
-  ids_pos = sapply(ids, "[[", 2)
858
-  ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE)
859
-  start = sapply(ids_pos, "[[", 1)
860
-  end   = sapply(ids_pos, "[[", 2)
796
+  futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$counts_metadata)))
861 797
   
862
-  countsPeaks.clean = GRN@data$peaks$consensusPeaks %>%
863
-    dplyr::mutate(chr = ids_chr,
864
-                  start = as.numeric(start),
865
-                  end = as.numeric(end),
866
-                  size = end-start) %>%
798
+  countsPeaks.clean = GRN@data$peaks$counts_metadata %>%
799
+    dplyr::mutate(size = end-start) %>%
867 800
     dplyr::filter(.data$chr %in% chrToKeep, .data$size <= maxSize_peaks, .data$size >= minSize_peaks) %>%
868 801
     # arrange(chr, start) %>%
869 802
     dplyr::rename(peakID = !!(idColumn)) %>%
... ...
@@ -883,12 +816,12 @@ filterData <- function (GRN,
883 816
   if (is.null(maxCV)) {
884 817
     futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV))
885 818
     
886
-    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, .data$peak.CV >= minCV)
819
+    peaksFiltered = dplyr::filter(GRN@annotation$peaks, .data$peak.CV >= minCV)
887 820
     
888 821
   } else {
889 822
     futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV, ", max CV = ", maxCV))
890 823
     
891
-    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, .data$peak.CV >= minCV, .data$peak.CV <= maxCV)
824
+    peaksFiltered = dplyr::filter(GRN@annotation$peaks, .data$peak.CV >= minCV, .data$peak.CV <= maxCV)
892 825
   }
893 826
   
894 827
   futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
... ...
@@ -902,7 +835,7 @@ filterData <- function (GRN,
902 835
   
903 836
   startTime = Sys.time()
904 837
   
905
-  futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$consensusPeaks)))
838
+  futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$peaks)))
906 839
   
907 840
   if (is.null(minCV)) {
908 841
     minCV = 0
... ...
@@ -931,7 +864,7 @@ filterData <- function (GRN,
931 864
   }   
932 865
   
933 866
   
934
-  peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, 
867
+  peaksFiltered = dplyr::filter(GRN@annotation$peaks, 
935 868
                                 .data$peak.CV >= minCV, .data$peak.CV <= maxCV, 
936 869
                                 .data$peak.mean >= minMean, .data$peak.mean <= maxMean)
937 870
   
... ...
@@ -1026,7 +959,7 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
1026 959
     
1027 960
     GRN@config$TFBS_fileEnding  = fileEnding
1028 961
     GRN@config$TFBS_filePattern = filesTFBSPattern
1029
-    GRN@annotation$TFs = .getFinalListOfTFs(motifFolder, filesTFBSPattern, fileEnding, TFs, nTFMax, getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE))
962
+    GRN@annotation$TFs = .getFinalListOfTFs(motifFolder, filesTFBSPattern, fileEnding, TFs, nTFMax, getCounts(GRN, type = "rna", permuted = FALSE))
1030 963
     
1031 964
     
1032 965
     GRN@annotation$TFs = GRN@annotation$TFs %>%
... ...
@@ -1162,7 +1095,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1162 1095
     
1163 1096
     
1164 1097
     # Check whether we have peaks on chromosomes not part of the sequence length reference. If yes, discard them
1165
-    annotation_discared = dplyr::filter(GRN@data$peaks$consensusPeaks, ! .data$chr %in% names(seqlengths))
1098
+    annotation_discared = dplyr::filter(GRN@data$peaks$counts_metadata, ! .data$chr %in% names(seqlengths))
1166 1099
     
1167 1100
     if (nrow(annotation_discared) > 0) {
1168 1101
       
... ...
@@ -1174,11 +1107,11 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1174 1107
                        paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ","))
1175 1108
       .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)  
1176 1109
       
1177
-      GRN@data$peaks$consensusPeaks = dplyr::filter(GRN@data$peaks$consensusPeaks, .data$chr %in% names(seqlengths))
1110
+      GRN@data$peaks$counts_metadata = dplyr::filter(GRN@data$peaks$counts_metadata, .data$chr %in% names(seqlengths))
1178 1111
     }
1179 1112
     
1180 1113
     # Construct GRanges
1181
-    consensus.gr   = .constructGRanges(GRN@data$peaks$consensusPeaks, seqlengths = seqlengths, genomeAssembly)
1114
+    consensus.gr   = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = seqlengths, genomeAssembly)
1182 1115
     
1183 1116
     res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = GRN@config$allTF, 
1184 1117
                                iteration = seq_len(length(GRN@config$allTF)), 
... ...
@@ -1199,22 +1132,22 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1199 1132
     }
1200 1133
     
1201 1134
     # new 
1202
-    filteredPeaks = dplyr::filter(GRN@data$peaks$counts_norm, .data$isFiltered) %>% dplyr::pull(.data$peakID)
1135
+    filteredPeaks = dplyr::filter(GRN@data$peaks$counts_metadata, .data$isFiltered) %>% dplyr::pull(.data$peakID)
1203 1136
     # Collect binary 0/1 binding matrix from all TF and concatenate
1204 1137
     GRN@data$TFs$TF_peak_overlap = TFBS_bindingMatrix.df %>%
1205
-      dplyr::mutate(peakID = GRN@data$peaks$consensusPeaks$peakID,
1138
+      dplyr::mutate(peakID = GRN@data$peaks$counts_metadata$peakID,
1206 1139
                     isFiltered = .data$peakID %in% filteredPeaks) %>%
1207 1140
       dplyr::mutate_if(is.logical, as.numeric) %>%
1208 1141
       dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), .data$isFiltered)
1209 1142
     
1210 1143
     GRN@data$TFs$TF_peak_overlap = .asSparseMatrix(as.matrix(GRN@data$TFs$TF_peak_overlap), 
1211 1144
                                                    convertNA_to_zero = FALSE, 
1212
-                                                   dimnames = list(GRN@data$peaks$consensusPeaks$peakID, colnames(GRN@data$TFs$TF_peak_overlap)))
1145
+                                                   dimnames = list(GRN@data$peaks$counts_metadata$peakID, colnames(GRN@data$TFs$TF_peak_overlap)))
1213 1146
     
1214 1147
     # The order of rows is here the sorted version as it originates from the sorted consensus peak file
1215 1148
     # We resort it to match the countsPeaks.norm
1216 1149
     # TODO: Here could be an error due to differences in sorting. Also, whether or not all or the filtered version shall be used
1217
-    stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_norm$peakID))
1150
+    stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_metadata$peakID))
1218 1151
     
1219 1152
   } 
1220 1153
   
... ...
@@ -1274,7 +1207,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1274 1207
   futile.logger::flog.info(paste0(" Calculating intersection for TF ", TFCur, " finished. Number of overlapping TFBS after filtering: ", nrow(final.df)))
1275 1208
   
1276 1209
   
1277
-  return(GRN@data$peaks$consensusPeaks$peakID %in% final.df$peakID)
1210
+  return(GRN@data$peaks$counts_metadata$peakID %in% final.df$peakID)
1278 1211
   
1279 1212
 }
1280 1213
 
... ...
@@ -1415,6 +1348,9 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1415 1348
   checkmate::assertCharacter(name, min.chars = 1, len = 1)
1416 1349
   checkmate::assertFlag(forceRerun)
1417 1350
   
1351
+  message = paste0("This function is currently under development and testing and not fully functional yet.")
1352
+  .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1353
+  
1418 1354
   forbiddenNames = "expression"
1419 1355
   .checkForbiddenNames(name, forbiddenNames)
1420 1356
   
... ...
@@ -1424,7 +1360,8 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1424 1360
     
1425 1361
     
1426 1362
     # Input: Raw peak counts per TF and TF-binding matrix
1427
-    counts.df = getCounts(GRN, type = "peaks", norm = FALSE, permuted = FALSE) %>%
1363
+    # TODO: How to add raw counts here
1364
+    counts.df = getCounts(GRN, type = "peaks", permuted = FALSE) %>%
1428 1365
       tibble::as_tibble()
1429 1366
     
1430 1367
     # TODO replace by getter
... ...
@@ -1722,7 +1659,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1722 1659
   outputFolder = .checkOutputFolder(GRN, outputFolder)
1723 1660
   
1724 1661
   GRN@data$TFs$classification$TF.translation.orig = GRN@annotation$TFs %>%
1725
-    dplyr::mutate(TF.name = .data$HOCOID)
1662
+    dplyr::mutate(TF.name = .data$TF.HOCOID)
1726 1663
   
1727 1664
   if (is.null(GRN@data$TFs$TF_peak_overlap)) {
1728 1665
     message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
... ...
@@ -1763,7 +1700,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1763 1700
         GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]] = list()
1764 1701
         
1765 1702
         if (connectionTypeCur == "expression") {
1766
-          counts1 = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))
1703
+          counts1 = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))
1767 1704
           
1768 1705
         } else {
1769 1706
           
... ...
@@ -1775,7 +1712,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1775 1712
         
1776 1713
         futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
1777 1714
         
1778
-        counts_peaks = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
1715
+        counts_peaks = getCounts(GRN, type = "peaks", permuted = FALSE)
1779 1716
         
1780 1717
         TF_peak_cor = .correlateMatrices(matrix1      = counts1, 
1781 1718
                                          matrix_peaks = counts_peaks, 
... ...
@@ -1792,6 +1729,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1792 1729
       }
1793 1730
       
1794 1731
       # Final classification: Calculate thresholds by calculating the quantiles of the background and compare the real values to the background
1732
+      # TODO: Clarify whether the default convertZero_to_NA=TRUE is really needed here
1795 1733
       if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l) | forceRerun) {
1796 1734
         GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l = 
1797 1735
           .calculate_classificationThresholds(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
... ...
@@ -1905,8 +1843,6 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1905 1843
 
1906 1844
 ######## Connections ########
1907 1845
 
1908
-# TODO: WHere in the code TFs that are not available for expression are filtered?
1909
-
1910 1846
 #' Add TF-peak connections to a \code{\linkS4class{GRN}} object
1911 1847
 #' 
1912 1848
 #' After the execution of this function, QC plots can be plotted with the function \code{\link{plotDiagnosticPlots_TFPeaks}} unless this has already been done by default due to \code{plotDiagnosticPlots = TRUE}
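
A short usage sketch of this workflow (argument names are taken from this commit; it is assumed that `plotDiagnosticPlots_TFPeaks` accepts the GRN object with default arguments):

# Compute TF-peak connections without plotting, then create the QC plots in a separate step
GRN = addConnections_TF_peak(GRN, plotDiagnosticPlots = FALSE)
GRN = plotDiagnosticPlots_TFPeaks(GRN)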
... ...
@@ -1998,6 +1934,9 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1998 1934
       GRN@connections$TF_peaks[[permIndex]]$main            = .optimizeSpaceGRN(stats::na.omit(resFDR.l[["main"]]))
1999 1935
       GRN@connections$TF_peaks[[permIndex]]$connectionStats = resFDR.l[["connectionStats"]] 
2000 1936
       
1937
+      futile.logger::flog.info(paste0("Finished. Stored ", nrow(GRN@connections$TF_peaks[[permIndex]]$main), " connections with an FDR <= ", maxFDRToStore))
1938
+      
1939
+      
2001 1940
       # GC plots, empty when no GC correction should be done
2002 1941
       GRN@stats$plots_GC = resFDR.l[["plots_GC"]]
2003 1942
       rm(resFDR.l)
... ...
@@ -2023,9 +1962,10 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2023 1962
 
2024 1963
 
2025 1964
 .computeTF_peak.fdr <- function(GRN, perm, connectionTypes, corMethod = "pearson", useGCCorrection = FALSE, 
2026
-                                removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, percBackground_resample = TRUE, plotDetails = FALSE) {
1965
+                                removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, percBackground_resample = TRUE, plotDetails = FALSE, backgroundSize_min = 1000) {
2027 1966
   
2028 1967
   start = Sys.time()
1968
+  checkmate::assertIntegerish(backgroundSize_min, lower = 100)
2029 1969
   
2030 1970
   if (plotDetails) {
2031 1971
     message = "Plotting details is not supported currently. Set plotDetails = FALSE."
... ...
@@ -2050,7 +1990,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2050 1990
     
2051 1991
     if (connectionTypeCur == "expression") {
2052 1992
       
2053
-      counts_connectionTypeCur = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))
1993
+      counts_connectionTypeCur = getCounts(GRN, type = "rna", permuted = as.logical(perm))
2054 1994
       
2055 1995
     } else {
2056 1996
       
... ...
@@ -2062,7 +2002,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2062 2002
     
2063 2003
     futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
2064 2004
     
2065
-    counts_peaks = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
2005
+    counts_peaks = getCounts(GRN, type = "peaks", permuted = FALSE)
2066 2006
     
2067 2007
     # Filtering of the matrices happens automatically within the next function
2068 2008
     peaksCor.m = .correlateMatrices( matrix1= counts_connectionTypeCur, 
... ...
@@ -2095,7 +2035,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2095 2035
     
2096 2036
     pb <- progress::progress_bar$new(total = length(allTF))
2097 2037
     
2098
-    peaksFiltered = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered) 
2038
+    peaksFiltered = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered) 
2099 2039
     
2100 2040
     if (!useGCCorrection) {
2101 2041
       minPerc = 100
... ...
@@ -2164,9 +2104,8 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2164 2104
         
2165 2105
         targetNoPeaks = minPerc/100 * nPeaksBackground
2166 2106
         
2167
-        # TODO: Make 1000 a parameter stored somewhere
2168 2107
         # Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions
2169
-        if (targetNoPeaks < 1000) {
2108
+        if (targetNoPeaks < backgroundSize_min) {
2170 2109
           
2171 2110
           if (!is.null(percBackground_size)) {
2172 2111
             #futile.logger::flog.warn(paste0("Number of peaks in background is smaller than 1000 for TF ", TFCur, ". Increasing in steps of 5% until a value > 1000 is found.This warning results from a too low value for the parameter percBackground_size"))
... ...
@@ -2401,15 +2340,13 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2401 2340
       
2402 2341
     } # end for each TF
2403 2342
     
2404
-    futile.logger::flog.info(paste0("Stored ", nrow(tblFilt.df), " connections with an FDR <= ", maxFDRToStore))
2405
-    
2406 2343
     
2407 2344
     .printExecutionTime(start2, prefix = "  ")
2408 2345
     
2409 2346
   } # end for each connectionType
2410 2347
   
2411 2348
   
2412
-  .printExecutionTime(start)
2349
+  .printExecutionTime(start, prefix = "")
2413 2350
   list(main            = data.table::rbindlist(connections_all.l), 
2414 2351
        connectionStats = data.table::rbindlist(connectionStats_all.l), 
2415 2352
        plots_GC        = plots_GC.l
... ...
@@ -2646,14 +2583,23 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2646 2583
   
2647 2584
   # correct ranges if within the promoterRange from the chr. starts and ends
2648 2585
   query = GenomicRanges::trim(query)
2586
+
2587
+  subject = GRN@annotation$genes %>%
2588
+      dplyr::select(-.data$gene.mean, -.data$gene.median, -.data$gene.CV) %>%
2589
+      dplyr::filter(!is.na(.data$gene.start), !is.na(.data$gene.end))
2649 2590
   
2650
-  subject = .getKnownGeneAnnotationNew(GRN, gene.types = gene.types, extendRegions = NULL)
2591
+  if (!is.null(gene.types)) {
2592
+      if (! "all" %in% gene.types) {
2593
+          subject = dplyr::filter(subject, .data$gene.type %in% gene.types)
2594
+      }
2595
+  }
2596
+      
2651 2597
   
2652 2598
   requiredColnames = c("gene.ENSEMBL","gene.type", "gene.name")
2653 2599
   checkmate::assertSubset(requiredColnames, colnames(subject), empty.ok = FALSE)
2654 2600
   
2655 2601
   subject.gr = GenomicRanges::makeGRangesFromDataFrame(subject, keep.extra.columns = TRUE)
2656
-  
2602
+
2657 2603
   if (overlapTypeGene == "TSS") {
2658 2604
     # Take only the 5' end of the gene (start site and NOT the full gene length)
2659 2605
     end(subject.gr) = start(subject.gr)
... ...
@@ -2725,16 +2671,12 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2725 2671
   
2726 2672
   genomeAssembly = GRN@config$parameters$genomeAssembly
2727 2673
   
2728
-  # checkmate::assertSubset(gene.types, c("all", names(.getAllGeneTypesAndFrequencies(GRN@config$parameters$genomeAssembly, verbose = FALSE))), empty.ok = FALSE)
2729
-  
2674
+
2730 2675
   if (is.null(nCores)) {
2731 2676
     nCores = 1
2732 2677
   }
2733 2678
   
2734
-  
2735
-  #consensusPeaks = .createDF_fromCoordinates(getCounts(GRN, type = "peaks", norm = TRUE, 0)$peakID, "peakID")
2736
-  
2737
-  consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered)
2679
+  consensusPeaks = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered)
2738 2680
   # Preprocess TAD boundaries
2739 2681
   if (!is.null(TADs)) {
2740 2682
     
... ...
@@ -2853,12 +2795,12 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2853 2795
   
2854 2796
   permIndex = as.character(perm)
2855 2797
   
2856
-  countsPeaks.clean = dplyr::select(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE), -.data$peakID)
2857
-  countsRNA.clean  = dplyr::select(getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm)), -.data$ENSEMBL)
2798
+  countsPeaks.clean = getCounts(GRN, type = "peaks",  permuted = FALSE, includeIDColumn = FALSE)
2799
+  countsRNA.clean   = getCounts(GRN, type = "rna", permuted = as.logical(perm), includeIDColumn = FALSE)
2858 2800
   
2859 2801
   # Cleverly construct the count matrices so we do the correlations in one go
2860
-  map_peaks = match(overlaps.sub.filt.df$peak.ID,  getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID)
2861
-  map_rna  = match(overlaps.sub.filt.df$gene.ENSEMBL, getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))$ENSEMBL) # may contain NA values because the gene is not actually in the RNA-seq counts
2802
+  map_peaks = match(overlaps.sub.filt.df$peak.ID,  getCounts(GRN, type = "peaks", permuted = FALSE)$peakID)
2803
+  map_rna  = match(overlaps.sub.filt.df$gene.ENSEMBL, getCounts(GRN, type = "rna", permuted = as.logical(perm))$ENSEMBL) # may contain NA values because the gene is not actually in the RNA-seq counts
2862 2804
   
2863 2805
  # There should not be any NA because it is about the peaks
2864 2806
   stopifnot(all(!is.na(map_peaks)))
... ...
@@ -2901,8 +2843,8 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2901 2843
   
2902 2844
   # Make data frame and adjust p-values
2903 2845
   res.df = suppressMessages(tibble::as_tibble(res.m) %>%
2904
-                              dplyr::mutate(peak.ID = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID[map_peaks],
2905
-                                            gene.ENSEMBL = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))$ENSEMBL[map_rna]) %>%
2846
+                              dplyr::mutate(peak.ID = getCounts(GRN, type = "peaks", permuted = FALSE)$peakID[map_peaks],
2847
+                                            gene.ENSEMBL = getCounts(GRN, type = "rna", permuted = as.logical(perm))$ENSEMBL[map_rna]) %>%
2906 2848
                               dplyr::filter(!is.na(.data$gene.ENSEMBL)) %>%  # For some peak-gene combinations, no RNA-Seq data was available, these NAs are filtered
2907 2849
                               # Add gene annotation and distance
2908 2850
                               dplyr::left_join(overlaps.sub.filt.df, by = c("gene.ENSEMBL", "peak.ID")) %>%
... ...
@@ -2999,9 +2941,9 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2999 2941
       # TODO: perm and GRN not known here
3000 2942
       stop("Not implemeneted yet")
3001 2943
       # g = ggplot2::ggplot(dataCur.df, ggplot2::aes(data1, data2)) + ggplot2::geom_point() + ggplot2::geom_smooth(method ="lm") + 
3002
-      #   ggplot2::ggtitle(paste0(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID[map1[i]], 
2944
+      #   ggplot2::ggtitle(paste0(getCounts(GRN, type = "peaks", permuted = FALSE)$peakID[map1[i]], 
3003 2945
       #                  ", ", 
3004
-      #                  getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))$ENSEMBL[map2 [i]], 
2946
+      #                  getCounts(GRN, type = "rna", permuted = as.logical(perm))$ENSEMBL[map2 [i]], 
3005 2947
       #                  ", p = ", round(res$p.value, 3))) + 
3006 2948
       #   ggplot2::theme_bw()
3007 2949
       # plot(g)
... ...
@@ -3028,8 +2970,9 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
3028 2970
 #' First, filtering both TF-peak and peak-gene connections according to different criteria such as FDR and other properties 
3029 2971
 #' Second, joining the three major elements that an eGRN consists of (TFs, peaks, genes) into one data frame, with one row per unique TF-peak-gene connection.
3030 2972
 #' \strong{After successful execution, the connections (along with additional feature metadata) can be retrieved with the function \code{\link{getGRNConnections}}.}
3031
-#' \strong{Note that the eGRN graph is reset upon successful execution of this function, and re-running the function \code{\link{build_eGRN_graph}} 
3032
-#' is necessary when any of the network functions of the package shall be executed.}
2973
+#' \strong{Note that a previously stored eGRN graph is reset upon successful execution of this function (a descriptive warning is printed),
2974
+#'  and the function \code{\link{build_eGRN_graph}} must be re-run before any of the network functions of the package can be used. 
2975
+#' If the filtered connections changed, all network-related enrichment functions also have to be rerun.}
3033 2976
 
3034 2977
 #' Internally, before joining them, both TF-peak links and peak-gene connections are filtered separately for reasons of memory and computational efficiency:
3035 2978
 #' First filtering out unwanted links dramatically reduces the memory needed for the full eGRN. Peak-gene p-value adjustment is only done after all filtering steps on the remaining set of
... ...
@@ -3057,6 +3000,8 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
3057 3000
 #' @param filterPeaks Character vector. Default \code{NULL}. Vector of peak IDs (as named in the GRN object) to retain. All peaks not listed will be filtered out.
3058 3001
 #' @param TF_peak_FDR_selectViaCorBins \code{TRUE} or \code{FALSE}.  Default \code{FALSE}. Use a modified procedure for selecting TF-peak links that is based on the user-specified FDR but that retains also links that may have a higher FDR but a more extreme correlation.
3059 3002
 #' @param silent \code{TRUE} or \code{FALSE}.  Default \code{FALSE}. Print progress messages and filter statistics.
3003
+#' @param resetGraphAndStoreInternally \code{TRUE} or \code{FALSE}.  Default \code{TRUE}. If set to \code{TRUE}, the stored eGRN graph (slot \code{graph}) is reset because the potentially changed connections
3004
+#' would otherwise conflict with the information stored in the object; in this case, an updated \code{\linkS4class{GRN}} object is returned. If set to \code{FALSE}, only the new filtered connections are returned and the object is not altered.
3060 3005
 #' @param filterLoops  \code{TRUE} or \code{FALSE}. Default \code{TRUE}. If a TF regulates itself (i.e., the TF and the gene are the same entity), should such loops be filtered from the GRN?
3061 3006
 #' @template outputFolder
3062 3007
 #' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. 
... ...
@@ -3090,6 +3035,7 @@ filterGRNAndConnectGenes <- function(GRN,
3090 3035
                                      TF_peak_FDR_selectViaCorBins = FALSE,
3091 3036
                                      filterLoops = TRUE,
3092 3037
                                      outputFolder = NULL,
3038
+                                     resetGraphAndStoreInternally = TRUE,
3093 3039
                                      silent = FALSE) {
3094 3040
   
3095 3041
   start = Sys.time()  
... ...
@@ -3112,6 +3058,7 @@ filterGRNAndConnectGenes <- function(GRN,
3112 3058
   
3113 3059
   checkmate::assert(checkmate::checkIntegerish(peak_gene.IHW.nbins, lower = 1), checkmate::checkSubset(peak_gene.IHW.nbins, "auto"))
3114 3060
   
3061
+  checkmate::assertFlag(resetGraphAndStoreInternally)
3115 3062
   checkmate::assertFlag(silent)
3116 3063
   checkmate::assertChoice(peak_gene.selection, c("all", "closest"))
3117 3064
   checkmate::assertFlag(allowMissingTFs)
... ...
@@ -3490,19 +3437,25 @@ filterGRNAndConnectGenes <- function(GRN,
3490 3437
     
3491 3438
   } #end for all permutations
3492 3439
   
3493
-  if (!is.null(GRN@graph)) {
3494
-      futile.logger::flog.info(paste0("Reset the graph slot in the object. Rerun the method build_eGRN_graph."))
3495
-      GRN@graph = list()
3440
+  
3441
+  if (!is.null(GRN@graph) & resetGraphAndStoreInternally) {
3442
+    message = "To avoid object inconsistencies and unexpected or non-reproducible results, the graph slot in the object has been reset. Before using any of the network-related or eGRN visualization functions, rerun the method build_eGRN_graph; all other network-related and enrichment functions also have to be rerun to reflect the new set of filtered connections."
3443
+    .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
3444
+    GRN@graph = list()
3496 3445
   }
3497 3446
  
3498 3447
   
3499 3448
   
3500 3449
   if (silent) futile.logger::flog.threshold(futile.logger::INFO)
3501 3450
   
3502
-  .printExecutionTime(start)
3503
-  
3504
-  GRN
3451
+  if (!silent) .printExecutionTime(start)
3505 3452
   
3453
+  if (!resetGraphAndStoreInternally) {
3454
+    return(GRN@connections$all.filtered)
3455
+  } else {
3456
+    return(GRN)
3457
+  }
3458
+ 
3506 3459
 }
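# Usage sketch for the new resetGraphAndStoreInternally argument (uses the packaged example object;
# the FDR threshold is arbitrary and only serves as an illustration):
GRN = loadExampleObject()
# Default behaviour: the object is updated and a previously stored graph is reset,
# so the eGRN graph has to be rebuilt afterwards
GRN = filterGRNAndConnectGenes(GRN, TF_peak.fdr.threshold = 0.2)
GRN = build_eGRN_graph(GRN)
# resetGraphAndStoreInternally = FALSE: only the filtered connections are returned,
# the object itself (including its graph slot) is left untouched
con.filt.l = filterGRNAndConnectGenes(GRN, TF_peak.fdr.threshold = 0.2,
                                      resetGraphAndStoreInternally = FALSE, silent = TRUE)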
3507 3460
 
3508 3461
 
... ...
@@ -3725,10 +3678,10 @@ add_TF_gene_correlation <- function(GRN, corMethod = "pearson", addRobustRegress
3725 3678
         futile.logger::flog.info(paste0("  Iterate through ", maxRow, " TF-gene combinations and (if possible) calculate correlations using ", nCores, " cores. This may take a few minutes."))
3726 3679
         
3727 3680
         
3728
-        countsRNA.clean  = dplyr::select(getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur)), -.data$ENSEMBL)
3681
+        countsRNA.clean  = getCounts(GRN, type = "rna",  permuted = as.logical(permutationCur), includeIDColumn = FALSE)
3729 3682
         
3730
-        map_TF =   match(TF_genePairs$TF.ENSEMBL, getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL)
3731
-        map_gene = match(TF_genePairs$gene.ENSEMBL, getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL)
3683
+        map_TF =   match(TF_genePairs$TF.ENSEMBL, getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL)
3684
+        map_gene = match(TF_genePairs$gene.ENSEMBL, getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL)
3732 3685
         
3733 3686
         # Some NAs might be expected, given our annotation contains all known genes
3734 3687
         stopifnot(!all(is.na(map_TF)))
... ...
@@ -3760,8 +3713,8 @@ add_TF_gene_correlation <- function(GRN, corMethod = "pearson", addRobustRegress
3760 3713
         
3761 3714
         # Make data frame and adjust p-values
3762 3715
         res.df = suppressMessages(tibble::as_tibble(res.m) %>%
3763
-                                    dplyr::mutate(TF.ENSEMBL   = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL[map_TF],
3764
-                                                  gene.ENSEMBL = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL[map_gene]) %>%
3716
+                                    dplyr::mutate(TF.ENSEMBL   = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL[map_TF],
3717
+                                                  gene.ENSEMBL = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL[map_gene]) %>%
3765 3718
                                     dplyr::filter(!is.na(.data$gene.ENSEMBL), !is.na(.data$TF.ENSEMBL)) %>%  # For some peak-gene combinations, no RNA-Seq data was available, these NAs are filtered
3766 3719
                                     dplyr::left_join(GRN@annotation$TFs, by = c("TF.ENSEMBL")) %>%
3767 3720
                                     dplyr::select(tidyselect::all_of(selectColumns))) %>%
... ...
@@ -3927,7 +3880,8 @@ addSNPOverlap <- function(grn, SNPData, col_chr = "chr", col_pos = "pos", col_pe
3927 3880
 
3928 3881
 #' Generate a summary for the number of connections for different filtering criteria for a \code{\linkS4class{GRN}} object. 
3929 3882
 #' 
3930
-#' This functions calls \code{\link{filterGRNAndConnectGenes}} repeatedly and stores the total number of connections and other statistics each time to summarize them afterwards. All arguments are identical to the ones in \code{\link{filterGRNAndConnectGenes}}, see the help for this function for details.
3883
+#' This function calls \code{\link{filterGRNAndConnectGenes}} repeatedly and stores the total number of connections and other statistics each time to summarize them afterwards. 
3884
+#' All arguments are identical to the ones in \code{\link{filterGRNAndConnectGenes}}; see its help page for details.
3931 3885
 #' The function \code{\link{plot_stats_connectionSummary}} can be used afterwards for plotting.
3932 3886
 #' 
3933 3887
 #' @export
... ...
@@ -3942,6 +3896,7 @@ addSNPOverlap <- function(grn, SNPData, col_chr = "chr", col_pos = "pos", col_pe
3942 3896
 #' @param allowMissingTFs Logical vector of length 1 or 2. Default \code{c(FALSE)}. Allow TFs to be missing for TF-peak connections?  If both \code{FALSE} and \code{TRUE} are given, the code loops over both
3943 3897
 #' @template forceRerun
3944 3898
 #' @seealso \code{\link{plot_stats_connectionSummary}}
3899
+#' @seealso \code{\link{filterGRNAndConnectGenes}}
3945 3900
 #' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function.
3946 3901
 #' @examples 
3947 3902
 #' # See the Workflow vignette on the GRaNIE website for examples
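# A usage sketch in addition to the vignette (parameter values are illustrative; plotting arguments
# of plot_stats_connectionSummary are left at their assumed defaults). With the new behaviour,
# the filtered connections stored in the object are no longer overwritten by this call.
GRN = loadExampleObject()
GRN = generateStatsSummary(GRN, gene.types = c("protein_coding"),
                           allowMissingGenes = c(FALSE, TRUE), allowMissingTFs = c(FALSE))
GRN = plot_stats_connectionSummary(GRN)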
... ...
@@ -3957,8 +3912,7 @@ generateStatsSummary <- function(GRN,
3957 3912
                                  gene.types = c("protein_coding"),
3958 3913
                                  allowMissingGenes = c(FALSE, TRUE),
3959 3914
                                  allowMissingTFs = c(FALSE),
3960
-                                 forceRerun = FALSE
3961
-) {
3915
+                                 forceRerun = FALSE) {
3962 3916
   
3963 3917
   start = Sys.time()   
3964 3918
   
... ...
@@ -4033,7 +3987,7 @@ generateStatsSummary <- function(GRN,
4033 3987
                 futile.logger::flog.debug(paste0("    TF_peak.connectionType = ", TF_peak.connectionTypeCur))
4034 3988
                 
4035 3989
                 futile.logger::flog.threshold(futile.logger::WARN)
4036
-                GRN = filterGRNAndConnectGenes(GRN, 
3990
+                con.filt.l = filterGRNAndConnectGenes(GRN, 
4037 3991
                                                TF_peak.fdr.threshold = TF_peak.fdr_cur, 
4038 3992
                                                TF_peak.connectionTypes = TF_peak.connectionTypeCur, 
4039 3993
                                                peak_gene.p_raw.threshold = NULL, 
... ...
@@ -4043,12 +3997,12 @@ generateStatsSummary <- function(GRN,
4043 3997
                                                allowMissingTFs = allowMissingTFsCur,
4044 3998
                                                peak_gene.r_range = peak_gene.r_range,
4045 3999
                                                filterTFs = NULL, filterGenes = NULL, filterPeaks = NULL,
4046
-                                               silent = FALSE)
4000
+                                               resetGraphAndStoreInternally = FALSE,
4001
+                                               silent = TRUE)
4047 4002
                 
4048 4003
                 futile.logger::flog.threshold(futile.logger::INFO)
4049 4004
                 
4050
-                
4051
-                results.l = .addStats(GRN@stats$connections, GRN@connections$all.filtered[[permIndex]], 
4005
+                results.l = .addStats(GRN@stats$connections, con.filt.l[[permIndex]], 
4052 4006
                                       perm = permutationCur, 
4053 4007
                                       TF_peak.fdr = TF_peak.fdr_cur, TF_peak.connectionType = TF_peak.connectionTypeCur,
4054 4008
                                       peak_gene.p_raw = NA,
... ...
@@ -4245,6 +4199,7 @@ generateStatsSummary <- function(GRN,
4245 4199
 loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://www.embl.de/download/zaugg/GRaNIE/GRN.rds") {
4246 4200
   
4247 4201
   checkmate::assertFlag(forceDownload)
4202
+  options(timeout=200)
4248 4203
     
4249 4204
   if (!is.installed("BiocFileCache")) {
4250 4205
       
... ...
@@ -4304,20 +4259,18 @@ loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://www.embl
4304 4259
 #' 
4305 4260
 #' @template GRN 
4306 4261
 #' @param type Character. Either \code{peaks} or \code{rna}. \code{peaks} corresponds to the counts for the open chromatin data, while \code{rna} refers to the RNA-seq counts. If set to \code{rna}, both permuted and non-permuted data can be retrieved, while for \code{peaks}, only the non-permuted one (i.e., 0) can be retrieved.
4307
-#' @param norm Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should original (often raw, but this may not necessarily be the case) or normalized counts be returned?
4308 4262
 #' @param asMatrix Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If set to \code{FALSE}, counts are returned as a data frame with or without an ID column (see \code{includeIDColumn}). If set to \code{TRUE}, counts are returned as a matrix with the ID column as row names.
4309 4263
 #' @param includeIDColumn Logical. \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Only relevant if \code{asMatrix = FALSE}. If set to \code{TRUE}, an explicit ID column is returned (no row names). If set to \code{FALSE}, the IDs are in the row names instead.
4264
+#' @param includeFiltered  Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If set to \code{FALSE}, genes or peaks marked as filtered (after running the function \code{filterData}) will not be returned. If set to \code{TRUE}, all elements are returned regardless of the currently active filter status.
4310 4265
 #' @template permuted
4311 4266
 #' @export
4312 4267
 #' @import tibble
4313 4268
 #' @examples 
4314 4269
 #' # See the Workflow vignette on the GRaNIE website for examples
4315 4270
 #' GRN = loadExampleObject()
4316
-#' counts.df = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
4271
+#' counts.df = getCounts(GRN, type = "peaks", permuted = FALSE)
4317 4272
 #' @return Data frame of counts, with the type as indicated by the function parameters. This function does **NOT** return a \code{\linkS4class{GRN}} object.
4318
-
4319
-# TODO document and implement two new arguments here
4320
-getCounts <- function(GRN, type, norm = FALSE, permuted = FALSE, asMatrix = FALSE, includeIDColumn = TRUE) {
4273
+getCounts <- function(GRN, type, permuted = FALSE, asMatrix = FALSE, includeIDColumn = TRUE, includeFiltered = FALSE) {
4321 4274
   
4322 4275
   checkmate::assertClass(GRN, "GRN")
4323 4276
   GRN = .addFunctionLogToObject(GRN)     
... ...
@@ -4325,92 +4278,86 @@ getCounts <- function(GRN, type, norm = FALSE, permuted = FALSE, asMatrix = FALS
4325 4278
   GRN = .makeObjectCompatible(GRN)
4326 4279
   
4327 4280
   checkmate::assertChoice(type, c("peaks", "rna"))
4328
-  checkmate::assertFlag(norm)#
4329 4281
   checkmate::assertFlag(permuted)
4330 4282
   
4331 4283
   permIndex = dplyr::if_else(permuted, "1", "0")
4332 4284
   
4333
-  if (!permuted) {
4334
-    if (type == "peaks") {
4335
-      
4336
-      if (norm ) {
4337
-        result = GRN@data$peaks$counts_norm
4338
-        
4339
-      } else {
4340
-            # Handle case when this is null due to already norm. counts as input
4341
-          
4342
-           # TODO: Unify the annotation column 
4343
-            classPeaks = class(GRN@data$peaks$counts_orig)
4344
-            if ("DESeqDataSet" %in% classPeaks) {
4345
-              result = DESeq2::counts(GRN@data$peaks$counts_orig, norm = FALSE) %>% as.data.frame() %>% tibble::rownames_to_column("peakID")
4346
-            } else if ("matrix" %in% classPeaks) {
4347
-                result = GRN@data$peaks$counts_orig %>% dplyr::select(-.data$isFiltered) %>% as.data.frame()
4348
-            } else if ("dgCMatrix" %in% classPeaks) {
4349
-                result = .asMatrixFromSparse(GRN@data$peaks$counts_orig, convertZero_to_NA = FALSE) %>% dplyr::select(-.data$isFiltered) %>% as.data.frame()
4350
-            } else {
4351
-                message = paste0("Unsupported class for GRN@data$peaks$counts_orig. Contact the authors.")
4352
-                .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4353
-            }
4354
-            
4355
-        }
4356
-
4357
-      
4358
-    } else if (type == "rna") {
4359
-      
4360
-      if (norm) {
4361
-            result = GRN@data$RNA$counts_norm.l[[permIndex]]
4362
-        
4363
-      } else {
4364
-            # TODO: Unify the annotation column 
4365
-            classRNA = class(GRN@data$RNA$counts_orig)
4366
-            if ("DESeqDataSet" %in% classRNA) {
4367
-                result = DESeq2::counts(GRN@data$RNA$counts_orig, norm = FALSE) %>% 
4368
-                    as.data.frame() %>% tibble::rownames_to_column("ENSEMBL")
4369
-            } else if ("matrix" %in% classRNA) {
4370
-                result = GRN@data$RNA$counts_orig %>% 
4371
-                    dplyr::select(-.data$isFiltered) %>% as.data.frame()
4372
-            } else if ("dgCMatrix" %in% classRNA) {
4373
-                result = .asMatrixFromSparse(GRN@data$RNA$counts_orig, convertZero_to_NA = FALSE) %>% 
4374
-                    dplyr::select(-.data$isFiltered) %>% as.data.frame() %>% tibble::rownames_to_column("ENSEMBL")
4375
-            } else {
4376
-                message = paste0("Unsupported class for GRN@data$RNA$counts_orig. Contact the authors.")
4377
-                .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4378
-            }
4379
-        
4380
-      }
4381
-        
4382
-    }
4383
-      
4384
-  } else { # permutation 1
4285
+  if (type == "peaks") {
4385 4286
     
4386
-    if (type == "peaks") {
4287
+    if (permuted) {
4387 4288
       message = "Could not find permuted peak counts in GRN object. Peaks are not stored as permuted, set permuted = FALSE for type = \"peaks\""
4388 4289
       .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4290
+    }
4291
+    
4292
+    classPeaks = class(GRN@data$peaks$counts)
4293
+    if ("matrix" %in% classPeaks) {
4294
+      result = GRN@data$peaks$counts
4295
+    } else if ("dgCMatrix" %in% classPeaks) {
4296
+      result = .asMatrixFromSparse(GRN@data$peaks$counts, convertZero_to_NA = FALSE)
4297
+    } else {
4298
+      message = paste0("Unsupported class for GRN@data$peaks$counts. Contact the authors.")
4299
+      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4300
+    }
4301
+    
4302
+    
4303
+  } else if (type == "rna") {
4304
+    
4305
+    classRNA = class(GRN@data$RNA$counts)
4306
+    if ("matrix" %in% classRNA) {
4307
+      result = GRN@data$RNA$counts
4308
+    } else if ("dgCMatrix" %in% classRNA) {
4309
+      result = .asMatrixFromSparse(GRN@data$RNA$counts, convertZero_to_NA = FALSE)
4389 4310
     } else {
4311
+      message = paste0("Unsupported class for GRN@data$RNA$counts. Contact the authors.")
4312
+      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4313
+    }
4314
+    
4315
+  }
4316
+  
4317
+  
4318
+  if (permuted) {
4319
+    
4320
+    if (type == "rna") {
4390 4321
       
4391
-      if (norm) {
4392
-        shuffledColumnOrder = GRN@data$RNA$counts_norm.l[[permIndex]]
4393
-        
4394
-        # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses
4395
-        result = GRN@data$RNA$counts_norm.l[["0"]][shuffledColumnOrder] %>%
4396
-          dplyr::select(.data$ENSEMBL, tidyselect::everything())
4397
-        
4398
-      } else {
4399
-        message= "Could not find counts in object."
4400
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4401
-      }
4322
+      # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses
4323
+      colnames(result) = colnames(result)[GRN@data$RNA$counts_permuted_index]
4402 4324
     }
4325
+  } 
4326
+  
4327
+  if (!includeFiltered) {
4328
+    
4329
+    if (type == "rna") {
4330
+      nonFiltered = which(! GRN@data$RNA$counts_metadata$isFiltered)
4331
+    } else {
4332
+      nonFiltered = which(! GRN@data$peaks$counts_metadata$isFiltered)
4333
+    }
4334
+    
4335
+    result = result[nonFiltered,]
4403 4336
   }
4337
+
4338
+  
4404 4339
   
4405
-  if ("isFiltered" %in% colnames(result)) {
4406
-    result = dplyr::filter(result, !.data$isFiltered) %>% dplyr::select(-.data$isFiltered)
4340
+  if (!asMatrix) {
4341
+ 
4342
+    result.df =  result %>%
4343
+      as.data.frame()
4344
+    
4345
+    if (includeIDColumn)  {
4346
+      
4347
+      ID_column = dplyr::if_else(type == "rna", "ENSEMBL", "peakID")
4348
+      
4349
+      result.df = result.df %>%
4350
+        tibble::rownames_to_column(ID_column) 
4351
+    } 
4352
+    
4353
+    return(result.df)
4354
+
4407 4355
   }
4408 4356
   
4409 4357
   result
4410 4358
 }
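# Usage sketch for the reworked getCounts() accessor (uses the packaged example object):
GRN = loadExampleObject()
# normalized counts as a data frame with an explicit ID column (the default)
rna.df = getCounts(GRN, type = "rna", permuted = FALSE)
# the same counts as a matrix with IDs as row names, e.g. for correlation-based analyses
rna.m = getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE)
# permuted RNA counts: columns are relabeled to non-matching samples for the background model
rna.perm.df = getCounts(GRN, type = "rna", permuted = TRUE)
# peak counts including features that filterData() marked as filtered
peaks.all.df = getCounts(GRN, type = "peaks", permuted = FALSE, includeFiltered = TRUE)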
4411 4359
 
4412 4360
 
4413
-
4414 4361
 #' Extract connections or links from a \code{\linkS4class{GRN}} object as a data frame.
4415 4362
 #' 
4416 4363
 #' Returns stored connections/links (either TF-peak, peak-genes or the filtered set of connections as produced by \code{\link{filterGRNAndConnectGenes}}). 
... ...
@@ -4479,7 +4426,7 @@ getGRNConnections <- function(GRN, type = "all.filtered",  permuted = FALSE,
4479 4426
     if (include_variancePartitionResults) {
4480 4427
         
4481 4428
         if (ncol(GRN@annotation$genes %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0 |
4482
-            ncol(GRN@annotation$consensusPeaks %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0) {
4429
+            ncol(GRN@annotation$peaks %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0) {
4483 4430
             message = "Please run the function add_featureVariation first. "
4484 4431
             .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4485 4432
         }
... ...
@@ -4491,7 +4438,7 @@ getGRNConnections <- function(GRN, type = "all.filtered",  permuted = FALSE,
4491 4438
             dplyr::left_join(GRN@annotation$genes %>% 
4492 4439
                                  dplyr::select(.data$gene.ENSEMBL, tidyselect::starts_with("gene.variancePartition")), 
4493 4440
                              by = "gene.ENSEMBL")  %>%
4494
-            dplyr::left_join(GRN@annotation$consensusPeaks %>% 
4441
+            dplyr::left_join(GRN@annotation$peaks %>% 
4495 4442
                                  dplyr::select(.data$peak.ID, tidyselect::starts_with("peak.variancePartition")), 
4496 4443
                              by = "peak.ID")
4497 4444
         
... ...
@@ -4508,10 +4455,10 @@ getGRNConnections <- function(GRN, type = "all.filtered",  permuted = FALSE,
4508 4455
     
4509 4456
     if (include_peakMetadata) {
4510 4457
         
4511
-        colsMissing = setdiff(colnames(GRN@annotation$consensusPeaks), colnames(merged.df))
4458
+        colsMissing = setdiff(colnames(GRN@annotation$peaks), colnames(merged.df))
4512 4459
         if (length(colsMissing) > 0) {
4513 4460
             merged.df = merged.df %>%
4514
-                dplyr::left_join(GRN@annotation$consensusPeaks, by = "peak.ID")
4461
+                dplyr::left_join(GRN@annotation$peaks, by = "peak.ID")
4515 4462
         }
4516 4463
     }
4517 4464
     
... ...
@@ -4809,27 +4756,27 @@ getBasic_metadata_visualization <- function(GRN, forceRerun = FALSE) {
4809 4756
   
4810 4757
   if (is.null(GRN@visualization$metadata) | forceRerun) {
4811 4758
     
4812
-    expMeans.m = getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE) %>% dplyr::select(-.data$ENSEMBL) %>% as.matrix()
4759
+    expMeans.m = getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE)
4813 4760
     
4814 4761
     baseMean = rowMeans(expMeans.m)
4815
-    expression.df = tibble::tibble(ENSEMBL_ID = getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE)$ENSEMBL, baseMean = baseMean) %>%
4762
+    expression.df = tibble::tibble(ENSEMBL_ID = getCounts(GRN, type = "rna", permuted = FALSE, includeIDColumn = TRUE)$ENSEMBL, baseMean = baseMean) %>%
4816 4763
       dplyr::mutate(ENSEMBL_ID = gsub("\\..+", "", .data$ENSEMBL_ID, perl = TRUE),
4817 4764
                     baseMean_log = log2(baseMean + 0.01))
4818 4765
     
4819 4766
     expression_TF.df = dplyr::filter(expression.df, .data$ENSEMBL_ID %in% GRN@annotation$TFs$TF.ENSEMBL) %>%
4820 4767
       dplyr::left_join(GRN@annotation$TFs, by = c("ENSEMBL_ID" = "TF.ENSEMBL"))
4821 4768
     
4822
-    meanPeaks.df = tibble::tibble(peakID = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID, 
4823
-                                  mean = rowMeans(dplyr::select(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE), -.data$peakID))) %>%
4769
+    meanPeaks.df = tibble::tibble(peakID = getCounts(GRN, type = "peaks", permuted = FALSE)$peakID, 
4770
+                                  mean = rowMeans(getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = TRUE))) %>%
4824 4771
       dplyr::mutate(mean_log = log2(mean + 0.01))
4825 4772
     
4826
-    metadata = list("RNA_expression_genes" = expression.df,
4773
+    GRN@visualization$metadata = list("RNA_expression_genes" = expression.df,
4827 4774
                     "RNA_expression_TF"    = expression_TF.df,
4828 4775
                     "Peaks_accessibility"   = meanPeaks.df)
4829 4776
     
4830 4777
   } 
4831 4778
   
4832
-  metadata
4779
+  GRN
4833 4780
   
4834 4781
 }
4835 4782
 
... ...
@@ -5113,22 +5060,17 @@ add_featureVariation <- function (GRN,
5113 5060
             futile.logger::flog.info(paste0("Using ", length(peaksToInclude), " peaks and ", length(genesToInclude), " TF/genes from the filtered connections."))
5114 5061
             
5115 5062
         } else if (features == "all") {
5116
-            genesToInclude = GRN@data$RNA$counts_norm.l[["0"]]$ENSEMBL
5117
-            peaksToInclude = GRN@data$peaks$counts_norm$peakID
5063
+            genesToInclude = GRN@data$RNA$counts_metadata$ENSEMBL
5064
+            peaksToInclude = GRN@data$peaks$counts_metadata$peakID
5118 5065
             
5119 5066
             futile.logger::flog.info(paste0("Using all ", length(peaksToInclude), " peaks and ", length(genesToInclude), " TF/genes. This may take a long time. Consider setting features = \"all_filtered\"."))
5120 5067
         }
5121 5068
         
5122 5069
         data.l = list()
5123
-        data.l[["RNA"]] <- GRN@data$RNA$counts_norm.l[["0"]] %>% 
5124
-            dplyr::filter(!.data$isFiltered, .data$ENSEMBL %in% genesToInclude) %>%
5125
-            dplyr::select(-.data$isFiltered) %>% 
5126
-            tibble::column_to_rownames(var = "ENSEMBL")
5127
-        
5128
-        data.l[["peaks"]] <- GRN@data$peaks$counts_norm %>% 
5129
-            dplyr::filter(!.data$isFiltered, .data$peakID %in% peaksToInclude) %>%
5130
-            dplyr::select(-.data$isFiltered) %>%
5131
-            tibble::column_to_rownames(var = "peakID")  
5070
+        #TODO
5071
+        data.l[["RNA"]] <- getCounts(GRN, type = "rna", permuted = FALSE, includeFiltered = FALSE, includeIDColumn = FALSE)
5072
+          
5073
+        data.l[["peaks"]] <-  getCounts(GRN, type = "peaks", permuted = FALSE, includeFiltered = FALSE, includeIDColumn = FALSE)
5132 5074
         
5133 5075
         
5134 5076
         for (dataType in c("RNA", "peaks")) {
... ...
@@ -5169,7 +5111,7 @@ add_featureVariation <- function (GRN,
5169 5111
                 
5170 5112
                 colnames(res)[2:ncol(res)] = paste0("peak.", colnames(res)[2:ncol(res)])
5171 5113
                 
5172
-                GRN@annotation$consensusPeaks = GRN@annotation$consensusPeaks %>%
5114
+                GRN@annotation$peaks = GRN@annotation$peaks %>%
5173 5115
                     dplyr::select(-tidyselect::starts_with("variancePart")) %>%
5174 5116
                     dplyr::left_join(res, by = c("peak.ID" = "ID"))
5175 5117
             }
... ...
@@ -5198,34 +5140,88 @@ add_featureVariation <- function (GRN,
5198 5140
     }
5199 5141
 }
5200 5142
 
5143
+# Converts from an older GRN object format to the most current one due to internal optimizations
5201 5144
 .makeObjectCompatible <- function(GRN) {
5202 5145
     
5203
-    if (is.null(GRN@annotation$TFs)) {
5146
+    if (is.null(GRN@annotation$TFs) & !is.null(GRN@data$TFs$translationTable)) {
5204 5147
         GRN@annotation$TFs = GRN@data$TFs$translationTable
5205 5148
         GRN@data$TFs$translationTable = NULL
5206 5149
     }
5207 5150
     
5208
-    if (!"TF.ENSEMBL" %in% colnames(GRN@annotation$TFs)) {
5209
-        GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.ENSEMBL = .data$ENSEMBL)
5210
-    }
5211
-    if (!"TF.HOCOID" %in% colnames(GRN@annotation$TFs)) {
5212
-        GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.HOCOID = .data$HOCOID)
5213
-    }
5151
+    if (!is.null(GRN@annotation$TFs)) {
5214 5152
     
5215
-    if (!"ENSEMBL" %in% colnames(GRN@annotation$TFs)) {
5216
-        GRN@annotation$TFs = dplyr::select(GRN@annotation$TFs, -.data$ENSEMBL)
5153
+        if (!"TF.ENSEMBL" %in% colnames(GRN@annotation$TFs)) {
5154
+          GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.ENSEMBL = .data$ENSEMBL)
5155
+        }
5156
+        if (!"TF.HOCOID" %in% colnames(GRN@annotation$TFs)) {
5157
+          GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.HOCOID = .data$HOCOID)
5158
+        }
5159
+        
5160
+        if ("ENSEMBL" %in% colnames(GRN@annotation$TFs)) {
5161
+          GRN@annotation$TFs = dplyr::select(GRN@annotation$TFs, -.data$ENSEMBL)
5162
+        }
5217 5163
     }
5218
-   
5219
-    
5164
+
5220 5165
     
5221 5166
     # Temporary fix: Replace lncRNA with lincRNA due to a recent change in biomaRt until we update the object directly
5222 5167
     if ("lncRNA" %in% levels(GRN@annotation$genes$gene.type)) {
5223 5168
         GRN@annotation$genes = dplyr::mutate(GRN@annotation$genes, gene.type = dplyr::recode_factor(.data$gene.type, lncRNA = "lincRNA")) 
5224 5169
     }
5225
-   
5226 5170
     
5171
+    if (is.null(GRN@annotation$peaks) & !is.null(GRN@annotation$consensusPeaks)) {
5172
+        GRN@annotation[["peaks"]] = GRN@annotation[["consensusPeaks"]]
5173
+        GRN@annotation[["consensusPeaks"]] = NULL
5174
+        GRN@annotation[["peaks_obj"]] = GRN@annotation[["consensusPeaks_obj"]]
5175
+        GRN@annotation[["consensusPeaks_obj"]] = NULL
5176
+        
5177
+    }
5178
+    
5179
+    # Renamed count slots and their structure
5180
+    # 1. peaks
5181
+    if (!is.null(GRN@data$peaks[["counts_orig"]])) {
5182
+        GRN@data$peaks[["counts_orig"]] = NULL
5183
+    }
5184
+    if (is.null(GRN@data$peaks[["counts"]])) {
5185
+        GRN@data$peaks[["counts"]] = .storeAsMatrixOrSparseMatrix(GRN, df = GRN@data$peaks$counts_norm %>% dplyr::select(-.data$isFiltered), 
5186
+                                                                  ID_column = "peakID", slotName = "GRN@data$peaks$counts")
5187
+    }
5188
+    if (!is.null(GRN@data$peaks[["counts_norm"]])) {
5189
+        GRN@data$peaks[["counts_norm"]] = NULL
5190
+    }
5191
+    
5192
+    if (is.null(GRN@data$peaks[["counts_metadata"]])) {
5193
+        GRN@data$peaks[["counts_metadata"]] = .createConsensusPeaksDF(GRN@data$peaks[["counts"]] %>% as.data.frame() %>% tibble::rownames_to_column("peakID")) 
5194
+        stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
5195
+    }
5196
+    if (!is.null(GRN@data$peaks[["consensusPeaks"]])) {
5197
+        GRN@data$peaks[["consensusPeaks"]] = NULL
5198
+    }
5199
+    
5200
+    # 2. RNA
5201
+    if (!is.null(GRN@data$RNA[["counts_orig"]])) {
5202
+        GRN@data$RNA[["counts_orig"]] = NULL
5203
+    }
5204
+    if (is.null(GRN@data$RNA[["counts"]]) & !is.null(GRN@data$RNA$counts_norm.l[["0"]])) {
5205
+        GRN@data$RNA[["counts"]] = .storeAsMatrixOrSparseMatrix(GRN, df = GRN@data$RNA$counts_norm.l[["0"]] %>% dplyr::select(-.data$isFiltered), 
5206
+                                                                ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts")
5207
+    }
5208
+    if (!is.null(GRN@data$RNA[["counts_norm.l"]])) {
5209
+        GRN@data$RNA[["counts_norm.l"]] = NULL
5210
+    }
5211
+    
5212
+    if (is.null(GRN@data$RNA[["counts_metadata"]]) & !is.null(GRN@data$RNA$counts)) {
5213
+        GRN@data$RNA[["counts_metadata"]] = tibble::tibble(ENSEMBL = rownames(GRN@data$RNA$counts), isFiltered = FALSE)
5214
+    }
5215
+    
5216
+    if (is.null(GRN@data$RNA[["counts_permuted_index"]]) & !is.null(GRN@data$RNA$counts)) {
5217
+        GRN@data$RNA[["counts_permuted_index"]] = .shuffleColumns(as.data.frame(GRN@data$RNA$counts), 
5218
+                                                             .getMaxPermutation(GRN), returnUnshuffled = FALSE, returnAsList = FALSE)
5219
+    }
5220
+
5227 5221
     
5228 5222
     GRN
5229 5223
 }
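# Sketch of how the conversion is triggered in practice (the RDS path below is a placeholder for an
# object saved with an older GRaNIE version). The helper is internal and is called automatically by
# the exported functions, e.g. getCounts() above, so no manual invocation is needed.
GRN_old = readRDS("GRN_old_version.rds")                        # hypothetical file
counts.df = getCounts(GRN_old, type = "rna", permuted = FALSE)  # converts the object on the fly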
5230 5224
 
5225
+
5226
+
5231 5227
   
5232 5228
\ No newline at end of file
... ...
@@ -511,7 +511,7 @@
511 511
     
512 512
     output.global.TFs.merged = output.global.TFs %>%
513 513
       dplyr::filter(!!as.name(colnameClassificationCur) != "not-expressed")  %>%
514
-      dplyr::full_join(TF.specific, by = c( "TF" = "HOCOID"))  %>%
514
+      dplyr::full_join(TF.specific, by = c( "TF" = "TF.HOCOID"))  %>%
515 515
       dplyr::mutate(baseMeanNorm = (.data$baseMean - min(.data$baseMean, na.rm = TRUE)) / (max(.data$baseMean, na.rm = TRUE) - min(.data$baseMean, na.rm = TRUE)) + par.l$minPointSize)   %>%
516 516
       dplyr::filter(!is.na(!!as.name(colnameClassificationCur))) 
517 517
     
... ...
@@ -640,22 +640,22 @@
640 640
     futile.logger::flog.info(paste0("Plotting AR heatmap", dplyr::if_else(is.null(file), "", paste0(" to file ", file))))
641 641
     
642 642
     
643
-    missingGenes = which(!HOCOMOCO_mapping.df.exp$HOCOID %in% colnames(sort.cor.m))
643
+    missingGenes = which(!HOCOMOCO_mapping.df.exp$TF.HOCOID %in% colnames(sort.cor.m))
644 644
     if (length(missingGenes) > 0) {
645
-        HOCOMOCO_mapping.df.exp = dplyr::filter(HOCOMOCO_mapping.df.exp, .data$HOCOID %in% colnames(sort.cor.m))
645
+        HOCOMOCO_mapping.df.exp = dplyr::filter(HOCOMOCO_mapping.df.exp, .data$TF.HOCOID %in% colnames(sort.cor.m))
646 646
     }
647 647
     
648 648
     if (!is.null(file)) {
649 649
          grDevices::pdf(file, ...)
650 650
     }
651 651
     
652
-    cor.r.filt.m <- sort.cor.m[,as.character(HOCOMOCO_mapping.df.exp$HOCOID)]
652
+    cor.r.filt.m <- sort.cor.m[,as.character(HOCOMOCO_mapping.df.exp$TF.HOCOID)]
653 653
     
654
-    stopifnot(identical(colnames( cor.r.filt.m), as.character(HOCOMOCO_mapping.df.exp$HOCOID)))
654
+    stopifnot(identical(colnames( cor.r.filt.m), as.character(HOCOMOCO_mapping.df.exp$TF.HOCOID)))
655 655
     
656 656
     BREAKS = seq(-1,1,0.05)
657 657
     diffDensityMat = matrix(NA, nrow = ncol( cor.r.filt.m), ncol = length(BREAKS) - 1)
658
-    rownames(diffDensityMat) = HOCOMOCO_mapping.df.exp$HOCOID
658
+    rownames(diffDensityMat) = HOCOMOCO_mapping.df.exp$TF.HOCOID
659 659
     
660 660
     TF_Peak_all.m <- TF.peakMatrix.df
661 661
     TF_Peak.m <- TF_Peak_all.m
... ...
@@ -675,7 +675,7 @@
675 675
     
676 676
     ## check to what extent the number of TF motifs affects the density values
677 677
     n_min = dplyr::if_else(colSums(TF_Peak.m) < nrow(TF_Peak.m),colSums(TF_Peak.m), nrow(TF_Peak.m) - colSums(TF_Peak.m))
678
-    names(n_min) = HOCOMOCO_mapping.df.exp$HOCOID#[match(names(n_min), as.character(tf2ensg$ENSEMBL))]
678
+    names(n_min) = HOCOMOCO_mapping.df.exp$TF.HOCOID#[match(names(n_min), as.character(tf2ensg$ENSEMBL))]
679 679
     n_min <- sapply(split(n_min,names(n_min)),sum)
680 680
     
681 681
     # Make sure n_min and diffDenityMat are compatible because some NA rows may have been filtered out for diffDensityMat
... ...
@@ -336,7 +336,7 @@
336 336
     hashesStrError = paste0(paste0(rep("#", nchar(lastPartError) - 1), collapse = ""), "\n")
337 337
     messageError    = paste0(objectPart, checkResult, "\n\n", hashesStrError, lastPartError, hashesStrError)
338 338
     
339
-    lastPartWarning = ". \nThis warning may or may not be ignored. Carefully check its significance and whether it may affect the results."
339
+    lastPartWarning = ". \nThis warning may or may not be safe to ignore. Carefully check its significance and whether it may affect the results.\n"
340 340
     #hashesStrWarning = paste0(paste0(rep("#", nchar(lastPartWarning) - 1), collapse = ""), "\n")
341 341
     messageWarning  = paste0(objectPart, checkResult, lastPartWarning) # , hashesStrWarning)
342 342
     
... ...
@@ -293,13 +293,13 @@ performAllNetworkAnalyses <- function(GRN, ontology = c("GO_BP", "GO_MF"),
293 293
 
294 294
   } else if (type == "all_RNA") {
295 295
     
296
-      backgroundGenes = GRN@data$RNA$counts_norm.l[["0"]] %>%
296
+      backgroundGenes = GRN@data$RNA$counts_metadata %>%
297 297
           dplyr::pull(.data$ENSEMBL)
298 298
     
299 299
     
300 300
   } else if (type == "all_RNA_filtered") {
301 301
       
302
-      backgroundGenes = GRN@data$RNA$counts_norm.l[["0"]] %>%
302
+      backgroundGenes = GRN@data$RNA$counts_metadata %>%
303 303
           dplyr::filter(!.data$isFiltered) %>%
304 304
           dplyr::pull(.data$ENSEMBL)
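# Sketch of how these background choices are typically requested from the user-facing enrichment
# functions (the argument name `background` is inferred from the surrounding code and should be
# treated as an assumption): "all_RNA" uses every gene present in the RNA counts, whereas
# "all_RNA_filtered" keeps only genes that passed filterData().
GRN = loadExampleObject()
GRN = calculateGeneralEnrichment(GRN, ontology = "GO_BP", background = "all_RNA_filtered")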
305 305
       
... ...
@@ -381,13 +381,13 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"),
381 381
   checkmate::assertChoice(pAdjustMethod, c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"))
382 382
   checkmate::assertFlag(forceRerun)
383 383
   
384
-  
384
+  futile.logger::flog.info(paste0("Calculating general enrichment. This may take a while"))
385 385
   
386 386
   
387 387
   mapping = .getGenomeObject(GRN@config$parameters$genomeAssembly, type = "packageName")
388 388
   backgroundGenes = .getBackgroundGenes(GRN, type = background, gene.types = background_geneTypes)
389 389
   
390
-  futile.logger::flog.info(paste0("Calculating general enrichment statistics. This may take a while"))
390
+ 
391 391
   
392 392
   if (is.null(GRN@stats[["Enrichment"]][["general"]])) {
393 393
     GRN@stats[["Enrichment"]][["general"]] = list()
... ...
@@ -480,7 +480,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"),
480 480
   
481 481
 }
482 482
 
483
-.checkEnrichmentCongruence_general_community <-function(GRN, type = "community") {
483
+.checkEnrichmentCongruence_general <-function(GRN, type = "community") {
484 484
   
485 485
   allOntologiesGeneral = sort(names(GRN@stats$Enrichment$general))
486 486
   
... ...
@@ -491,14 +491,21 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"),
491 491
   }
492 492
   
493 493
   if (!identical(allOntologiesGeneral, allOntologiesGroup1)) {
494
-    message = paste0("General enrichment and ", type, " enrichment do not have the same ontologies precalculated (",
495
-                     paste0(allOntologiesGeneral, collapse = " & "), " vs. ", paste0(allOntologiesGroup1, collapse = "&"), ")")
496
-    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
494
+    message = paste0("General enrichment and ", type, " enrichment do not have the same ontologies precalculated (\"",
495
+                     paste0(allOntologiesGeneral, collapse = " & "), "\" vs. \"", paste0(allOntologiesGroup1, collapse = " & "), "\"). ",
496
+                     "Run the function calculateGeneralEnrichment to eliminate this warning.")
497
+    .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
497 498
   }
498
-  allOntologiesGeneral
499
+  
500
+  res.l = list(allOntologiesGroup1, allOntologiesGeneral)
501
+  names(res.l) = c(type, "all")
502
+  
503
+  res.l
499 504
   
500 505
 }
501 506
 
507
+
508
+
502 509
 #' @importFrom biomaRt useEnsembl getBM
503 510
 .runEnrichment <- function(GRN, foreground, background, backgroundStr, ontology, 
504 511
                            description = "Enrichment Analysis",
... ...
@@ -584,6 +591,19 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"),
584 591
   
585 592
   
586 593
   if (ontology %in% c("GO_BP","GO_MF","GO_CC")){
594
+      
595
+      # go_enrichment =  
596
+      #     clusterProfiler::enrichGO(
597
+      #         gene = foreground_entrez,
598
+      #         OrgDb = 'org.Hs.eg.db', 
599
+      #         ont = sub("GO_", "", ontology),
600
+      #         universe = background_entrez,
601
+      #         keyType = "ENTREZID",
602
+      #         pvalueCutoff = 1,
603
+      #         qvalueCutoff = 1,
604
+      #         minGSSize = minGSSize,
605
+      #         maxGSSize = maxGSSize,
606
+      #         pAdjustMethod = pAdjustMethod)
587 607
     
588 608
     go_enrichment = suppressMessages(new("topGOdata",
589 609
                                          ontology = gsub("GO_", "", ontology),
... ...
@@ -594,12 +614,23 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"),
594 614
                                          mapping = mapping, 
595 615
                                          ID = "ensembl"))
596 616
     
617
+    # retrieve genes2GO list from the "expanded" annotation in GOdata
618
+    allGO = topGO::genesInTerm(go_enrichment)
619
+    allGO_inForeground = lapply(allGO, function(x) paste0(x[x %in% foreground], collapse = ",")) %>%
620
+        as.data.frame() %>% t() 
621
+    
622
+    allGO_inForeground.df = tibble::tibble(ID = rownames(allGO_inForeground), gene.ENSEMBL_foreground = allGO_inForeground[, 1]) %>%
623
+        dplyr::mutate(ID = sub(".", ":", .data$ID, fixed = TRUE))
624
+        
625
+
626
+    
597 627
     result = suppressMessages(topGO::runTest(go_enrichment, algorithm = algorithm, statistic = statistic))
598 628
     # Dont trim GO terms here, happens later when plotting
599 629
     result.tbl = unique(topGO::GenTable(go_enrichment, pval = result, orderBy = "pval", numChar = 1000, 
600 630
                                         topNodes = length(topGO::score(result))) ) %>%
601 631
       dplyr::rename(ID = .data$GO.ID, Found = .data$Significant)  %>%      # make it more clear what Significant refers to here
602
-      dplyr::mutate(GeneRatio = .data$Found / nForeground)
632
+      dplyr::mutate(GeneRatio = .data$Found / nForeground) %>%
633
+      dplyr::left_join(allGO_inForeground.df, by = "ID")
603 634
     
604 635
     
605 636
     result.list[["results"]] = result.tbl
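# Toy illustration of the new gene.ENSEMBL_foreground column added above (the values are made up;
# in practice the table is retrieved from the GRN object after running the enrichment functions):
enr.df = data.frame(ID = c("GO:0006915", "GO:0008150"),
                    gene.ENSEMBL_foreground = c("ENSG00000000001,ENSG00000000002", "ENSG00000000003"))
foreground_genes.l = strsplit(enr.df$gene.ENSEMBL_foreground, ",", fixed = TRUE)
foreground_genes.l[[1]]   # ENSEMBL IDs of the foreground genes annotated to the first GO term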
... ...
@@ -727,8 +758,10 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"),
727 758
   if (!is.null(enrichmentObj)) {
728 759
     
729 760
     result.tbl = enrichmentObj@result %>%
730
-      dplyr::rename(Term = .data$Description, pval = .data$pvalue, Found = .data$Count) %>%
731
-      dplyr::mutate(GeneRatio = sapply(parse (text = enrichmentObj@result$GeneRatio), eval))
761
+      dplyr::rename(Term = .data$Description, pval = .data$pvalue, Found = .data$Count) 
762
+    
763
+      # GeneRatio and BgRatio are reported as fractions like 5/83, leave it like this for now 
764
+      # %>% dplyr::mutate(GeneRatio = sapply(parse (text = enrichmentObj@result$GeneRatio), eval))
732 765
     
733 766
   } else {
734 767
     
... ...
@@ -1153,6 +1186,8 @@ calculateTFEnrichment <- function(GRN, rankType = "degree", n = 3, TF.names = NU
1153 1186
   checkmate::assertChoice(pAdjustMethod, c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"))
1154 1187
   checkmate::assertFlag(forceRerun)
1155 1188
   
1189
+  futile.logger::flog.info(paste0("Calculating TF enrichment. This may take a while"))
1190
+  
1156 1191
   if (rankType == "custom"){
1157 1192
     if(is.null(TF.names)){
1158 1193
       futile.logger::flog.error("To calculate the GO enrichment for a custom set of TFs, you must provide the TF names in the 'TF.names' parameter.")
1159 1194
new file mode 100644
... ...
@@ -0,0 +1,75 @@
1
+[2022-08-24 10:29:01] [INFO]    [OmnipathR] Initialized cache: `/home/christian/.cache/OmnipathR`.
2
+[2022-08-24 10:29:01] [INFO]    [OmnipathR] Welcome to OmnipathR!
3
+[2022-08-24 10:29:05] [INFO]    [OmnipathR] Cache record does not exist: `https://omnipathdb.org/annotations?resources=PROGENy&license=academic`
4
+[2022-08-24 10:29:05] [INFO]    [OmnipathR] Retrieving URL: `https://omnipathdb.org/annotations?resources=PROGENy&license=academic`
5
+[2022-08-24 10:29:05] [TRACE]   [OmnipathR] Attempt 1/3: `https://omnipathdb.org/annotations?resources=PROGENy&license=academic`
6
+[2022-08-24 10:29:09] [INFO]    [OmnipathR] Cache item `3492392585ba05414edfae46801492076437d7e7` version 1: status changed from `unknown` to `started`.
7
+[2022-08-24 10:29:09] [TRACE]   [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/3492392585ba05414edfae46801492076437d7e7-1.rds`.
8
+[2022-08-24 10:29:12] [TRACE]   [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/3492392585ba05414edfae46801492076437d7e7-1.rds`.
9
+[2022-08-24 10:29:12] [INFO]    [OmnipathR] Download ready [key=3492392585ba05414edfae46801492076437d7e7, version=1]
10
+[2022-08-24 10:29:12] [INFO]    [OmnipathR] Cache item `3492392585ba05414edfae46801492076437d7e7` version 1: status changed from `started` to `ready`.
11
+[2022-08-24 10:29:12] [SUCCESS] [OmnipathR] Downloaded 700257 annotation records.
12
+[2022-08-24 10:29:14] [INFO]    [OmnipathR] Loading database `Ensembl organism names`.
13
+[2022-08-24 10:29:14] [INFO]    [OmnipathR] Looking up in cache `https://www.ensembl.org/info/about/species.html`: key=7332486db7400730697234bad76ca0c8e4d00799, no version available.
14
+[2022-08-24 10:29:14] [INFO]    [OmnipathR] Created new version for cache record 7332486db7400730697234bad76ca0c8e4d00799: version 1.
15
+[2022-08-24 10:29:14] [TRACE]   [OmnipathR] Cache file path: /home/christian/.cache/OmnipathR/7332486db7400730697234bad76ca0c8e4d00799-1.html
16
+[2022-08-24 10:29:14] [INFO]    [OmnipathR] Retrieving URL: `https://www.ensembl.org/info/about/species.html`
17
+[2022-08-24 10:29:14] [TRACE]   [OmnipathR] Attempt 1/3: `https://www.ensembl.org/info/about/species.html`
18
+[2022-08-24 10:29:14] [TRACE]   [OmnipathR] HTTP 200
19
+[2022-08-24 10:29:14] [INFO]    [OmnipathR] Download ready [key=7332486db7400730697234bad76ca0c8e4d00799, version=1]
20
+[2022-08-24 10:29:14] [INFO]    [OmnipathR] Cache item `7332486db7400730697234bad76ca0c8e4d00799` version 1: status changed from `unknown` to `ready`.
21
+[2022-08-24 10:29:15] [INFO]    [OmnipathR] Loaded database `Ensembl organism names`.
22
+[2022-08-24 10:29:15] [TRACE]   [OmnipathR] Looking up in cache: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`.
23
+[2022-08-24 10:29:15] [INFO]    [OmnipathR] Cache record does not exist: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`
24
+[2022-08-24 10:29:15] [TRACE]   [OmnipathR] Could not find in cache, initiating download: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`.
25
+[2022-08-24 10:29:15] [INFO]    [OmnipathR] Cache item `2ccc1998f0ab26ea421885e1334896d25db4554f` version 1: status changed from `unknown` to `started`.
26
+[2022-08-24 10:29:15] [TRACE]   [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/2ccc1998f0ab26ea421885e1334896d25db4554f-1.rds`.
27
+[2022-08-24 10:29:15] [INFO]    [OmnipathR] Retrieving URL: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`
28
+[2022-08-24 10:29:15] [TRACE]   [OmnipathR] Attempt 1/3: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`
29
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/2ccc1998f0ab26ea421885e1334896d25db4554f-1.rds`.
30
+[2022-08-24 10:29:28] [INFO]    [OmnipathR] Download ready [key=2ccc1998f0ab26ea421885e1334896d25db4554f, version=1]
31
+[2022-08-24 10:29:28] [INFO]    [OmnipathR] Cache item `2ccc1998f0ab26ea421885e1334896d25db4554f` version 1: status changed from `started` to `ready`.
32
+[2022-08-24 10:29:28] [SUCCESS] [OmnipathR] NCBI HomoloGene (ftp.ncbi.nih.gov): downloaded 275237 records
33
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] ID translation table: from `genesymbol` to `uniprot`, using `uniprot`.
34
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Creating ID mapping table from `genes(PREFERRED)` to `id`, for organism 9606 (only reviewed: TRUE)
35
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Loading all UniProt records for organism 9606 (only reviewed: TRUE); fields: genes(PREFERRED),id
36
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Looking up in cache: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`.
37
+[2022-08-24 10:29:28] [INFO]    [OmnipathR] Cache record does not exist: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`
38
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Could not find in cache, initiating download: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`.
39
+[2022-08-24 10:29:28] [INFO]    [OmnipathR] Cache item `5745c7e161d60f14153a3762e978cf882b30bf01` version 1: status changed from `unknown` to `started`.
40
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/5745c7e161d60f14153a3762e978cf882b30bf01-1.rds`.
41
+[2022-08-24 10:29:28] [INFO]    [OmnipathR] Retrieving URL: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`
42
+[2022-08-24 10:29:28] [TRACE]   [OmnipathR] Attempt 1/3: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`
43
+[2022-08-24 10:29:35] [TRACE]   [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/5745c7e161d60f14153a3762e978cf882b30bf01-1.rds`.
44
+[2022-08-24 10:29:35] [INFO]    [OmnipathR] Download ready [key=5745c7e161d60f14153a3762e978cf882b30bf01, version=1]
45
+[2022-08-24 10:29:35] [INFO]    [OmnipathR] Cache item `5745c7e161d60f14153a3762e978cf882b30bf01` version 1: status changed from `started` to `ready`.
46
+[2022-08-24 10:29:35] [SUCCESS] [OmnipathR] UniProt (legacy.uniprot.org): downloaded 20398 records
47
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] ID translation table: from `genesymbol` to `uniprot`, using `uniprot`.
48
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] Creating ID mapping table from `genes(PREFERRED)` to `id`, for organism 10116 (only reviewed: TRUE)
49
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] Loading all UniProt records for organism 10116 (only reviewed: TRUE); fields: genes(PREFERRED),id
50
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] Looking up in cache: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`.
51
+[2022-08-24 10:29:36] [INFO]    [OmnipathR] Cache record does not exist: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`
52
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] Could not find in cache, initiating download: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`.
53
+[2022-08-24 10:29:36] [INFO]    [OmnipathR] Cache item `bd4863670563624f4aff5469966f0ef06833d425` version 1: status changed from `unknown` to `started`.
54
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/bd4863670563624f4aff5469966f0ef06833d425-1.rds`.
55
+[2022-08-24 10:29:36] [INFO]    [OmnipathR] Retrieving URL: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`
56
+[2022-08-24 10:29:36] [TRACE]   [OmnipathR] Attempt 1/3: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`
57
+[2022-08-24 10:29:47] [TRACE]   [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/bd4863670563624f4aff5469966f0ef06833d425-1.rds`.
58
+[2022-08-24 10:29:47] [INFO]    [OmnipathR] Download ready [key=bd4863670563624f4aff5469966f0ef06833d425, version=1]
59
+[2022-08-24 10:29:47] [INFO]    [OmnipathR] Cache item `bd4863670563624f4aff5469966f0ef06833d425` version 1: status changed from `started` to `ready`.
60
+[2022-08-24 10:29:47] [SUCCESS] [OmnipathR] UniProt (legacy.uniprot.org): downloaded 8160 records
61
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] ID translation table: from `uniprot` to `genesymbol`, using `uniprot`.
62
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] Creating ID mapping table from `id` to `genes(PREFERRED)`, for organism 10116 (only reviewed: TRUE)
63
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] Loading all UniProt records for organism 10116 (only reviewed: TRUE); fields: id,genes(PREFERRED)
64
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] Looking up in cache: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`.
65
+[2022-08-24 10:30:10] [INFO]    [OmnipathR] Cache record does not exist: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`
66
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] Could not find in cache, initiating download: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`.
67
+[2022-08-24 10:30:10] [INFO]    [OmnipathR] Cache item `13e562cc918b5226343ec314c33bee5c6e514e1f` version 1: status changed from `unknown` to `started`.
68
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/13e562cc918b5226343ec314c33bee5c6e514e1f-1.rds`.
69
+[2022-08-24 10:30:10] [INFO]    [OmnipathR] Retrieving URL: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`
70
+[2022-08-24 10:30:10] [TRACE]   [OmnipathR] Attempt 1/3: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`
71
+[2022-08-24 10:30:12] [TRACE]   [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/13e562cc918b5226343ec314c33bee5c6e514e1f-1.rds`.
72
+[2022-08-24 10:30:12] [INFO]    [OmnipathR] Download ready [key=13e562cc918b5226343ec314c33bee5c6e514e1f, version=1]
73
+[2022-08-24 10:30:12] [INFO]    [OmnipathR] Cache item `13e562cc918b5226343ec314c33bee5c6e514e1f` version 1: status changed from `started` to `ready`.
74
+[2022-08-24 10:30:12] [SUCCESS] [OmnipathR] UniProt (legacy.uniprot.org): downloaded 8160 records
75
+[2022-08-24 13:30:12] [TRACE]   [OmnipathR] Removing database `Ensembl organism names` (not used in the past 10800 seconds)
... ...
@@ -4,6 +4,7 @@ utils::globalVariables("where")
4 4
 
5 5
 ############### PCA ###############
6 6
 
7
+# TODO: Currently, the standard PCA workflow (vst transform based on raw counts) is not possible anymore; the normalized counts are taken as they are
7 8
 
8 9
 #' Produce a PCA plot of the data from a \code{\linkS4class{GRN}} object
9 10
 #'
... ...
@@ -11,7 +12,8 @@ utils::globalVariables("where")
11 12
 #' @template outputFolder
12 13
 #' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Default \code{c("rna", "peaks")}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA is based on the normalized counts stored in the object (which equal the provided counts if no normalization was applied).
13 14
 #' @param topn Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default).
14
-#' @param type Character. One of or a combination of \code{"raw"}, \code{"normalized"}, \code{"all"}. Default \code{c("raw", "normalized")}. Should the PCA plots be done based on the raw or normalized data, respectively?
15
+#' @param type Character. Must be \code{"normalized"}. Data type on which the PCA plots are based. Support for \code{"raw"} has been removed; currently, only \code{"normalized"} is supported.
16
+#' @param removeFiltered Logical. \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Should features that have been marked as filtered by \code{filterData} be removed? See also the usage sketch after the examples below.
15 17
 #' @template forceRerun
16 18
 #' @template basenameOutput
17 19
 #' @template plotAsPDF
... ...
@@ -22,10 +24,10 @@ utils::globalVariables("where")
22 24
 #' @examples 
23 25
 #' # See the Workflow vignette on the GRaNIE website for examples
24 26
 #' GRN = loadExampleObject()
25
-#' GRN = plotPCA_all(GRN, topn = 500, data = "rna", type = "raw", plotAsPDF = FALSE, pages = 1)
27
+#' GRN = plotPCA_all(GRN, topn = 500, data = "rna", type = "normalized", plotAsPDF = FALSE, pages = 1)
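A further hedged usage sketch, assuming the example object loaded as above and the updated signature introduced here: the same call applied to the peak counts while retaining features flagged by filterData.

# Sketch only: PCA on normalized peak counts, keeping filtered features
GRN = loadExampleObject()
GRN = plotPCA_all(GRN, topn = 500, data = "peaks", type = "normalized",
                  removeFiltered = FALSE, plotAsPDF = FALSE, pages = 1)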
26 28
 #' @export
27 29
 plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, 
28
-                        data = c("rna", "peaks"), topn = c(500,1000,5000), type = c("raw", "normalized"), 
30
+                        data = c("rna", "peaks"), topn = c(500,1000,5000), type = "normalized", removeFiltered = TRUE,
29 31
                         plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, pages = NULL,
30 32
                         forceRerun = FALSE) {
31 33
   
... ...
@@ -36,7 +38,9 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL,
36 38
   GRN = .makeObjectCompatible(GRN)
37 39
   
38 40
   checkmate::assertSubset(data , c("rna", "peaks", "all"), empty.ok = FALSE)
39
-  checkmate::assertSubset(type , c("raw", "normalized", "all"), empty.ok = FALSE)
41
+  checkmate::assertChoice(type , c("normalized"))
42
+  #checkmate::assertSubset(type , c("raw", "normalized", "all"), empty.ok = FALSE)
43
+  checkmate::assertFlag(removeFiltered)
40 44
   checkmate::assertFlag(plotAsPDF)
41 45
   checkmate::assertNumeric(pdf_width, lower = 5, upper = 99)
42 46
   checkmate::assertNumeric(pdf_height, lower = 5, upper = 99)
... ...
@@ -56,106 +60,53 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL,
56 60
   # Initialize the page counter
57 61
   pageCounter = 1 
58 62
   
63
+  # For normalized data, we currently do not apply any further transformation and plot the normalized counts as is
64
+  # Changed on 08.02.22: We now never do a(nother) log transform, as the input for the density plot is already transformed counts. 
65
+  transformationCur = "none"
66
+  logTransformDensity = FALSE
67
+  
59 68
   if ("rna" %in% data | "all" %in% data) {
60 69
     
61
-    # Check if we have raw integer counts. Ust vst transform if, otherwise "regular log2
62
-    transformation = dplyr::if_else(checkmate::testClass(GRN@data$RNA$counts_orig, "DESeqDataSet"), "vst", "log2")
63
-    
64
-    normVector = c("raw", "normalized")
65
-    # If data is pre-normalized, dont do any transformation, as "raw" and "normalized" are supposed to be identical
66
-    if (GRN@config$parameters$normalization_rna == "none") {
67
-      normVector = c("normalized")
68
-    }
69
-    
70
-    for (norm in normVector) {
71
-      
72
-      if (!(norm %in% type | "all" %in% type)) {
73
-        next
74
-      }
75
-      
76
-      # Only do a transformation if this is raw counts, already normalized counts dont need another transformation
77
-      transformationCur = dplyr::if_else(norm == "raw", transformation, "none")
78
-      
79
-      # Changed on 08.02.22: We now never do a(nother) log transform, as the input for the density plot is already transformed counts. 
80
-      # logTransformDensity = dplyr::if_else(norm == "raw", TRUE, FALSE)
81
-      logTransformDensity = FALSE
82
-      
83
-      # Get filtered rows from normalized version always, this is correct, as isFiltered is missing in raw counts
84
-      nonFilteredRows = which(! GRN@data$RNA$counts_norm.l[["0"]]$isFiltered)
85
-      
86
-      if (norm == "raw") {
87
-        
88
-        matrixCur = GRN@data$RNA$counts_orig[nonFilteredRows, uniqueSamples]
89
-        
90
-      } else {
91
-        
92
-        matrixCur = GRN@data$RNA$counts_norm.l[["0"]][nonFilteredRows, uniqueSamples] %>% as.matrix()
93
-        rownames(matrixCur) = GRN@data$RNA$counts_norm.l[["0"]]$ENSEMBL[nonFilteredRows]
94
-      }
95
-      
96
-      fileCur = paste0(outputFolder, 
97
-                       dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_pca"), basenameOutput),
98
-                       "_RNA.", norm, ".pdf")
99
-      if (!file.exists(fileCur) | forceRerun) {
70
+    matrixCur = getCounts(GRN, type = "rna", asMatrix = TRUE, includeFiltered = !removeFiltered)
71
+
72
+    fileCur = paste0(outputFolder, 
73
+                     dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_pca"), basenameOutput),
74
+                     "_RNA.", type, ".pdf")
75
+    if (!file.exists(fileCur) | forceRerun) {
100 76
         
101
-        futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", norm, 
77
+        futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", type, 
102 78
                                         " RNA data for all shared samples to file ", fileCur , 
103 79
                                         "... This may take a few minutes"))
104 80
         
105 81
         .plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, plotAsPDF = plotAsPDF)
106 82
         
107
-      } else {
83
+    } else {
108 84
         futile.logger::flog.info(paste0("File ", fileCur, " already exists, not overwriting due to forceRerun = FALSE"))
109 85
         
110
-      }
111 86
     }
112 87
     
88
+    
113 89
   } # end if plot RNA PCA
114 90
   
115 91
   if ("peaks" %in% data | "all" %in% data) {
116 92
     
117
-    # Check if we have raw integer counts
118
-    transformation = dplyr::if_else(checkmate::testClass(GRN@data$peaks$counts_orig, "DESeqDataSet"), "vst", "log2")
93
+    matrixCur = getCounts(GRN, type = "peaks", asMatrix = TRUE, includeFiltered = !removeFiltered)
119 94
     
120
-    normVector = c("raw", "normalized")
121
-    # If data is pre-normalized, dont do any transformation, as "raw" and "normalized" are supposed to be identical
122
-    if (GRN@config$parameters$normalization_peaks == "none") {
123
-      normVector = c("normalized")
124
-    }
95
+    fileCur = paste0(outputFolder, dplyr::if_else(is.null(basenameOutput), 
96
+                                                  .getOutputFileName("plot_pca"), basenameOutput), "_peaks.", type, ".pdf")
125 97
     
126
-    for (norm in normVector) {
127
-      
128
-      transformationCur = dplyr::if_else(norm == "raw", transformation, "none")
129
-      logTransformDensity = dplyr::if_else(norm == "raw", TRUE, FALSE)
130
-      
131
-      nonFilteredRows = which(! GRN@data$peaks$counts_norm$isFiltered)
132
-      
133
-      if (norm == "raw") {
134
-        
135
-        matrixCur = GRN@data$peaks$counts_orig[nonFilteredRows, uniqueSamples]
98
+    if (!file.exists(fileCur) | forceRerun) {
136 99
         
137
-      } else {
138
-        
139
-        matrixCur = GRN@data$peaks$counts_norm[nonFilteredRows, uniqueSamples] %>% as.matrix()
140
-        rownames(matrixCur) = GRN@data$peaks$counts_norm$peakID[nonFilteredRows]
141
-      }
142
-      
143
-      fileCur = paste0(outputFolder, dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_pca"), basenameOutput), "_peaks.", norm, ".pdf")
144
-      
145
-      if (!file.exists(fileCur) | forceRerun) {
146
-        
147
-        futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", norm, 
100
+        futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", type, 
148 101
                                         " peaks data for all shared samples to file ", fileCur , 
149 102
                                         "... This may take a few minutes"))
150 103
         
151 104
         .plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, plotAsPDF = plotAsPDF)
152 105
         
153
-      } else {
106
+    } else {
154 107
         futile.logger::flog.info(paste0("File ", fileCur, " already exists, not overwriting due to forceRerun = FALSE"))
155 108
         
156
-      }
157
-      
158
-    } # end for raw and norm
109
+    }
159 110
     
160 111
   }
161 112
   
... ...
@@ -934,7 +885,7 @@ plotDiagnosticPlots_peakGene <- function(GRN,
934 885
   
935 886
   
936 887
   # For compatibility reasons, re-create the genes and peaks annotation if not present in the object
937
-  if (! "peak.annotation" %in% colnames(GRN@annotation$consensusPeaks)) {
888
+  if (! "peak.annotation" %in% colnames(GRN@annotation$peaks)) {
938 889
     GRN = .populatePeakAnnotation(GRN)
939 890
   }
940 891
   if (! "gene.CV" %in% colnames(GRN@annotation$genes)) {
... ...
@@ -1038,7 +989,7 @@ plotDiagnosticPlots_peakGene <- function(GRN,
1038