Browse code

Several small bugfixes and vignette adjustments related to path-adjustments

Christian Arnold authored on 18/03/2022 18:45:44
Showing24 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: GRaNIE
2 2
 Title: GRaNIE: Reconstruction cell type specific gene regulatory networks including enhancers using chromatin accessibility and RNA-seq data
3
-Version: 0.99.0
3
+Version: 0.99.1
4 4
 Encoding: UTF-8
5 5
 Authors@R: c(person("Christian", "Arnold", email =
6 6
         "chrarnold@web.de", role = c("cre","aut")),
... ...
@@ -11,6 +11,7 @@ export(calculateCommunitiesEnrichment)
11 11
 export(calculateCommunitiesStats)
12 12
 export(calculateGeneralEnrichment)
13 13
 export(calculateTFEnrichment)
14
+export(changeOutputDirectory)
14 15
 export(deleteIntermediateData)
15 16
 export(filterData)
16 17
 export(filterGRNAndConnectGenes)
... ...
@@ -16,10 +16,10 @@
16 16
 initializeGRN <- function(objectMetadata = list(),
17 17
                           outputFolder, 
18 18
                           genomeAssembly) {
19
-
19
+  
20 20
   checkmate::assert(checkmate::checkNull(objectMetadata), checkmate::checkList(objectMetadata))
21 21
   checkmate::assertSubset(genomeAssembly, c("hg19","hg38", "mm10"))
22
-
22
+  
23 23
   # Create the folder first if not yet existing
24 24
   if (!dir.exists(outputFolder)) {
25 25
     dir.create(outputFolder)
... ...
@@ -44,7 +44,7 @@ initializeGRN <- function(objectMetadata = list(),
44 44
   GRN = .addFunctionLogToObject(GRN)
45 45
   
46 46
   GRN@config$isFiltered = FALSE
47
-
47
+  
48 48
   par.l = list()
49 49
   
50 50
   packageName = utils::packageName()
... ...
@@ -80,7 +80,7 @@ initializeGRN <- function(objectMetadata = list(),
80 80
   GRN@config$directories$outputRoot         = outputFolder
81 81
   GRN@config$directories$output_plots       = dir_output_plots 
82 82
   GRN@config$files$output_log               = paste0(outputFolder, "GRN.log")
83
-
83
+  
84 84
   .testExistanceAndCreateDirectoriesRecursively(c(outputFolder, dir_output_plots))
85 85
   
86 86
   checkmate::assertDirectory(outputFolder, access = "w")
... ...
@@ -125,7 +125,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
125 125
                     counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
126 126
                     allowOverlappingPeaks= FALSE,
127 127
                     forceRerun = FALSE) {
128
-
128
+  
129 129
   GRN = .addFunctionLogToObject(GRN)      
130 130
   
131 131
   checkmate::assertClass(GRN, "GRN")
... ...
@@ -142,7 +142,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
142 142
       forceRerun) {
143 143
     
144 144
     # Store raw peaks counts as DESeq object
145
-
145
+    
146 146
     #GRN@data$peaks$counts_raw = counts_peaks %>% dplyr::mutate(isFiltered = FALSE)
147 147
     
148 148
     # Normalize ID column names
... ...
@@ -152,31 +152,31 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
152 152
     if (idColumn_RNA != "ENSEMBL") {
153 153
       counts_rna = dplyr::rename(counts_rna, ENSEMBL = !!(idColumn_RNA))
154 154
     }
155
-      
155
+    
156 156
     # Check ID columns for missing values and remove
157 157
     rna_missing_ID =  which(is.na(counts_rna$ENSEMBL))
158 158
     if (length(rna_missing_ID) > 0) {
159
-        message = paste0(" Found ", length(rna_missing_ID), " missing IDs in the ID column of the RNA counts. These rows will be removed.")
160
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
161
-        counts_rna = dplyr::slice(counts_rna, -rna_missing_ID)
159
+      message = paste0(" Found ", length(rna_missing_ID), " missing IDs in the ID column of the RNA counts. These rows will be removed.")
160
+      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
161
+      counts_rna = dplyr::slice(counts_rna, -rna_missing_ID)
162 162
     }
163 163
     
164 164
     peaks_missing_ID =  which(is.na(counts_peaks$peakID))
165 165
     if (length(peaks_missing_ID) > 0) {
166
-        message = paste0(" Found ", length(peaks_missing_ID), " missing IDs in the ID column of the peaks counts. These rows will be removed.")
167
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
168
-        counts_peaks = dplyr::slice(counts_peaks, -peaks_missing_ID)
166
+      message = paste0(" Found ", length(peaks_missing_ID), " missing IDs in the ID column of the peaks counts. These rows will be removed.")
167
+      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
168
+      counts_peaks = dplyr::slice(counts_peaks, -peaks_missing_ID)
169 169
     }
170 170
     
171
-      
171
+    
172 172
     # Remove potential scientific notation from peak IDs
173 173
     peaks_eNotation = which(grepl("e+", counts_peaks$peakID))
174 174
     if (length(peaks_eNotation) > 0) {
175
-        message = paste0("Found at least one peak (", paste0(counts_peaks$peakID[peaks_eNotation], collapse = ",") , ") for which the position contains the scientific notation, attempting to fix.")
176
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
177
-        counts_peaks$peakID[peaks_eNotation] = .removeScientificNotation_positions(counts_peaks$peakID[peaks_eNotation])
178
-   
179
-        
175
+      message = paste0("Found at least one peak (", paste0(counts_peaks$peakID[peaks_eNotation], collapse = ",") , ") for which the position contains the scientific notation, attempting to fix.")
176
+      .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
177
+      counts_peaks$peakID[peaks_eNotation] = .removeScientificNotation_positions(counts_peaks$peakID[peaks_eNotation])
178
+      
179
+      
180 180
     }
181 181
     
182 182
     # Clean Ensembl IDs
... ...
@@ -189,10 +189,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
189 189
       counts_rna = counts_rna %>%
190 190
         dplyr::group_by(ENSEMBL) %>%
191 191
         dplyr::summarise_if(is.numeric, sum) 
192
-        # dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5
192
+      # dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5
193 193
     }
194 194
     
195
-
195
+    
196 196
     # Normalize counts
197 197
     countsPeaks.norm.df  = .normalizeCounts(counts_peaks, method = normalization_peaks, idColumn = "peakID")
198 198
     countsRNA.norm.df  = .normalizeCounts(counts_rna, method = normalization_rna, idColumn = "ENSEMBL")
... ...
@@ -219,20 +219,20 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
219 219
     # Create permutations for RNA
220 220
     futile.logger::flog.info(paste0( "Produce ", .getMaxPermutation(GRN), " permutations of RNA-counts"))
221 221
     GRN@data$RNA$counts_norm.l = .shuffleColumns(countsRNA.norm.df, .getMaxPermutation(GRN), returnUnshuffled = TRUE, returnAsList = TRUE)
222
-
222
+    
223 223
     if (!is.null(sampleMetadata)) {
224
-        
224
+      
225 225
       futile.logger::flog.info("Parsing provided metadata...")
226 226
       GRN@data$metadata = sampleMetadata %>% dplyr::distinct() %>% tibble::set_tidy_names(syntactic = TRUE, quiet = TRUE)
227 227
       
228 228
       # Force the first column to be the ID column
229 229
       if ("sampleID" %in% colnames(GRN@data$metadata)) {
230
-          futile.logger::flog.warn("Renaming the first column to sampleID. However, this column already exists, it will be renamed accordingly.")
231
-          colnames(GRN@data$metadata)[which(colnames(GRN@data$metadata) == "sampleID")] = "sampleID_original"
232
-          
230
+        futile.logger::flog.warn("Renaming the first column to sampleID. However, this column already exists, it will be renamed accordingly.")
231
+        colnames(GRN@data$metadata)[which(colnames(GRN@data$metadata) == "sampleID")] = "sampleID_original"
232
+        
233 233
       } 
234 234
       colnames(GRN@data$metadata)[1] = "sampleID"
235
-
235
+      
236 236
       # Assume the ID is in column 1, has to be unique
237 237
       if (nrow(GRN@data$metadata) > length(unique(GRN@data$metadata$sampleID))) {
238 238
         message = paste0("The first column in the sample metadata table must contain only unique values, as it is used as sample ID. Make sure the values are unique.")
... ...
@@ -252,8 +252,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
252 252
       dplyr::mutate(has_RNA = sampleID  %in% samples_rna,
253 253
                     has_peaks = sampleID %in% samples_peaks,
254 254
                     has_both = has_RNA & has_peaks
255
-                    )
256
-
255
+      )
256
+    
257 257
     GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, has_both) %>% dplyr::pull(sampleID) %>% as.character()
258 258
     
259 259
     counts_peaks = as.data.frame(counts_peaks)
... ...
@@ -264,11 +264,11 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
264 264
     
265 265
     
266 266
     # Store the raw peaks data efficiently as DESeq object only if it contains only integers, otherwise store as normal matrix
267
-
267
+    
268 268
     if (isIntegerMatrix(counts_peaks[, GRN@config$sharedSamples])) {
269 269
       GRN@data$peaks$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_peaks[, GRN@config$sharedSamples], 
270
-                                                           colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
271
-                                                           design = ~1)
270
+                                                                  colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
271
+                                                                  design = ~1)
272 272
     } else {
273 273
       
274 274
       GRN@data$peaks$counts_orig = as.matrix(counts_peaks[, GRN@config$sharedSamples])
... ...
@@ -276,14 +276,14 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
276 276
     
277 277
     if (isIntegerMatrix(counts_rna[, GRN@config$sharedSamples])) {
278 278
       GRN@data$RNA$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_rna[, GRN@config$sharedSamples], 
279
-                                                          colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
280
-                                                          design = ~1)   
279
+                                                                colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
280
+                                                                design = ~1)   
281 281
     } else {
282
-        GRN@data$RNA$counts_orig = as.matrix(counts_rna[, GRN@config$sharedSamples])
282
+      GRN@data$RNA$counts_orig = as.matrix(counts_rna[, GRN@config$sharedSamples])
283 283
     }
284 284
     
285
-
286
-  
285
+    
286
+    
287 287
     # Consensus peaks
288 288
     GRN@data$peaks$consensusPeaks = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)) 
289 289
     GRN@data$peaks$consensusPeaks = GRN@data$peaks$consensusPeaks[match(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID, GRN@data$peaks$consensusPeaks$peakID), ]
... ...
@@ -294,29 +294,29 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
294 294
     # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details
295 295
     consensus.gr   = .constructGRanges(GRN@data$peaks$consensusPeaks, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE)
296 296
     
297
-
297
+    
298 298
     
299 299
     overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1)
300 300
     
301 301
     if (length(overlappingPeaks) > 0){
302
+      
303
+      ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID
304
+      
305
+      messageAll = paste0(" ", length(overlappingPeaks), 
306
+                          " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","),
307
+                          ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. "
308
+      )
309
+      
310
+      
311
+      if (allowOverlappingPeaks) {
302 312
         
303
-        ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID
304
-        
305
-        messageAll = paste0(" ", length(overlappingPeaks), 
306
-                            " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","),
307
-                            ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. "
308
-        )
309
-        
310
-        
311
-        if (allowOverlappingPeaks) {
312
-            
313
-            message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.")
314
-            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
315
-        } else {
316
-            message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function")
317
-            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
318
-        }
319
-        
313
+        message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.")
314
+        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
315
+      } else {
316
+        message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function")
317
+        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
318
+      }
319
+      
320 320
     }
321 321
     
322 322
   }
... ...
@@ -326,7 +326,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
326 326
   
327 327
   # Add gene annotation once
328 328
   GRN = .populateGeneAnnotation(GRN)
329
-
329
+  
330 330
   GRN
331 331
   
332 332
 }
... ...
@@ -358,14 +358,14 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
358 358
 }
359 359
 
360 360
 .removeScientificNotation_positions <- function(peakIDs.vec) {
361
-    ids = strsplit(peakIDs.vec, split = ":", fixed = TRUE)
362
-    ids_chr = sapply(ids, "[[", 1)
363
-    ids_pos = sapply(ids, "[[", 2)
364
-    ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE)
365
-    start = sapply(ids_pos, "[[", 1)
366
-    end   = sapply(ids_pos, "[[", 2)
367
-    
368
-    paste0(ids_chr, ":", format(as.integer(start), scientific = FALSE), "-", format(as.integer(end), scientific= FALSE))
361
+  ids = strsplit(peakIDs.vec, split = ":", fixed = TRUE)
362
+  ids_chr = sapply(ids, "[[", 1)
363
+  ids_pos = sapply(ids, "[[", 2)
364
+  ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE)
365
+  start = sapply(ids_pos, "[[", 1)
366
+  end   = sapply(ids_pos, "[[", 2)
367
+  
368
+  paste0(ids_chr, ":", format(as.integer(start), scientific = FALSE), "-", format(as.integer(end), scientific= FALSE))
369 369
 }
370 370
 
371 371
 
... ...
@@ -412,90 +412,90 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
412 412
 }
413 413
 
414 414
 .populatePeakAnnotation <- function (GRN) {
415
+  
416
+  countsPeaks.clean = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
417
+  
418
+  futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)"))
419
+  
420
+  countsPeaks.m = as.matrix(dplyr::select(countsPeaks.clean, -peakID))
421
+  
422
+  rowMeans_peaks   = rowMeans(countsPeaks.m)
423
+  rowMedians_peaks = matrixStats::rowMedians(countsPeaks.m)
424
+  CV_peaks = matrixStats::rowSds(countsPeaks.m) /  rowMeans_peaks
425
+  
426
+  metadata_peaks = tibble::tibble(peak.ID = countsPeaks.clean$peakID, 
427
+                                  peak.mean = rowMeans_peaks, 
428
+                                  peak.median = rowMedians_peaks, 
429
+                                  peak.CV = CV_peaks)
430
+  
431
+  GRN@annotation$consensusPeaks = metadata_peaks
432
+  
433
+  if (!is.installed("ChIPseeker")) {
434
+    message = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", 
435
+                     " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
436
+    .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
437
+  } else {
415 438
     
416
-    countsPeaks.clean = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
417
-    
418
-    futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)"))
419
-    
420
-    countsPeaks.m = as.matrix(dplyr::select(countsPeaks.clean, -peakID))
421
-    
422
-    rowMeans_peaks   = rowMeans(countsPeaks.m)
423
-    rowMedians_peaks = matrixStats::rowMedians(countsPeaks.m)
424
-    CV_peaks = matrixStats::rowSds(countsPeaks.m) /  rowMeans_peaks
425
-    
426
-    metadata_peaks = tibble::tibble(peak.ID = countsPeaks.clean$peakID, 
427
-                                    peak.mean = rowMeans_peaks, 
428
-                                    peak.median = rowMedians_peaks, 
429
-                                    peak.CV = CV_peaks)
430
-    
431
-    GRN@annotation$consensusPeaks = metadata_peaks
432
-    
433
-    if (!is.installed("ChIPseeker")) {
434
-        message = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", 
435
-                         " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
436
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
437
-    } else {
438
-        
439
-        futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))
440
-        genomeAssembly = GRN@config$parameters$genomeAssembly
441
-        consensusPeaks     = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
442
-        consensusPeaks.gr  = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
443
-        
444
-        # Add ChIPSeeker anotation
445
-        peaks.annotated = suppressMessages(ChIPseeker::annotatePeak(
446
-            consensusPeaks.gr,
447
-            tssRegion = c(-5000, 5000), # extended from 3kb to 5
448
-            TxDb = .getGenomeObject(genomeAssembly, type = "txbd"),
449
-            level = "gene", 
450
-            assignGenomicAnnotation = TRUE,  # the default
451
-            genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
452
-                                          "Downstream", "Intergenic"),  # the default
453
-            annoDb = .getGenomeObject(genomeAssembly, type = "packageName"), # optional, if provided, extra columns including SYMBOL, GENENAME, ENSEMBL/ENTREZID will be added
454
-            sameStrand = FALSE, # the default
455
-            ignoreOverlap = FALSE, # the default
456
-            ignoreUpstream = FALSE, # the default
457
-            ignoreDownstream = FALSE, # the default
458
-            overlap = "TSS", # the default
459
-            verbose = TRUE # the default
460
-        ))
461
-        
462
-        GRN@annotation$consensusPeaks_obj = peaks.annotated
463
-        
464
-        peaks.annotated.df = as.data.frame(peaks.annotated)
465
-        peaks.annotated.df$annotation[grepl("Exon", peaks.annotated.df$annotation)] = "Exon"
466
-        peaks.annotated.df$annotation[grepl("Intron", peaks.annotated.df$annotation)] = "Intron"
467
-        
468
-        GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, 
469
-                                                         peaks.annotated.df  %>% 
470
-                                                             dplyr::select(peakID, annotation, tidyselect::starts_with("gene"), -geneId, distanceToTSS, ENSEMBL, SYMBOL, GENENAME) %>%
471
-                                                             dplyr::mutate(annotation  = as.factor(annotation), 
472
-                                                                           ENSEMBL = as.factor(ENSEMBL), 
473
-                                                                           GENENAME = as.factor(GENENAME),
474
-                                                                           SYMBOL = as.factor(SYMBOL)),
475
-                                                         by = c("peak.ID" = "peakID")) %>%
476
-            dplyr::rename(peak.gene.chr = geneChr,
477
-                          peak.gene.start = geneStart, 
478
-                          peak.gene.end = geneEnd, 
479
-                          peak.gene.length = geneLength, 
480
-                          peak.gene.strand = geneStrand, 
481
-                          peak.gene.name = GENENAME,
482
-                          peak.gene.distanceToTSS = distanceToTSS,
483
-                          peak.gene.ENSEMBL = ENSEMBL,
484
-                          peak.gene.symbol = SYMBOL,
485
-                          peak.annotation = annotation
486
-            )
487
-        
488
-        
489
-    }
490
-    
491
-    
492
-    
493
-    
494
-    # Also add GC content as annotation columns
495
-    GRN = .calcGCContentPeaks(GRN)
439
+    futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))
440
+    genomeAssembly = GRN@config$parameters$genomeAssembly
441
+    consensusPeaks     = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
442
+    consensusPeaks.gr  = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
443
+    
444
+    # Add ChIPSeeker anotation
445
+    peaks.annotated = suppressMessages(ChIPseeker::annotatePeak(
446
+      consensusPeaks.gr,
447
+      tssRegion = c(-5000, 5000), # extended from 3kb to 5
448
+      TxDb = .getGenomeObject(genomeAssembly, type = "txbd"),
449
+      level = "gene", 
450
+      assignGenomicAnnotation = TRUE,  # the default
451
+      genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
452
+                                    "Downstream", "Intergenic"),  # the default
453
+      annoDb = .getGenomeObject(genomeAssembly, type = "packageName"), # optional, if provided, extra columns including SYMBOL, GENENAME, ENSEMBL/ENTREZID will be added
454
+      sameStrand = FALSE, # the default
455
+      ignoreOverlap = FALSE, # the default
456
+      ignoreUpstream = FALSE, # the default
457
+      ignoreDownstream = FALSE, # the default
458
+      overlap = "TSS", # the default
459
+      verbose = TRUE # the default
460
+    ))
461
+    
462
+    GRN@annotation$consensusPeaks_obj = peaks.annotated
463
+    
464
+    peaks.annotated.df = as.data.frame(peaks.annotated)
465
+    peaks.annotated.df$annotation[grepl("Exon", peaks.annotated.df$annotation)] = "Exon"
466
+    peaks.annotated.df$annotation[grepl("Intron", peaks.annotated.df$annotation)] = "Intron"
467
+    
468
+    GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, 
469
+                                                     peaks.annotated.df  %>% 
470
+                                                       dplyr::select(peakID, annotation, tidyselect::starts_with("gene"), -geneId, distanceToTSS, ENSEMBL, SYMBOL, GENENAME) %>%
471
+                                                       dplyr::mutate(annotation  = as.factor(annotation), 
472
+                                                                     ENSEMBL = as.factor(ENSEMBL), 
473
+                                                                     GENENAME = as.factor(GENENAME),
474
+                                                                     SYMBOL = as.factor(SYMBOL)),
475
+                                                     by = c("peak.ID" = "peakID")) %>%
476
+      dplyr::rename(peak.gene.chr = geneChr,
477
+                    peak.gene.start = geneStart, 
478
+                    peak.gene.end = geneEnd, 
479
+                    peak.gene.length = geneLength, 
480
+                    peak.gene.strand = geneStrand, 
481
+                    peak.gene.name = GENENAME,
482
+                    peak.gene.distanceToTSS = distanceToTSS,
483
+                    peak.gene.ENSEMBL = ENSEMBL,
484
+                    peak.gene.symbol = SYMBOL,
485
+                    peak.annotation = annotation
486
+      )
496 487
     
497
-    GRN
498 488
     
489
+  }
490
+  
491
+  
492
+  
493
+  
494
+  # Also add GC content as annotation columns
495
+  GRN = .calcGCContentPeaks(GRN)
496
+  
497
+  GRN
498
+  
499 499
 }
500 500
 
501 501
 .populateGeneAnnotation <- function (GRN) {
... ...
@@ -536,52 +536,52 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
536 536
 #' @importFrom IRanges width
537 537
 #' @importFrom rlang .data `:=`
538 538
 .calcGCContentPeaks <- function(GRN) {
539
-    
540
-    futile.logger::flog.info(paste0("Calculate GC-content for peaks... "))
541
-    start = Sys.time()
542
-    genomeAssembly = GRN@config$parameters$genomeAssembly
543
-    #TODO: GC content for all peaks
544
-    genome = .getGenomeObject(genomeAssembly, type = "BSgenome")
545
-    
546
-    # Get peaks as GRanges object
547
-    query   = .constructGRanges(GRN@data$peaks$consensusPeaks, 
548
-                                seqlengths = .getChrLengths(genomeAssembly), 
549
-                                genomeAssembly)
550
-    
551
-    # Get DNAStringSet object
552
-    seqs_peaks = Biostrings::getSeq(genome, query)
553
-    
554
-    GC_content.df = Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")
555
-    
556
-    GC_content.df = GC_content.df %>%
557
-        tibble::as_tibble() %>%
558
-        dplyr::mutate(length = IRanges::width(query),
559
-                      peak.ID = query$peakID,
560
-                      GC_class = cut(`G|C`, breaks = seq(0,1,0.1), include.lowest = TRUE, ordered_result = TRUE))
561
-    
562
-    GC_classes.df = GC_content.df %>%
563
-        dplyr::group_by(GC_class) %>%
564
-        dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(length), peak_width_sd = sd(length)) %>%
565
-        dplyr::ungroup() %>% 
566
-        tidyr::complete(GC_class, fill = list(n = 0)) %>%
567
-        dplyr::mutate(n_rel = .data$n / length(query))
568
-    
569
-    # TODO: Put where
570
-    #ggplot(GC_content.df, aes(GC.class)) + geom_histogram(stat = "count") + theme_bw()
571
-    
572
-    #ggplot(GC_classes.df , aes(GC.class, n_rel)) + geom_bar(stat = "identity") + theme_bw()
573
-    
574
-    GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, GC_content.df, by = "peak.ID") %>%
575
-        dplyr::rename( peak.GC.perc    = `G|C`,
576
-                       peak.width      = length,
577
-                       peak.GC.class   = GC_class)
578
-    
579
-    GRN@stats$peaks = list()
580
-    GRN@stats$peaks$GC = GC_classes.df
581
-    
582
-    .printExecutionTime(start)
583
-    
584
-    GRN
539
+  
540
+  futile.logger::flog.info(paste0("Calculate GC-content for peaks... "))
541
+  start = Sys.time()
542
+  genomeAssembly = GRN@config$parameters$genomeAssembly
543
+  #TODO: GC content for all peaks
544
+  genome = .getGenomeObject(genomeAssembly, type = "BSgenome")
545
+  
546
+  # Get peaks as GRanges object
547
+  query   = .constructGRanges(GRN@data$peaks$consensusPeaks, 
548
+                              seqlengths = .getChrLengths(genomeAssembly), 
549
+                              genomeAssembly)
550
+  
551
+  # Get DNAStringSet object
552
+  seqs_peaks = Biostrings::getSeq(genome, query)
553
+  
554
+  GC_content.df = Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")
555
+  
556
+  GC_content.df = GC_content.df %>%
557
+    tibble::as_tibble() %>%
558
+    dplyr::mutate(length = IRanges::width(query),
559
+                  peak.ID = query$peakID,
560
+                  GC_class = cut(`G|C`, breaks = seq(0,1,0.1), include.lowest = TRUE, ordered_result = TRUE))
561
+  
562
+  GC_classes.df = GC_content.df %>%
563
+    dplyr::group_by(GC_class) %>%
564
+    dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(length), peak_width_sd = sd(length)) %>%
565
+    dplyr::ungroup() %>% 
566
+    tidyr::complete(GC_class, fill = list(n = 0)) %>%
567
+    dplyr::mutate(n_rel = .data$n / length(query))
568
+  
569
+  # TODO: Put where
570
+  #ggplot(GC_content.df, aes(GC.class)) + geom_histogram(stat = "count") + theme_bw()
571
+  
572
+  #ggplot(GC_classes.df , aes(GC.class, n_rel)) + geom_bar(stat = "identity") + theme_bw()
573
+  
574
+  GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, GC_content.df, by = "peak.ID") %>%
575
+    dplyr::rename( peak.GC.perc    = `G|C`,
576
+                   peak.width      = length,
577
+                   peak.GC.class   = GC_class)
578
+  
579
+  GRN@stats$peaks = list()
580
+  GRN@stats$peaks$GC = GC_classes.df
581
+  
582
+  .printExecutionTime(start)
583
+  
584
+  GRN
585 585
 }
586 586
 
587 587
 #' Filter data from a \code{\linkS4class{GRN}} object
... ...
@@ -637,8 +637,8 @@ filterData <- function (GRN,
637 637
     if(!is.null(GRN@data$TFs$TF_peak_overlap)) {
638 638
       GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = 0
639 639
     }
640
-  
641
-
640
+    
641
+    
642 642
     # Filter peaks
643 643
     futile.logger::flog.info("FILTER PEAKS")
644 644
     peakIDs.CV = .filterPeaksByMeanCV(GRN, 
... ...
@@ -660,7 +660,7 @@ filterData <- function (GRN,
660 660
     #GRN@data$peaks$counts_raw$isFiltered = ! GRN@data$peaks$counts_raw$peakID  %in% peaks_toKeep
661 661
     GRN@data$peaks$counts_norm$isFiltered = ! GRN@data$peaks$counts_norm$peakID  %in% peaks_toKeep
662 662
     
663
-
663
+    
664 664
     if(!is.null(GRN@data$TFs$TF_peak_overlap)) {
665 665
       GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = as.integer (! rownames(GRN@data$TFs$TF_peak_overlap) %in% peaks_toKeep)
666 666
     }
... ...
@@ -671,9 +671,9 @@ filterData <- function (GRN,
671 671
     # Filter peaks
672 672
     futile.logger::flog.info("FILTER RNA-seq")
673 673
     genes.CV = .filterGenesByMeanCV(GRN, 
674
-                                      minMean = minNormalizedMeanRNA, maxMean = maxNormalizedMeanRNA, 
675
-                                      minCV = minCV_genes, maxCV = maxCV_genes) 
676
- 
674
+                                    minMean = minNormalizedMeanRNA, maxMean = maxNormalizedMeanRNA, 
675
+                                    minCV = minCV_genes, maxCV = maxCV_genes) 
676
+    
677 677
     futile.logger::flog.info(paste0(" Number of rows in total: ", nrow(GRN@data$RNA$counts_norm.l[["0"]])))
678 678
     for (permutationCur in c(0)) {
679 679
       permIndex = as.character(permutationCur)
... ...
@@ -681,7 +681,7 @@ filterData <- function (GRN,
681 681
       GRN@data$RNA$counts_norm.l[[permIndex]]$isFiltered = rowMeans < minNormalizedMeanRNA 
682 682
     }
683 683
     nRowsFlagged = length(which(GRN@data$RNA$counts_norm.l[["0"]]$isFiltered))
684
-
684
+    
685 685
     # Raw counts are left untouched and filtered where needed only
686 686
     futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows because the row mean was smaller than ", minNormalizedMeanRNA))
687 687
     
... ...
@@ -699,7 +699,7 @@ filterData <- function (GRN,
699 699
   startTime = Sys.time()
700 700
   
701 701
   if (is.null(minSize_peaks)) {
702
-      minSize_peaks = 1
702
+    minSize_peaks = 1
703 703
   }
704 704
   
705 705
   futile.logger::flog.info(paste0("Filter and sort peaks and remain only those on the following chromosomes: ", paste0(chrToKeep, collapse = ",")))
... ...
@@ -730,111 +730,111 @@ filterData <- function (GRN,
730 730
 }
731 731
 
732 732
 .filterPeaksByCV <- function (GRN, minCV = 0, maxCV = 1e7) {
733
+  
734
+  startTime = Sys.time()
735
+  
736
+  if (is.null(maxCV)) {
737
+    futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV))
733 738
     
734
-    startTime = Sys.time()
735
-    
736
-    if (is.null(maxCV)) {
737
-        futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV))
738
-        
739
-        peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV)
740
-        
741
-    } else {
742
-        futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV, ", max CV = ", maxCV))
743
-        
744
-        peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV, peak.CV <= maxCV)
745
-    }
746
-    
747
-    futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
739
+    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV)
748 740
     
749
-    .printExecutionTime(startTime)
741
+  } else {
742
+    futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV, ", max CV = ", maxCV))
750 743
     
751
-    peaksFiltered$peak.ID
744
+    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV, peak.CV <= maxCV)
745
+  }
746
+  
747
+  futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
748
+  
749
+  .printExecutionTime(startTime)
750
+  
751
+  peaksFiltered$peak.ID
752 752
 }
753 753
 
754 754
 .filterPeaksByMeanCV <- function (GRN, minMean = 0, maxMean = NULL, minCV = 0, maxCV = NULL) {
755
+  
756
+  startTime = Sys.time()
757
+  
758
+  futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$consensusPeaks)))
759
+  
760
+  if (is.null(minCV)) {
761
+    minCV = 0
762
+  }
763
+  
764
+  if (is.null(maxCV)) {
765
+    futile.logger::flog.info(paste0("  Filter peaks by CV: Min = ", minCV))
766
+    maxCV = 9e+99
755 767
     
756
-    startTime = Sys.time()
757
-    
758
-    futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$consensusPeaks)))
759
-    
760
-    if (is.null(minCV)) {
761
-        minCV = 0
762
-    }
763
-    
764
-    if (is.null(maxCV)) {
765
-        futile.logger::flog.info(paste0("  Filter peaks by CV: Min = ", minCV))
766
-        maxCV = 9e+99
767
-        
768
-    } else {
769
-        futile.logger::flog.info(paste0("  Filter peaks by CV: Min = ", minCV, ", Max = ", maxCV))
770
-    }
771
-    
772
-    
773
-    if (is.null(minMean)) {
774
-        minMean = 0
775
-    }
776
-    
777
-    if (is.null(maxMean)) {
778
-        futile.logger::flog.info(paste0("  Filter peaks by mean: Min = ", minMean))
779
-        maxMean = 9e+99
780
-    } else {
781
-        futile.logger::flog.info(paste0("  Filter peaks by mean: Min = ", minMean, ", Max = ", maxMean))  
782
-    }   
783
-    
784
-    
785
-    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, 
786
-                                  peak.CV >= minCV, peak.CV <= maxCV, 
787
-                                  peak.mean >= minMean, peak.mean <= maxMean)
788
-    
789
-    futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
790
-    
791
-    .printExecutionTime(startTime)
792
-    
793
-    peaksFiltered$peak.ID
768
+  } else {
769
+    futile.logger::flog.info(paste0("  Filter peaks by CV: Min = ", minCV, ", Max = ", maxCV))
770
+  }
771
+  
772
+  
773
+  if (is.null(minMean)) {
774
+    minMean = 0
775
+  }
776
+  
777
+  if (is.null(maxMean)) {
778
+    futile.logger::flog.info(paste0("  Filter peaks by mean: Min = ", minMean))
779
+    maxMean = 9e+99
780
+  } else {
781
+    futile.logger::flog.info(paste0("  Filter peaks by mean: Min = ", minMean, ", Max = ", maxMean))  
782
+  }   
783
+  
784
+  
785
+  peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, 
786
+                                peak.CV >= minCV, peak.CV <= maxCV, 
787
+                                peak.mean >= minMean, peak.mean <= maxMean)
788
+  
789
+  futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
790
+  
791
+  .printExecutionTime(startTime)
792
+  
793
+  peaksFiltered$peak.ID
794 794
 }
795 795
 
796 796
 .filterGenesByMeanCV <- function (GRN, minMean = 0, maxMean = NULL, minCV = 0, maxCV = NULL) {
797
+  
798
+  startTime = Sys.time()
799
+  
800
+  futile.logger::flog.info(paste0(" Number of genes before filtering : ", nrow(GRN@annotation$genes)))
801
+  
802
+  if (is.null(minCV)) {
803
+    minCV = 0
804
+  }
805
+  
806
+  if (is.null(maxCV)) {
807
+    futile.logger::flog.info(paste0("  Filter genes by CV: Min = ", minCV))
808
+    maxCV = 9e+99
797 809
     
798
-    startTime = Sys.time()
799
-    
800
-    futile.logger::flog.info(paste0(" Number of genes before filtering : ", nrow(GRN@annotation$genes)))
801
-    
802
-    if (is.null(minCV)) {
803
-        minCV = 0
804
-    }
805
-    
806
-    if (is.null(maxCV)) {
807
-        futile.logger::flog.info(paste0("  Filter genes by CV: Min = ", minCV))
808
-        maxCV = 9e+99
809
-        
810
-    } else {
811
-        futile.logger::flog.info(paste0("  Filter genes by CV: Min = ", minCV, ", Max = ", maxCV))
812
-    }
813
-    
814
-    
815
-    if (is.null(minMean)) {
816
-        minMean = 0
817
-    }
818
-    
819
-    
820
-    if (is.null(maxMean)) {
821
-        futile.logger::flog.info(paste0("  Filter genes by mean: Min = ", minMean))
822
-        maxMean = 9e+99
823
-    } else {
824
-        futile.logger::flog.info(paste0("  Filter genes by mean: Min = ", minMean, ", Max = ", maxMean))  
825
-    }   
826
-    
827
-    
828
-    genesFiltered = dplyr::filter(GRN@annotation$genes, 
829
-                                  gene.CV >= minCV, gene.CV <= maxCV, 
830
-                                  gene.mean >= minMean, gene.mean <= maxMean)
831
-
832
-    
833
-    futile.logger::flog.info(paste0(" Number of genes after filtering : ", nrow(genesFiltered)))
834
-    
835
-    .printExecutionTime(startTime)
836
-    
837
-    genesFiltered$gene.ENSEMBL 
810
+  } else {
811
+    futile.logger::flog.info(paste0("  Filter genes by CV: Min = ", minCV, ", Max = ", maxCV))
812
+  }
813
+  
814
+  
815
+  if (is.null(minMean)) {
816
+    minMean = 0
817
+  }
818
+  
819
+  
820
+  if (is.null(maxMean)) {
821
+    futile.logger::flog.info(paste0("  Filter genes by mean: Min = ", minMean))
822
+    maxMean = 9e+99
823
+  } else {
824
+    futile.logger::flog.info(paste0("  Filter genes by mean: Min = ", minMean, ", Max = ", maxMean))  
825
+  }   
826
+  
827
+  
828
+  genesFiltered = dplyr::filter(GRN@annotation$genes, 
829
+                                gene.CV >= minCV, gene.CV <= maxCV, 
830
+                                gene.mean >= minMean, gene.mean <= maxMean)
831
+  
832
+  
833
+  futile.logger::flog.info(paste0(" Number of genes after filtering : ", nrow(genesFiltered)))
834
+  
835
+  .printExecutionTime(startTime)
836
+  
837
+  genesFiltered$gene.ENSEMBL 
838 838
 }
839 839
 
840 840
 
... ...
@@ -855,9 +855,9 @@ filterData <- function (GRN,
855 855
 #' # See the Workflow vignette on the GRaNIE website for examples
856 856
 #' @export
857 857
 addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPattern = "_TFBS", fileEnding = ".bed", forceRerun = FALSE) {
858
-
858
+  
859 859
   GRN = .addFunctionLogToObject(GRN)
860
-    
860
+  
861 861
   checkmate::assertClass(GRN, "GRN")
862 862
   checkmate::assertDirectoryExists(motifFolder)
863 863
   checkmate::assertCharacter(TFs, min.len = 1)
... ...
@@ -867,12 +867,12 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
867 867
   checkmate::assertFlag(forceRerun)
868 868
   
869 869
   if (is.null(GRN@data$TFs$translationTable) | is.null(GRN@data$TFs$translationTable) | is.null(GRN@config$allTF)  | is.null(GRN@config$directories$motifFolder) | forceRerun) {
870
-
870
+    
871 871
     GRN@config$TFBS_fileEnding  = fileEnding
872 872
     GRN@config$TFBS_filePattern = filesTFBSPattern
873 873
     GRN@data$TFs$translationTable = .getFinalListOfTFs(motifFolder, filesTFBSPattern, fileEnding, TFs, nTFMax, getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE))
874 874
     
875
-
875
+    
876 876
     GRN@data$TFs$translationTable = GRN@data$TFs$translationTable %>%
877 877
       dplyr::select(c("HOCOID", "ENSEMBL")) %>%
878 878
       dplyr::mutate(TF.name = HOCOID, TF.ENSEMBL = ENSEMBL) 
... ...
@@ -884,8 +884,8 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
884 884
     # GRN@config$TF_list = list()
885 885
     # GRN@config$TF_list[["all_TFBS"]] =GRN@config$allTF
886 886
     GRN@config$directories$motifFolder = motifFolder
887
-  
888
-  
887
+    
888
+    
889 889
   } 
890 890
   
891 891
   GRN
... ...
@@ -899,9 +899,9 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
899 899
   files = .createFileList(folder_input_TFBS, "*.bed*", recursive = FALSE, ignoreCase = FALSE, verbose = FALSE)
900 900
   TFsWithTFBSPredictions = gsub(pattern = filesTFBSPattern, "", tools::file_path_sans_ext(basename(files), compression = TRUE))
901 901
   TFsWithTFBSPredictions = gsub(pattern = fileEnding, "", TFsWithTFBSPredictions)
902
-
902
+  
903 903
   futile.logger::flog.info(paste0("Found ", length(TFsWithTFBSPredictions), " matching TFs: ", paste0(TFsWithTFBSPredictions, collapse = ", ")))
904
-
904
+  
905 905
   
906 906
   # Filter TFs
907 907
   if (length(TFs) == 1 && TFs == "all") {
... ...
@@ -946,7 +946,7 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
946 946
       allTF = allTF[seq_len(nTFMax)]
947 947
       futile.logger::flog.info(paste0("Updated list of TFs: ", paste0(allTF, collapse = ", ")))
948 948
     } 
949
-
949
+    
950 950
   }
951 951
   
952 952
   nTF = length(allTF)
... ...
@@ -974,18 +974,18 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
974 974
 #' GRN = overlapPeaksAndTFBS(GRN, nCores = 2, forceRerun = FALSE)
975 975
 #' @export
976 976
 overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
977
-
977
+  
978 978
   GRN = .addFunctionLogToObject(GRN)
979
-    
979
+  
980 980
   checkmate::assertClass(GRN, "GRN")
981 981
   checkmate::assertIntegerish(nCores, lower = 1)
982 982
   checkmate::assertFlag(forceRerun)
983 983
   
984 984
   if (is.null(GRN@data$TFs$TF_peak_overlap) | forceRerun) {
985 985
     
986
-
986
+    
987 987
     futile.logger::flog.info(paste0("Overlap peaks and TFBS using ", nCores, " cores. This may take a few minutes..."))
988
-
988
+    
989 989
     genomeAssembly = GRN@config$parameters$genomeAssembly
990 990
     seqlengths = .getChrLengths(genomeAssembly)
991 991
     
... ...
@@ -995,7 +995,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
995 995
       message = "Could not retrieve value from GRN@config$TFBS_filePattern. Please rerun the function addTFBS, as this was added in a recent version of the package."
996 996
       .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
997 997
     }
998
-   
998
+    
999 999
     
1000 1000
     # Check whether we have peaks on chromosomes not part of the sequence length reference. If yes, discard them
1001 1001
     annotation_discared = dplyr::filter(GRN@data$peaks$consensusPeaks, ! .data$chr %in% names(seqlengths))
... ...
@@ -1006,8 +1006,8 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1006 1006
       tbl_discarded = tbl_discarded[which(tbl_discarded > 0)]
1007 1007
       
1008 1008
       futile.logger::flog.warn(paste0("Found ", sum(tbl_discarded), " regions from chromosomes without a reference length. ", 
1009
-                       "Typically, these are random fragments from known or unknown chromosomes. The following regions will be discarded: \n",
1010
-                       paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ",")))
1009
+                                      "Typically, these are random fragments from known or unknown chromosomes. The following regions will be discarded: \n",
1010
+                                      paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ",")))
1011 1011
       
1012 1012
       GRN@data$peaks$consensusPeaks = dplyr::filter(GRN@data$peaks$consensusPeaks, .data$chr %in% names(seqlengths))
1013 1013
     }
... ...
@@ -1027,9 +1027,9 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1027 1027
     if (!all(colnames(TFBS_bindingMatrix.df) %in% GRN@config$allTF)) {
1028 1028
       
1029 1029
       message = paste0("Internal mismatch detected between the TF names and the TF names derived from the translation file (see log, column HOCOID).", 
1030
-                        "This may happen if the genome assembly version has been changed, but intermediate files have not been properly recreated. ",
1031
-                        "Set the parameter forceRerun to TRUE and rerun the script.")
1032
-
1030
+                       "This may happen if the genome assembly version has been changed, but intermediate files have not been properly recreated. ",
1031
+                       "Set the parameter forceRerun to TRUE and rerun the script.")
1032
+      
1033 1033
       .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1034 1034
     }
1035 1035
     
... ...
@@ -1037,11 +1037,11 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1037 1037
     filteredPeaks = dplyr::filter(GRN@data$peaks$counts_norm, isFiltered) %>% dplyr::pull(peakID)
1038 1038
     # Collect binary 0/1 binding matrix from all TF and concatenate
1039 1039
     GRN@data$TFs$TF_peak_overlap = TFBS_bindingMatrix.df %>%
1040
-        dplyr::mutate(peakID = GRN@data$peaks$consensusPeaks$peakID,
1041
-                      isFiltered = peakID %in% filteredPeaks) %>%
1042
-        dplyr::mutate_if(is.logical, as.numeric) %>%
1043
-        dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), isFiltered)
1044
-        
1040
+      dplyr::mutate(peakID = GRN@data$peaks$consensusPeaks$peakID,
1041
+                    isFiltered = peakID %in% filteredPeaks) %>%
1042
+      dplyr::mutate_if(is.logical, as.numeric) %>%
1043
+      dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), isFiltered)
1044
+    
1045 1045
     GRN@data$TFs$TF_peak_overlap = .asSparseMatrix(as.matrix(GRN@data$TFs$TF_peak_overlap), 
1046 1046
                                                    convertNA_to_zero = FALSE, 
1047 1047
                                                    dimnames = list(GRN@data$peaks$consensusPeaks$peakID, colnames(GRN@data$TFs$TF_peak_overlap)))
... ...
@@ -1050,7 +1050,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
1050 1050
     # We resort it to match the countsPeaks.norm
1051 1051
     # TODO: Here could be an error due to differences in sorting. Also, whether or not all or the filtered version shall be used
1052 1052
     stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_norm$peakID))
1053
-
1053
+    
1054 1054
   } 
1055 1055
   
1056 1056
   GRN
... ...
@@ -1233,7 +1233,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1233 1233
   checkmate::assertChoice(normalization, c("cyclicLoess", "sizeFactors", "quantile", "none"))
1234 1234
   checkmate::assertCharacter(name, min.chars = 1, len = 1)
1235 1235
   checkmate::assertFlag(forceRerun)
1236
-
1236
+  
1237 1237
   forbiddenNames = "expression"
1238 1238
   .checkForbiddenNames(name, forbiddenNames)
1239 1239
   
... ...
@@ -1247,7 +1247,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1247 1247
       tibble::as_tibble()
1248 1248
     
1249 1249
     countsPeaks = .normalizeNewDataWrapper(GRN@data$peaks$counts_orig, normalization = normalization)
1250
-
1250
+    
1251 1251
     stopifnot(identical(nrow(countsPeaks), nrow(GRN@data$TFs$TF_peak_overlap)))
1252 1252
     
1253 1253
     #Select a maximum set of TFs to run this for
... ...
@@ -1258,10 +1258,10 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1258 1258
     # Calculating TF activity is done for all TF that are available
1259 1259
     TF.activity.m = matrix(NA, nrow = length(allTF), ncol = length(GRN@config$sharedSamples), 
1260 1260
                            dimnames = list(allTF, GRN@config$sharedSamples))
1261
-  
1261
+    
1262 1262
     
1263 1263
     pb <- progress::progress_bar$new(total = length(allTF))
1264
-
1264
+    
1265 1265
     for (TFCur in allTF) {
1266 1266
       
1267 1267
       pb$tick()
... ...
@@ -1297,7 +1297,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1297 1297
     
1298 1298
     futile.logger::flog.info(paste0("Data already exists in object (slot ", name, "), nothing to do. Set forceRerun = TRUE to regenerate and overwrite."))
1299 1299
   }
1300
- 
1300
+  
1301 1301
   
1302 1302
   .printExecutionTime(start)
1303 1303
   
... ...
@@ -1416,7 +1416,7 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF
1416 1416
     
1417 1417
     forbiddenNames = c("TF_activity", "expression")
1418 1418
     .checkForbiddenNames(name, forbiddenNames)
1419
-
1419
+    
1420 1420
     idColumns = idColumn
1421 1421
     data = dplyr::rename(data, TF.name = !!(nameColumn))
1422 1422
     idColumns = c(idColumns, "TF.name")
... ...
@@ -1481,7 +1481,7 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF
1481 1481
     
1482 1482
     
1483 1483
   } else {
1484
-      
1484
+    
1485 1485
     futile.logger::flog.info(paste0("Data already exists in object, nothing to do. Set forceRerun = TRUE to regenerate and overwrite."))
1486 1486
     
1487 1487
   }
... ...
@@ -1507,7 +1507,7 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF
1507 1507
 #' @examples 
1508 1508
 #' # See the Workflow vignette on the GRaNIE website for examples
1509 1509
 #' GRN = loadExampleObject()
1510
-#' GRN = AR_classification_wrapper(GRN, forceRerun = FALSE)
1510
+#' GRN = AR_classification_wrapper(GRN, outputFolder = ".", forceRerun = FALSE)
1511 1511
 #' @export
1512 1512
 AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05, 
1513 1513
                                       plot_minNoTFBS_heatmap = 100, deleteIntermediateData = TRUE,
... ...
@@ -1516,7 +1516,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1516 1516
                                       forceRerun = FALSE) {
1517 1517
   
1518 1518
   GRN = .addFunctionLogToObject(GRN)
1519
-    
1519
+  
1520 1520
   checkmate::assertClass(GRN, "GRN")
1521 1521
   checkmate::assertNumber(significanceThreshold_Wilcoxon, lower = 0, upper = 1)
1522 1522
   checkmate::assertNumber(plot_minNoTFBS_heatmap, lower = 1)
... ...
@@ -1544,7 +1544,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1544 1544
   for (connectionTypeCur in connectionTypes) {
1545 1545
     
1546 1546
     futile.logger::flog.info(paste0(" Connection type ", connectionTypeCur, "\n"))
1547
-
1547
+    
1548 1548
     for (permutationCur in allPermutations) {
1549 1549
       
1550 1550
       futile.logger::flog.info(paste0(" ", .getPermStr(permutationCur), "\n"))
... ...
@@ -1562,6 +1562,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1562 1562
           is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background) |
1563 1563
           is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground) |
1564 1564
           is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background) |
1565
+          is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor) |
1565 1566
           forceRerun
1566 1567
       ) {
1567 1568
         
... ...
@@ -1569,17 +1570,17 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1569 1570
         
1570 1571
         if (connectionTypeCur == "expression") {
1571 1572
           counts1 = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))
1572
-
1573
+          
1573 1574
         } else {
1574 1575
           
1575 1576
           # TF activity data
1576 1577
           counts1 = GRN@data$TFs[[connectionTypeCur]] %>% 
1577 1578
             dplyr::select(-TF.name)
1578
-
1579
+          
1579 1580
         } 
1580 1581
         
1581 1582
         futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
1582
-
1583
+        
1583 1584
         counts_peaks = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
1584 1585
         
1585 1586
         TF_peak_cor = .correlateMatrices(matrix1      = counts1, 
... ...
@@ -1592,13 +1593,15 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1592 1593
         GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background = res.l[["median_background"]]
1593 1594
         GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground   = res.l[["foreground"]]
1594 1595
         GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background   = res.l[["background"]]
1596
+        
1597
+        GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor = TF_peak_cor
1595 1598
       }
1596 1599
       
1597 1600
       # Final classification: Calculate thresholds by calculating the quantiles of the background and compare the real values to the background
1598 1601
       if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l) | forceRerun) {
1599 1602
         GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l = 
1600 1603
           .calculate_classificationThresholds(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
1601
-                                             GRN@config$parameters)
1604
+                                              GRN@config$parameters)
1602 1605
       }
1603 1606
       
1604 1607
       if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification) | forceRerun) {
... ...
@@ -1627,9 +1630,9 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1627 1630
         fileCur = paste0(outputFolder, .getOutputFileName("plot_class_density"), "_", connectionTypeCur, suffixFile)
1628 1631
         if (!file.exists(fileCur) | forceRerun) {
1629 1632
           .plot_density(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground),
1630
-                       .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
1631
-                       corMethod,
1632
-                       fileCur, width = 5, height = 5)
1633
+                        .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), 
1634
+                        corMethod,
1635
+                        fileCur, width = 5, height = 5)
1633 1636
         } else {
1634 1637
           futile.logger::flog.info(paste0("  File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
1635 1638
         }
... ...
@@ -1649,17 +1652,19 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1649 1652
         
1650 1653
         fileCur = paste0(outputFolder, .getOutputFileName("plot_class_densityClass"), "_", connectionTypeCur, suffixFile)
1651 1654
         if (!file.exists(fileCur) | forceRerun) {
1655
+          
1656
+          TF_peak_cor = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor
1652 1657
           peak_TF_overlapCur.df = .filterSortAndShuffle_peakTF_overlapTable(GRN, permutationCur, TF_peak_cor)
1653 1658
           .plot_heatmapAR(TF.peakMatrix.df = peak_TF_overlapCur.df, 
1654
-                         HOCOMOCO_mapping.df.exp = GRN@data$TFs$translationTable %>% dplyr::mutate(TF = TF.name), 
1655
-                         sort.cor.m = TF_peak_cor, 
1656
-                         par.l = GRN@config$parameters, 
1657
-                         corMethod = corMethod,
1658
-                         median.cor.tfs = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground), 
1659
-                         median.cor.tfs.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background), 
1660
-                         act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l, 
1661
-                         finalClassification = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification,
1662
-                         file = fileCur, width = 8, height = 15)
1659
+                          HOCOMOCO_mapping.df.exp = GRN@data$TFs$translationTable %>% dplyr::mutate(TF = TF.name), 
1660
+                          sort.cor.m = TF_peak_cor, 
1661
+                          par.l = GRN@config$parameters, 
1662
+                          corMethod = corMethod,
1663
+                          median.cor.tfs = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground), 
1664
+                          median.cor.tfs.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background), 
1665
+                          act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l, 
1666
+                          finalClassification = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification,
1667
+                          file = fileCur, width = 8, height = 15)
1663 1668
         } else {
1664 1669
           futile.logger::flog.info(paste0("  File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
1665 1670
         }
... ...
@@ -1725,7 +1730,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
1725 1730
 #' @examples 
1726 1731
 #' # See the Workflow vignette on the GRaNIE website for examples
1727 1732
 #' GRN = loadExampleObject()
1728
-#' GRN = addConnections_TF_peak(GRN, forceRerun = FALSE)
1733
+#' GRN = addConnections_TF_peak(GRN, outputFolder = ".", forceRerun = FALSE)
1729 1734
 #' @export
1730 1735
 addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails = FALSE, outputFolder = NULL, 
1731 1736
                                     corMethod = "pearson", 
... ...
@@ -1734,9 +1739,9 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1734 1739
                                     maxFDRToStore = 0.3, 
1735 1740
                                     useGCCorrection = FALSE, percBackground_size = 75, percBackground_resample = TRUE,
1736 1741
                                     forceRerun = FALSE) {
1737
-    
1742
+  
1738 1743
   GRN = .addFunctionLogToObject(GRN)
1739
-
1744
+  
1740 1745
   checkmate::assertClass(GRN, "GRN")
1741 1746
   checkmate::assertFlag(plotDiagnosticPlots)
1742 1747
   checkmate::assertFlag(plotDetails)
... ...
@@ -1744,18 +1749,18 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1744 1749
   
1745 1750
   GRN = .checkAndUpdateConnectionTypes(GRN) # For compatibility with older versions
1746 1751
   checkmate::assertSubset(connectionTypes, GRN@config$TF_peak_connectionTypes)
1747
-
1752
+  
1748 1753
   checkmate::assertLogical(removeNegativeCorrelation, any.missing = FALSE, len = length(connectionTypes))
1749 1754
   
1750 1755
   
1751 1756
   #checkmate::assert(checkmate::checkSubset(add_TFActivity, c("none", "calculate", names(slot(GRN, "data")[["TFs"]]))))
1752
-
1757
+  
1753 1758
   checkmate::assertNumber(maxFDRToStore, lower = 0, upper = 1)
1754 1759
   checkmate::assertFlag(useGCCorrection)
1755 1760
   checkmate::assertNumber(percBackground_size, lower = 0, upper = 100)
1756 1761
   checkmate::assertFlag(percBackground_resample)
1757 1762
   checkmate::assertFlag(forceRerun)
1758
-
1763
+  
1759 1764
   
1760 1765
   if (is.null(GRN@connections$TF_peaks) | forceRerun) {
1761 1766
     
... ...
@@ -1763,12 +1768,12 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1763 1768
     
1764 1769
     GRN@config$parameters$corMethod_TF_Peak = corMethod
1765 1770
     GRN@config$parameters$useGCCorrection = useGCCorrection 
1766
-
1771
+    
1767 1772
     if (is.null(GRN@data$TFs$TF_peak_overlap)) {
1768
-        message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
1769
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1773
+      message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
1774
+      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1770 1775
     }
1771
-
1776
+    
1772 1777
     for (permutationCur in 0:.getMaxPermutation(GRN)) {
1773 1778
       
1774 1779
       futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
... ...
@@ -1792,11 +1797,11 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1792 1797
       
1793 1798
     } #end for each permutation
1794 1799
     
1795
-
1800
+    
1796 1801
     if (plotDiagnosticPlots) {
1797
-        
1798
-        plotDiagnosticPlots_TFPeaks(GRN, outputFolder = outputFolder, plotDetails = FALSE, forceRerun = forceRerun)
1799
-        
1802
+      
1803
+      plotDiagnosticPlots_TFPeaks(GRN, outputFolder = outputFolder, plotDetails = FALSE, forceRerun = forceRerun)
1804
+      
1800 1805
     }
1801 1806
     
1802 1807
   }
... ...
@@ -1810,10 +1815,10 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1810 1815
                                 removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, percBackground_resample = TRUE, plotDetails = FALSE) {
1811 1816
   
1812 1817
   start = Sys.time()
1813
-
1818
+  
1814 1819
   if (plotDetails) {
1815
-      message = "Plotting details is not supported currently. Set plotDetails = FALSE."
1816
-      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1820
+    message = "Plotting details is not supported currently. Set plotDetails = FALSE."
1821
+    .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1817 1822
   }
1818 1823
   
1819 1824
   peak_TF_overlap.df= .filterSortAndShuffle_peakTF_overlapTable(GRN, perm)
... ...
@@ -1831,17 +1836,17 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1831 1836
     
1832 1837
     futile.logger::flog.info(paste0("Calculate TF-peak links for connection type ", connectionTypeCur))
1833 1838
     start2 = Sys.time()
1834
-
1839
+    
1835 1840
     if (connectionTypeCur == "expression") {
1836 1841
       
1837 1842
       counts_connectionTypeCur = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))
1838
-
1843
+      
1839 1844
     } else {
1840 1845
       
1841 1846
       # Keep only Ensembl ID here
1842 1847
       counts_connectionTypeCur = GRN@data$TFs[[connectionTypeCur]] %>% 
1843 1848
         dplyr::select(-TF.name)
1844
-
1849
+      
1845 1850
     } 
1846 1851
     
1847 1852
     futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
... ...
@@ -1872,7 +1877,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1872 1877
     sort.cor.m.sort<-peaksCor.m[,colnames(peak_TF_overlap.df)]
1873 1878
     
1874 1879
     stopifnot(identical(colnames(sort.cor.m.sort), colnames(peak_TF_overlap.df)))
1875
-
1880
+    
1876 1881
     futile.logger::flog.info(paste0("  Compute FDR for each TF. This may take a while..."))
1877 1882
     
1878 1883
     pb <- progress::progress_bar$new(total = length(allTF))
... ...
@@ -1880,9 +1885,9 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1880 1885
     peaksFiltered = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered) 
1881 1886
     
1882 1887
     if (!useGCCorrection) {
1883
-        minPerc = 100
1884
-        # Since we do not control for this, we set it to NA
1885
-        background_match_success = NA
1888
+      minPerc = 100
1889
+      # Since we do not control for this, we set it to NA
1890
+      background_match_success = NA
1886 1891
     }
1887 1892
     
1888 1893
     for (TFCur in allTF) {
... ...
@@ -1895,7 +1900,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1895 1900
       
1896 1901
       tp <- sort.cor.m.sort[overlapYes, TFCur]
1897 1902
       n_tp       = length(tp)
1898
-
1903
+      
1899 1904
       peaksForeground =                     peaksFiltered %>% dplyr::slice(overlapYes)
1900 1905
       peaksBackground = peaksBackgroundGC = peaksFiltered %>% dplyr::slice(overlapNo)
1901 1906
       
... ...
@@ -1906,138 +1911,138 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
1906 1911
       #start = Sys.time()
1907 1912
       # GC-adjust background and select background regions according to foreground
1908 1913
       if (useGCCorrection) {
1914
+        
1915
+        fp_orig <- sort.cor.m.sort[overlapNo, TFCur]
1916
+        n_fp_orig  = length(fp_orig)
1917
+        
1918
+        # Get GC info from those peaks from the foreground
1919
+        GC_classes_foreground.df = peaksForeground %>%
1920
+          dplyr::group_by(GC_class) %>%
1921
+          dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
1922
+          dplyr::ungroup() %>% 
1923
+          tidyr::complete(GC_class, fill = list(n = 0)) %>%
1924
+          dplyr::mutate(n_rel = .data$n / nPeaksForeground, type = "foreground") %>%
1925
+          dplyr::arrange(dplyr::desc(n_rel))
1926
+        
1927
+        GC_classes_background.df = peaksBackground %>%
1928
+          dplyr::group_by(GC_class) %>%
1929
+          dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
1930
+          dplyr::ungroup() %>% 
1931
+          tidyr::complete(GC_class, fill = list(n = 0)) %>%
1932
+          dplyr::mutate(n_rel = .data$n / nPeaksBackground, type = "background_orig")
1933
+        
1934
+        
1935
+        
1936
+        background_match_success = TRUE
1937
+        
1938
+        if (!is.null(percBackground_size)) {
1939
+          minPerc = percBackground_size
1940
+        } else {
1909 1941
           
1910
-          fp_orig <- sort.cor.m.sort[overlapNo, TFCur]
1911
-          n_fp_orig  = length(fp_orig)
1912
-          
1913
-          # Get GC info from those peaks from the foreground
1914
-          GC_classes_foreground.df = peaksForeground %>%
1915
-              dplyr::group_by(GC_class) %>%
1916
-              dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
1917
-              dplyr::ungroup() %>% 
1918
-              tidyr::complete(GC_class, fill = list(n = 0)) %>%
1919
-              dplyr::mutate(n_rel = .data$n / nPeaksForeground, type = "foreground") %>%
1920
-              dplyr::arrange(dplyr::desc(n_rel))
1921
-          
1922
-          GC_classes_background.df = peaksBackground %>%
1923
-              dplyr::group_by(GC_class) %>%
1924
-              dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
1925
-              dplyr::ungroup() %>% 
1926
-              tidyr::complete(GC_class, fill = list(n = 0)) %>%
1927
-              dplyr::mutate(n_rel = .data$n / nPeaksBackground, type = "background_orig")
1928
-          
1942
+          # Find a matching background of maximal size
1943
+          minPerc= min(GC_class.all$maxSizeBackground) / nPeaksBackground
1929 1944
           
1945
+          threshold_percentage = 0.05
1946
+          minPerc = .findMaxBackgroundSize(GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage =  threshold_percentage)
1930 1947
           
1931
-          background_match_success = TRUE
1948
+          if (minPerc == 0) {
1949
+            #futile.logger::flog.warn(paste0(" Mimicking the foreground failed for TF ", TFCur, ". The background will only be approximated as good as possible using 5% of the peaks."))
1950
+            minPerc = 5
1951
+            background_match_success = FALSE
1952
+          }
1953
+        }
1954
+        
1955
+        targetNoPeaks = minPerc/100 * nPeaksBackground
1956
+        
1957
+        # TODO: Make 1000 a parameter stored somewhere
1958
+        # Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions
1959
+        if (targetNoPeaks < 1000) {
1932 1960
           
1933 1961
           if (!is.null(percBackground_size)) {
1934
-              minPerc = percBackground_size
1962
+            #futile.logger::flog.warn(paste0("Number of peaks in background is smaller than 1000 for TF ", TFCur, ". Increasing in steps of 5% until a value > 1000 is found.This warning results from a too low value for the parameter percBackground_size"))
1935 1963
           } else {
1936
-              
1937
-              # Find a matching background of maximal size
1938
-              minPerc= min(GC_class.all$maxSizeBackground) / nPeaksBackground
1939
-              
1940
-              threshold_percentage = 0.05
1941
-              minPerc = .findMaxBackgroundSize(GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage =  threshold_percentage)
1942
-              
1943
-              if (minPerc == 0) {
1944
-                  #futile.logger::flog.warn(paste0(" Mimicking the foreground failed for TF ", TFCur, ". The background will only be approximated as good as possible using 5% of the peaks."))
1945
-                  minPerc = 5
1946
-                  background_match_success = FALSE
1947
-              }
1964
+            background_match_success = FALSE
1948 1965
           }
1949 1966
           
1950
-          targetNoPeaks = minPerc/100 * nPeaksBackground
1951
-          
1952
-          # TODO: Make 1000 a parameter stored somewhere
1953
-          # Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions
1954
-          if (targetNoPeaks < 1000) {
1955
-              
1956
-              if (!is.null(percBackground_size)) {
1957
-                  #futile.logger::flog.warn(paste0("Number of peaks in background is smaller than 1000 for TF ", TFCur, ". Increasing in steps of 5% until a value > 1000 is found.This warning results from a too low value for the parameter percBackground_size"))
1958
-              } else {
1959
-                  background_match_success = FALSE
1960
-              }
1961
-              
1962
-              targetNoPeaksNew = targetNoPeaks
1963
-              while(targetNoPeaksNew < 1000) {
1964
-                  minPerc = minPerc + 5
1965
-                  targetNoPeaksNew = minPerc/100 * nPeaksBackground
1966
-              }
1967
-              
1968
-              
1967
+          targetNoPeaksNew = targetNoPeaks
1968
+          while(targetNoPeaksNew < 1000) {
1969
+            minPerc = minPerc + 5
1970
+            targetNoPeaksNew = minPerc/100 * nPeaksBackground
1969 1971
           }
1970 1972
           
1971
-          # Add sel. minPerc to table and calculate required frequencies
1972
-          
1973
-          GC_classes_all.df = dplyr::full_join(GC_classes_foreground.df, GC_classes_background.df, suffix = c(".fg",".bg"), by = "GC_class") %>%
1974
-              dplyr::mutate(maxSizeBackground = n.bg / n_rel.fg,
1975
-                            n.bg.needed = floor(n_rel.fg * targetNoPeaks), 
1976
-                            n.bg.needed.perc = n.bg / n.bg.needed) 
1977 1973
           
1974
+        }
1975
+        
1976
+        # Add sel. minPerc to table and calculate required frequencies
1977
+        
1978
+        GC_classes_all.df = dplyr::full_join(GC_classes_foreground.df, GC_classes_background.df, suffix = c(".fg",".bg"), by = "GC_class") %>%
1979
+          dplyr::mutate(maxSizeBackground = n.bg / n_rel.fg,
1980
+                        n.bg.needed = floor(n_rel.fg * targetNoPeaks), 
1981
+                        n.bg.needed.perc = n.bg / n.bg.needed) 
1982
+        
1983
+        
1984
+        #futile.logger::flog.info(paste0( " GC-adjustment: Randomly select a total of ", round(targetNoPeaks,0), 
1985
+        #                  " peaks (", minPerc, " %) from the background (out of ", nrow(peaksBackground), 
1986
+        #                  " overall) in a GC-binwise fashion to mimick the foreground distribution"))
1987
+        
1988
+        # Now we know the percentage, lets select an appropriate background
1989
+        # Sample peaks from background for each GC-bin specifically
1990
+        peakIDsSel = c()
1991
+        for (i in seq_len(nrow(GC_classes_foreground.df))) {
1978 1992
           
1979
-          #futile.logger::flog.info(paste0( " GC-adjustment: Randomly select a total of ", round(targetNoPeaks,0), 
1980
-          #                  " peaks (", minPerc, " %) from the background (out of ", nrow(peaksBackground), 
1981
-          #                  " overall) in a GC-binwise fashion to mimick the foreground distribution"))
1993
+          peaksBackgroundGCCur =  peaksBackground %>% dplyr::filter(GC_class == GC_classes_foreground.df$GC_class[i])
1982 1994
           
1983
-          # Now we know the percentage, lets select an appropriate background
1984
-          # Sample peaks from background for each GC-bin specifically
1985
-          peakIDsSel = c()
1986
-          for (i in seq_len(nrow(GC_classes_foreground.df))) {
1987
-              
1988
-              peaksBackgroundGCCur =  peaksBackground %>% dplyr::filter(GC_class == GC_classes_foreground.df$GC_class[i])
1989
-              
1990
-              if (nrow( peaksBackgroundGCCur) == 0) {
1991
-                  next
1992
-              }
1993
-              
1994
-              #Select the minimum, which for low % classes is smaller than the required number to mimic the foreground 100%
1995
-              if (percBackground_resample) {
1996
-                  nPeaksCur = GC_classes_all.df$n.bg.needed[i]    
1997
-              } else {
1998
-                  nPeaksCur = min(GC_classes_all.df$n.bg.needed[i], nrow(peaksBackgroundGCCur))
1999
-              }
2000
-              
2001
-              if (GC_classes_all.df$n.bg.needed[i] > nrow(peaksBackgroundGCCur)) {
2002
-                  peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = percBackground_resample) %>% dplyr::pull(peakID))  
2003
-              } else {
2004
-                  peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = FALSE) %>% dplyr::pull(peakID))
2005
-              }
2006
-              # Take a sample from the background, and record the row numbers
2007
-              
1995
+          if (nrow( peaksBackgroundGCCur) == 0) {
1996
+            next
2008 1997
           }
2009 1998
           
2010
-          # We cannot simply select now the peakIDs, as some peaks may be present multiple times
2011
-          #peaksBackgroundGC = peaksBackground %>% dplyr::filter(peakID %in% peakIDsSel) 
2012
-          peaksBackgroundGC = peaksBackground[match(peakIDsSel, peaksBackground$peakID),] 
2013
-          
2014
-          
2015
-          if (is.null(plots_GC.l[[TFCur]])) {
2016
-              plots_GC.l[[TFCur]] = .generateTF_GC_diagnosticPlots(TFCur, GC_classes_foreground.df, GC_classes_background.df, GC_classes_all.df, peaksForeground, peaksBackground, peaksBackgroundGC)
1999
+          #Select the minimum, which for low % classes is smaller than the required number to mimic the foreground 100%
2000
+          if (percBackground_resample) {
2001
+            nPeaksCur = GC_classes_all.df$n.bg.needed[i]    
2002
+          } else {
2003
+            nPeaksCur = min(GC_classes_all.df$n.bg.needed[i], nrow(peaksBackgroundGCCur))
2017 2004
           }
2018 2005
           
2019
-
2020
-          # Select the rows by their peak IDs
2021
-          fp    = sort.cor.m.sort[peakIDsSel, TFCur]
2006
+          if (GC_classes_all.df$n.bg.needed[i] > nrow(peaksBackgroundGCCur)) {
2007
+            peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = percBackground_resample) %>% dplyr::pull(peakID))  
2008
+          } else {
2009
+            peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = FALSE) %>% dplyr::pull(peakID))
2010
+          }
2011
+          # Take a sample from the background, and record the row numbers
2022 2012
           
2023
-          fp_orig = sort.cor.m.sort[overlapNo, TFCur]
2024
-          n_fp_orig  = length(fp_orig)
2025
-      
2013
+        }
2014
+        
2015
+        # We cannot simply select now the peakIDs, as some peaks may be present multiple times
2016
+        #peaksBackgroundGC = peaksBackground %>% dplyr::filter(peakID %in% peakIDsSel) 
2017
+        peaksBackgroundGC = peaksBackground[match(peakIDsSel, peaksBackground$peakID),] 
2018
+        
2019
+        
2020
+        if (is.null(plots_GC.l[[TFCur]])) {
2021
+          plots_GC.l[[TFCur]] = .generateTF_GC_diagnosticPlots(TFCur, GC_classes_foreground.df, GC_classes_background.df, GC_classes_all.df, peaksForeground, peaksBackground, peaksBackgroundGC)
2022
+        }
2023
+        
2024
+        
2025
+        # Select the rows by their peak IDs
2026
+        fp    = sort.cor.m.sort[peakIDsSel, TFCur]
2027
+        
2028
+        fp_orig = sort.cor.m.sort[overlapNo, TFCur]
2029
+        n_fp_orig  = length(fp_orig)
2030
+        
2026 2031
       } else {
2027
-          # TODO: Redudnant so far in this case
2028
-          fp    =  fp_orig = sort.cor.m.sort[overlapNo, TFCur]
2029
-          n_fp_orig  = length(fp_orig)
2032
+        # TODO: Redudnant so far in this case
2033
+        fp    =  fp_orig = sort.cor.m.sort[overlapNo, TFCur]
2034
+        n_fp_orig  = length(fp_orig)
2030 2035
       }
2031 2036
       
2032 2037
       n_fp = length(fp)
2033
-
2038
+      
2034 2039
       cor.peak.tf = tibble::tibble(peak.ID    = rownames(sort.cor.m.sort)[overlapYes], 
2035
-                           TF_peak.r  = sort.cor.m.sort[overlapYes, TFCur],
2036
-                           TF.name    = as.factor(TFCur),
2037
-                           TF_peak.connectionType = as.factor(connectionTypeCur))
2040
+                                   TF_peak.r  = sort.cor.m.sort[overlapYes, TFCur],
2041
+                                   TF.name    = as.factor(TFCur),
2042
+                                   TF_peak.connectionType = as.factor(connectionTypeCur))
2038 2043
       
2039 2044
       #val.sign      = (median(tp) - median(fp))
2040
-     # val.sign_orig = (median(tp) - median(fp_orig))
2045
+      # val.sign_orig = (median(tp) - median(fp_orig))
2041 2046
       
2042 2047
       #.printExecutionTime(start, "Interval 2: ")
2043 2048
       
... ...
@@ -2068,26 +2073,26 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2068 2073
         cor.peak.tf$TF_peak.r_bin <- as.character(cut(cor.peak.tf$TF_peak.r, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
2069 2074
         #.printExecutionTime(start, "Interval 3a: ")
2070 2075
         #start = Sys.time()
2071
-    
2076
+        
2072 2077
         
2073 2078
         i = 0
2074 2079
         for (thres in stepsCur) {
2075
-            i = i + 1
2080
+          i = i + 1
2081
+          
2082
+          # na.rm = TRUE for all sums here to make sure NAs will not cause a problem
2083
+          if (directionCur == "pos") {
2076 2084
             
2077
-            # na.rm = TRUE for all sums here to make sure NAs will not cause a problem
2078
-            if (directionCur == "pos") {
2079
-                
2080
-                n_tp2.vec[i]      = sum(tp>=thres, na.rm = TRUE)
2081
-                n_fp2.vec[i]      = sum(fp>=thres, na.rm = TRUE)
2082
-                n_fp2_orig.vec[i] = sum(fp_orig>=thres, na.rm = TRUE)
2083
-                
2084
-            } else {
2085
-                
2086
-                n_tp2.vec[i]      = sum(tp<thres, na.rm = TRUE)
2087
-                n_fp2.vec[i]      = sum(fp<thres, na.rm = TRUE)
2088
-                n_fp2_orig.vec[i] = sum(fp_orig<thres, na.rm = TRUE)
2089
-            }
2085
+            n_tp2.vec[i]      = sum(tp>=thres, na.rm = TRUE)
2086
+            n_fp2.vec[i]      = sum(fp>=thres, na.rm = TRUE)
2087
+            n_fp2_orig.vec[i] = sum(fp_orig>=thres, na.rm = TRUE)
2088
+            
2089
+          } else {
2090 2090
             
2091
+            n_tp2.vec[i]      = sum(tp<thres, na.rm = TRUE)
2092
+            n_fp2.vec[i]      = sum(fp<thres, na.rm = TRUE)
2093
+            n_fp2_orig.vec[i] = sum(fp_orig<thres, na.rm = TRUE)
2094
+          }
2095
+          
2091 2096
         }
2092 2097
         
2093 2098
         #.printExecutionTime(start, "Interval 3b_new: ")
... ...
@@ -2097,30 +2102,30 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2097 2102
         # The maximum number is then identical to the maximum for the true positives
2098 2103
         n_fp2_norm.vec   = (n_fp2.vec/(n_fp/n_tp))
2099 2104
         n_fp2_orig_norm.vec = (n_fp2_orig.vec/(n_fp_orig/n_tp))
2100
-
2105
+        
2101 2106
         #TODO: Decide for a variant.  +1 for raw or unnormalized values?
2102
-
2107
+        
2103 2108
         
2104 2109
         fdr.curve = tibble::tibble( 
2105
-                            TF_peak.r_bin2  = stepsCur,
2106
-                            tpvalue = n_tp2.vec, 
2107
-
2108
-                            fpvalue = n_fp2.vec,
2109
-                            fpvalue_orig = n_fp2_orig.vec,
2110
-                            
2111
-                            fpvalue_norm = n_fp2_norm.vec,
2112
-                            fpvalue_norm_orig = n_fp2_orig_norm.vec,
2113
-                            
2114
-                            TF_peak.fdr_orig   = (n_fp2_orig_norm.vec) / (n_fp2_orig_norm.vec + n_tp2.vec),
2115
-                            TF_peak.fdr     = (n_fp2_norm.vec) / (n_fp2_norm.vec + n_tp2.vec),
2116
-                            # TF_peak.fdr_orig   = (n_fp2_orig_norm.vec + 1) / (n_fp2_orig_norm.vec + n_tp2.vec + 1),
2117
-                            # TF_peak.fdr     = (n_fp2_norm.vec +1) / (n_fp2_norm.vec + n_tp2.vec + 1),
2118
-                            # 
2119
-                            TF_peak.fdr_direction     = directionCur,
2120
-                            TF_peak.r_bin      = 
2121
-                              as.character(cut(TF_peak.r_bin2, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
2110
+          TF_peak.r_bin2  = stepsCur,
2111
+          tpvalue = n_tp2.vec, 
2112
+          
2113
+          fpvalue = n_fp2.vec,
2114
+          fpvalue_orig = n_fp2_orig.vec,
2115
+          
2116
+          fpvalue_norm = n_fp2_norm.vec,
2117
+          fpvalue_norm_orig = n_fp2_orig_norm.vec,
2118
+          
2119
+          TF_peak.fdr_orig   = (n_fp2_orig_norm.vec) / (n_fp2_orig_norm.vec + n_tp2.vec),
2120
+          TF_peak.fdr     = (n_fp2_norm.vec) / (n_fp2_norm.vec + n_tp2.vec),
2121
+          # TF_peak.fdr_orig   = (n_fp2_orig_norm.vec + 1) / (n_fp2_orig_norm.vec + n_tp2.vec + 1),
2122
+          # TF_peak.fdr     = (n_fp2_norm.vec +1) / (n_fp2_norm.vec + n_tp2.vec + 1),
2123
+          # 
2124
+          TF_peak.fdr_direction     = directionCur,
2125
+          TF_peak.r_bin      = 
2126
+            as.character(cut(TF_peak.r_bin2, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
2122 2127
         ) 
2123
-
2128
+        
2124 2129
         # Derive connection summaries for all TF for both directions
2125 2130
         # Get no. of connections per bin, here make sure to also include that have n = 0
2126 2131
         
... ...
@@ -2159,10 +2164,10 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2159 2164
           dplyr::rename(n_tp = tpvalue, n_fp = fpvalue, n_fp_norm = fpvalue_norm) %>%
2160 2165
           dplyr::distinct() %>%
2161 2166
           dplyr::arrange(TF_peak.r_bin)
2162
-
2163
-      
2167
+        
2168
+        
2164 2169
         # Collect data for additional QC plots before they are filtered
2165
-            
2170
+        
2166 2171
         # Filter now high FDR connections to save space and time
2167 2172
         # DISCARD other rows altogether
2168 2173
         # Left join here is what we want, as we need this df only for "real" data
... ...
@@ -2171,81 +2176,81 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2171 2176
           dplyr::select(TF.name, TF_peak.r_bin, TF_peak.r, TF_peak.fdr, 
2172 2177
                         TF_peak.fdr_orig, peak.ID, TF_peak.fdr_direction, 
2173 2178
                         TF_peak.connectionType, tidyselect::contains("value"))
2174
-
2175
-
2179
+        
2180
+        
2176 2181
         if (!plotDetails) {
2177
-            tblFilt.df = dplyr::select(tblFilt.df, -tidyselect::contains("value"))
2182
+          tblFilt.df = dplyr::select(tblFilt.df, -tidyselect::contains("value"))
2178 2183
         }
2179 2184
         
2180 2185
         connections_all.l[[indexStr]] = tblFilt.df
2181 2186
         
2182 2187
         #.printExecutionTime(start, "Interval 4: ")
2183 2188
         #start = Sys.time()
2184
-
2189
+        
2185 2190
       } # end for directionCur in c("pos", "neg")
2186
-
2191
+      
2187 2192
     } # end for each TF
2188
-
2193
+    
2189 2194
     .printExecutionTime(start2, prefix = "  ")
2190 2195
     
2191 2196
   } # end for each connectionType
2192 2197
   
2193
-
2198
+  
2194 2199
   .printExecutionTime(start)
2195 2200
   list(main            = data.table::rbindlist(connections_all.l), 
2196 2201
        connectionStats = data.table::rbindlist(connectionStats_all.l), 
2197 2202
        plots_GC        = plots_GC.l
2198
-       )
2203
+  )
2199 2204
 }
2200 2205
 
2201 2206
 .findMaxBackgroundSize <- function (GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage = 0.05) {
2207
+  
2208
+  # Iterate over different background sizes
2209
+  minPerc = 0
2210
+  for (percCur in c(seq(100,10,-5),5)) {
2202 2211
     
2203
-    # Iterate over different background sizes
2204
-    minPerc = 0
2205
-    for (percCur in c(seq(100,10,-5),5)) {
2206
-        
2207
-        if (minPerc > 0) next
2208
-        targetNoPeaks = percCur/100 * nrow(peaksBackground)
2209
-        
2210
-        #futile.logger::flog.info(paste0("Percentage of background: ", percCur))
2211
-        
2212
-        #Check for each GC bin in the foreground, starting with the most abundant, whether we have enough background peaks to sample from
2213
-        
2214
-        # threshold_percentage: From which percentage of GC bin frequency from the foreground should the mimicking fail?
2215
-        #The motivation is that very small bins that have no weight in the foreground will not cause a failure of the mimicking
2216
-
2217
-
2218
-        for (i in seq_len(nrow(GC_classes_foreground.df))) {
2219
-            
2220
-            n_rel    = GC_classes_foreground.df$n_rel[i]
2221
-            GC_class.cur = GC_classes_foreground.df$GC_class[i]
2222
-            
2223
-            requiredNoPeaks = round(n_rel * targetNoPeaks, 0)
2224
-            # Check in background
2225
-            availableNoPeaks = GC_classes_background.df %>% 
2226
-                dplyr::filter(GC_class == GC_class.cur) %>%
2227
-                dplyr::pull(.data$n)
2228
-            #futile.logger::flog.info(paste0(" GC.class ", GC.class.cur, ": Required: ", requiredNoPeaks, ", available: ", availableNoPeaks))
2229
-            if ( availableNoPeaks < requiredNoPeaks) {
2230
-                #futile.logger::flog.info(paste0("  Not enough"))
2231
-            }
2232
-            if (availableNoPeaks < requiredNoPeaks & n_rel > threshold_percentage) {
2233
-                #futile.logger::flog.info(paste0(" Mimicking distribution FAILED (GC class ", GC.class.cur, " could not be mimicked"))
2234
-                break
2235
-            }
2236
-            
2237
-            if (i == nrow(GC_classes_foreground.df)) {
2238
-                minPerc = percCur
2239
-                #futile.logger::flog.info(paste0("Found max. percentage of background that is able to mimick the foreground: ", percCur))
2240
-                
2241
-            }
2242
-            
2243
-        }  # end of  for (i in 1:nrow(GC_classes_foreground.df)) {
2244
-        
2212
+    if (minPerc > 0) next
2213
+    targetNoPeaks = percCur/100 * nrow(peaksBackground)
2214
+    
2215
+    #futile.logger::flog.info(paste0("Percentage of background: ", percCur))
2216
+    
2217
+    #Check for each GC bin in the foreground, starting with the most abundant, whether we have enough background peaks to sample from
2218
+    
2219
+    # threshold_percentage: From which percentage of GC bin frequency from the foreground should the mimicking fail?
2220
+    #The motivation is that very small bins that have no weight in the foreground will not cause a failure of the mimicking
2221
+    
2222
+    
2223
+    for (i in seq_len(nrow(GC_classes_foreground.df))) {
2224
+      
2225
+      n_rel    = GC_classes_foreground.df$n_rel[i]
2226
+      GC_class.cur = GC_classes_foreground.df$GC_class[i]
2227
+      
2228
+      requiredNoPeaks = round(n_rel * targetNoPeaks, 0)
2229
+      # Check in background
2230
+      availableNoPeaks = GC_classes_background.df %>% 
2231
+        dplyr::filter(GC_class == GC_class.cur) %>%
2232
+        dplyr::pull(.data$n)
2233
+      #futile.logger::flog.info(paste0(" GC.class ", GC.class.cur, ": Required: ", requiredNoPeaks, ", available: ", availableNoPeaks))
2234
+      if ( availableNoPeaks < requiredNoPeaks) {
2235
+        #futile.logger::flog.info(paste0("  Not enough"))
2236
+      }
2237
+      if (availableNoPeaks < requiredNoPeaks & n_rel > threshold_percentage) {
2238
+        #futile.logger::flog.info(paste0(" Mimicking distribution FAILED (GC class ", GC.class.cur, " could not be mimicked"))
2239
+        break
2240
+      }
2241
+      
2242
+      if (i == nrow(GC_classes_foreground.df)) {
2243
+        minPerc = percCur
2244
+        #futile.logger::flog.info(paste0("Found max. percentage of background that is able to mimick the foreground: ", percCur))
2245 2245
         
2246
-    } # end for all percentages
2246
+      }
2247
+      
2248
+    }  # end of  for (i in 1:nrow(GC_classes_foreground.df)) {
2247 2249
     
2248
-    minPerc
2250
+    
2251
+  } # end for all percentages
2252
+  
2253
+  minPerc
2249 2254
 }
2250 2255
 
2251 2256
 
... ...
@@ -2271,7 +2276,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
2271 2276
 #' @examples 
2272 2277
 #' # See the Workflow vignette on the GRaNIE website for examples
2273 2278
 #' GRN = loadExampleObject()
2274
-#' GRN = addConnections_peak_gene(GRN, promoterRange = 10000, nCores = 2, forceRerun = FALSE)
2279
+#' GRN = addConnections_peak_gene(GRN, promoterRange = 10000, outputFolder = ".")
2275 2280
 addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "pearson",
2276 2281
                                      promoterRange = 250000, TADs = NULL,
2277 2282
                                      nCores = 4, 
... ...
@@ -2282,7 +2287,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2282 2287
                                      forceRerun = FALSE) {
2283 2288
   
2284 2289
   GRN = .addFunctionLogToObject(GRN)
2285
-    
2290
+  
2286 2291
   checkmate::assertClass(GRN, "GRN")
2287 2292
   checkmate::assertChoice(overlapTypeGene, c("TSS", "full"))
2288 2293
   checkmate::assertChoice(corMethod, c("pearson", "spearman"))
... ...
@@ -2297,58 +2302,58 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2297 2302
   
2298 2303
   # As this is independent of the underlying GRN, it has to be done only once
2299 2304
   
2300
-    if (is.null(GRN@connections$peak_genes[["0"]]) | forceRerun) {
2305
+  if (is.null(GRN@connections$peak_genes[["0"]]) | forceRerun) {
2306
+    
2307
+    GRN@config$parameters$promoterRange = promoterRange
2308
+    GRN@config$parameters$corMethod_peak_gene = corMethod
2309
+    
2310
+    
2311
+    GRN@connections$peak_genes = list()
2312
+    
2313
+    
2314
+    # Check which gene types are available for the particular genome annotation
2315
+    # Use all of them to collect statistics. Filtering can be done later
2316
+    gene.types = unique(GRN@annotation$genes$gene.type)
2317
+    
2318
+    
2319
+    for (permutationCur in 0:.getMaxPermutation(GRN)) {
2301 2320
       
2302
-      GRN@config$parameters$promoterRange = promoterRange
2303
-      GRN@config$parameters$corMethod_peak_gene = corMethod
2321
+      futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
2304 2322
       
2305
-
2306
-      GRN@connections$peak_genes = list()
2307
-
2323
+      if (permutationCur == 0) {
2324
+        futile.logger::flog.info(paste0("Calculate peak-gene correlations for neighborhood size ", promoterRange))
2325
+        randomizePeakGeneConnections = FALSE
2326
+      } else {
2327
+        futile.logger::flog.info(paste0("Calculate random peak-gene correlations for neighborhood size ", promoterRange))
2328
+        randomizePeakGeneConnections = TRUE
2329
+      }
2308 2330
       
2309
-      # Check which gene types are available for the particular genome annotation
2310
-      # Use all of them to collect statistics. Filtering can be done later
2311
-      gene.types = unique(GRN@annotation$genes$gene.type)
2331
+      GRN@connections$peak_genes[[as.character(permutationCur)]] = 
2332
+        .calculatePeakGeneCorrelations(GRN, permutationCur,
2333
+                                       TADs = TADs,
2334
+                                       neighborhoodSize = promoterRange,
2335
+                                       gene.types = names(gene.types),
2336
+                                       corMethod = corMethod,
2337
+                                       randomizePeakGeneConnections = randomizePeakGeneConnections,
2338
+                                       overlapTypeGene,
2339
+                                       nCores = nCores,
2340
+                                       debugMode_nPlots = 0,
2341
+                                       addRobustRegression = addRobustRegression
2342
+        )
2312 2343
       
2313
-
2314
-      for (permutationCur in 0:.getMaxPermutation(GRN)) {
2315
-        
2316
-        futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
2317
-        
2318
-        if (permutationCur == 0) {
2319
-          futile.logger::flog.info(paste0("Calculate peak-gene correlations for neighborhood size ", promoterRange))
2320
-          randomizePeakGeneConnections = FALSE
2321
-        } else {
2322
-          futile.logger::flog.info(paste0("Calculate random peak-gene correlations for neighborhood size ", promoterRange))
2323
-          randomizePeakGeneConnections = TRUE
2324
-        }
2325
-
2326
-        GRN@connections$peak_genes[[as.character(permutationCur)]] = 
2327
-          .calculatePeakGeneCorrelations(GRN, permutationCur,
2328
-                                        TADs = TADs,
2329
-                                        neighborhoodSize = promoterRange,
2330
-                                        gene.types = names(gene.types),
2331
-                                        corMethod = corMethod,
2332
-                                        randomizePeakGeneConnections = randomizePeakGeneConnections,
2333
-                                        overlapTypeGene,
2334
-                                        nCores = nCores,
2335
-                                        debugMode_nPlots = 0,
2336
-                                        addRobustRegression = addRobustRegression
2337
-          )
2338
-        
2339
-      }
2340
-
2341
-    } 
2344
+    }
2345
+    
2346
+  } 
2342 2347
   
2343 2348
   if (plotDiagnosticPlots) {
2344 2349
     
2345 2350
     plotDiagnosticPlots_peakGene(GRN, outputFolder, gene.types = plotGeneTypes, useFiltered = FALSE, forceRerun= forceRerun)
2346
-
2351
+    
2347 2352
   }
2348 2353
   
2349 2354
   GRN
2350
-   
2351
-
2355
+  
2356
+  
2352 2357
   
2353 2358
 }
2354 2359
 
... ...
@@ -2497,7 +2502,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2497 2502
     nCores = 1
2498 2503
   }
2499 2504
   
2500
-
2505
+  
2501 2506
   #consensusPeaks = .createDF_fromCoordinates(getCounts(GRN, type = "peaks", norm = TRUE, 0)$peakID, "peakID")
2502 2507
   
2503 2508
   consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
... ...
@@ -2505,7 +2510,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2505 2510
   if (!is.null(TADs)) {
2506 2511
     
2507 2512
     futile.logger::flog.info(paste0("Integrate Hi-C data and overlap peaks and HI-C domains"))  
2508
-
2513
+    
2509 2514
     # Check format
2510 2515
     checkmate::assertCharacter(TADs$X1)
2511 2516
     checkmate::assertIntegerish(TADs$X2, lower = 1)
... ...
@@ -2518,7 +2523,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2518 2523
     } else {
2519 2524
       colnames(TADs)[4] = "ID"
2520 2525
     }
2521
-
2526
+    
2522 2527
     
2523 2528
     # Construct GRanges
2524 2529
     query   = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
... ...
@@ -2549,10 +2554,10 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2549 2554
     
2550 2555
     # Finally, do the actual overlaps
2551 2556
     overlapsAll = GenomicRanges::findOverlaps(query, subject, 
2552
-                               minoverlap = 1,
2553
-                               type = "any",
2554
-                               select = "all",
2555
-                               ignore.strand = TRUE)
2557
+                                              minoverlap = 1,
2558
+                                              type = "any",
2559
+                                              select = "all",
2560
+                                              ignore.strand = TRUE)
2556 2561
     
2557 2562
     
2558 2563
     query_row_ids  = S4Vectors::queryHits(overlapsAll)
... ...
@@ -2613,7 +2618,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2613 2618
   }
2614 2619
   
2615 2620
   # ITERATE THROUGH ALL PEAK_GENE PAIRS
2616
-
2621
+  
2617 2622
   permIndex = as.character(perm)
2618 2623
   
2619 2624
   countsPeaks.clean = dplyr::select(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE), -peakID)
... ...
@@ -2641,7 +2646,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2641 2646
     nCores = 1
2642 2647
   }
2643 2648
   
2644
-
2649
+  
2645 2650
   res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = NULL, iteration = 0:startIndexMax, verbose = FALSE, functionName = .correlateData, 
2646 2651
                              chunksize = chunksize, maxRow = maxRow, 
2647 2652
                              counts1 = countsPeaks.clean, counts2 = countsRNA.clean, map1 = map_peaks, map2 = map_rna, 
... ...
@@ -2671,20 +2676,20 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
2671 2676
                               dplyr::left_join(overlaps.sub.filt.df, by = c("gene.ENSEMBL", "peak.ID")) %>%
2672 2677
                               # Integrate TAD IDs also
2673 2678
                               dplyr::left_join(dplyr::select(peak.TADs.df, peak.ID, tad.ID), by = c("peak.ID")) %>%
2674
-
2679
+                              
2675 2680
                               dplyr::select(tidyselect::all_of(selectColumns))) %>%
2676 2681
     dplyr::mutate(peak.ID = as.factor(peak.ID),
2677 2682
                   gene.ENSEMBL = as.factor(gene.ENSEMBL), 
2678 2683
                   tad.ID = as.factor(tad.ID)) %>%
2679 2684
     dplyr::rename(peak_gene.r = r, 
2680 2685
                   peak_gene.p_raw = p.raw)
2681
-    
2686
+  
2682 2687
   if (addRobustRegression) {
2683