Browse code

Documentation updates

Christian Arnold authored on 29/11/2022 17:53:49
Showing1 changed files
... ...
@@ -685,7 +685,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors
685 685
                   peak.ID = query$peakID,
686 686
                   peak.GC.class = cut(.data$`G|C`, breaks = seq(0,1,1/nBins), include.lowest = TRUE, ordered_result = TRUE)) %>%
687 687
     dplyr::rename(peak.GC.perc    = .data$`G|C`) %>%
688
-    dplyr::select(peak.ID, everything())
688
+    dplyr::select(.data$peak.ID, tidyselect::everything())
689 689
   
690 690
 
691 691
   .printExecutionTime(start)
... ...
@@ -819,7 +819,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors
819 819
             
820 820
             futile.logger::flog.info(paste0("  Using the csaw-derived TMM-derived normalization factors as size factors, overriding the DESeq-default size factors."))
821 821
             
822
-            sizeFactors(dd) <- sizeFactors
822
+            DESeq2::sizeFactors(dd) <- sizeFactors
823 823
         }
824 824
         
825 825
         dataNorm = DESeq2::counts(dd, normalized=TRUE)
... ...
@@ -877,8 +877,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors
877 877
         # (2) effects related to between-lane distributional differences, e.g., sequencing depth. 
878 878
         # Accordingly, withinLaneNormalization and betweenLaneNormalization adjust for the first and second type of effects, respectively. 
879 879
         # We recommend to normalize for within-lane effects prior to between-lane normalization.
880
-        dataWithin <- withinLaneNormalization(data, y = peaks_GC_fraction, which= withinLane_method, num.bins = nBins, round = roundResults)
881
-        dataNorm <- betweenLaneNormalization(dataWithin, which=betweenLane_method, round = roundResults)
880
+        dataWithin <- EDASeq::withinLaneNormalization(data, y = peaks_GC_fraction, which= withinLane_method, num.bins = nBins, round = roundResults)
881
+        dataNorm <- EDASeq::betweenLaneNormalization(dataWithin, which=betweenLane_method, round = roundResults)
882 882
         
883 883
         
884 884
     } else if (normalization == "gcqn_peaks") {
Browse code

Bugfixes

Christian Arnold authored on 29/11/2022 12:33:46
Showing1 changed files
... ...
@@ -140,7 +140,7 @@ initializeGRN <- function(objectMetadata = list(),
140 140
 #' @param counts_peaks Data frame. No default. Counts for the peaks, with raw or normalized counts for each peak (rows) across all samples (columns). 
141 141
 #' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. 
142 142
 #' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
143
-#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactor}. Normalization procedure for peak data. 
143
+#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactors}. Normalization procedure for peak data. 
144 144
 #' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, 
145 145
 #' \code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}.
146 146
 #' @param idColumn_peaks Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. 
... ...
@@ -149,7 +149,7 @@ initializeGRN <- function(objectMetadata = list(),
149 149
 #' @param counts_rna Data frame. No default. Counts for the RNA-seq data, with raw or normalized counts for each gene (rows) across all samples (columns). 
150 150
 #' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_rna} below. 
151 151
 #' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
152
-#' @param normalization_rna Character. Default \code{quantile}. Normalization procedure for peak data. 
152
+#' @param normalization_rna Character. Default \code{limma_quantile}. Normalization procedure for peak data. 
153 153
 #' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}.
154 154
 #' @param idColumn_RNA Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs.
155 155
 #' @param sampleMetadata Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. 
... ...
@@ -176,8 +176,8 @@ initializeGRN <- function(objectMetadata = list(),
176 176
 #' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise
177 177
 #' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE)
178 178
 
179
-addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor", idColumn_peaks = "peakID", 
180
-                    counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
179
+addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors", idColumn_peaks = "peakID", 
180
+                    counts_rna, normalization_rna = "limma_quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
181 181
                     additionalParams.l = list(),
182 182
                     allowOverlappingPeaks= FALSE,
183 183
                     keepOriginalReadCounts = FALSE,
... ...
@@ -389,10 +389,15 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor"
389 389
     futile.logger::flog.info(paste0("Adding peak and gene annotation..."))
390 390
     
391 391
     GRN = .populatePeakAnnotation(GRN)
392
-    GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
393 392
     
394
-    # Additional GC statistics, not used at the moment currently
395
-    GRN = .calcAdditionalGCStatistics(GRN, GC.data.df)
393
+    if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) {
394
+        GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
395
+        
396
+        # Additional GC statistics, not used at the moment currently
397
+        GRN = .calcAdditionalGCStatistics(GRN, GC.data.df)
398
+    }
399
+    
400
+
396 401
 
397 402
     GRN = .populateGeneAnnotation(GRN)
398 403
     
Browse code

reduced package burden

Christian Arnold authored on 22/11/2022 10:58:28
Showing1 changed files
... ...
@@ -36,7 +36,6 @@ initializeGRN <- function(objectMetadata = list(),
36 36
   checkmate::assert(checkmate::checkNull(objectMetadata), checkmate::checkList(objectMetadata))
37 37
   checkmate::assertChoice(genomeAssembly, c("hg19","hg38", "mm9", "mm10"))
38 38
 
39
-  .checkAndLoadPackagesGenomeAssembly(genomeAssembly)
40 39
   
41 40
   # Create the folder first if not yet existing
42 41
   checkmate::assertCharacter(outputFolder, min.chars = 1, len = 1)
... ...
@@ -123,7 +122,7 @@ initializeGRN <- function(objectMetadata = list(),
123 122
   GRN
124 123
 }
125 124
 
126
-######## Add and filter data ########
125
+######## Add data and annotation ########
127 126
 
128 127
 #' Add data to a \code{\linkS4class{GRN}} object.
129 128
 #' 
... ...
@@ -132,14 +131,18 @@ initializeGRN <- function(objectMetadata = list(),
132 131
 #' 
133 132
 #' If the \code{ChIPseeker} package is installed, additional peak annotation is provided in the annotation slot and a peak annotation QC plot is produced as part of peak-gene QC.
134 133
 #' This is fully optional, however, and has no consequences for downstream functions.
134
+#' Normalizing the data sensibly is very important. When \code{quantile}is chose, \code{limma::normalizeQuantiles} is used, which in essence does the following: 
135
+#' Each quantile of each column is set to the mean of that quantile across arrays. The intention is to make all the normalized columns have the same empirical distribution. 
136
+#' This will be exactly true if there are no missing values and no ties within the columns: the normalized columns are then simply permutations of one another.
135 137
 #' 
136 138
 #' @export
137 139
 #' @template GRN 
138 140
 #' @param counts_peaks Data frame. No default. Counts for the peaks, with raw or normalized counts for each peak (rows) across all samples (columns). 
139 141
 #' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. 
140 142
 #' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
141
-#' @param normalization_peaks Character. Default \code{DESeq_sizeFactor}. 
142
-#' Normalization procedure for peak data. Must be one of \code{DESeq_sizeFactor}, \code{none}, or \code{quantile}.
143
+#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactor}. Normalization procedure for peak data. 
144
+#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, 
145
+#' \code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}.
143 146
 #' @param idColumn_peaks Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. 
144 147
 #' The required format must be {chr}:{start}-{end}", with {chr} denoting the abbreviated chromosome name, and {start} and {end} the begin and end 
145 148
 #' of the peak coordinates, respectively. End must be bigger than start. Examples for valid peak IDs are \code{chr1:400-800} or \code{chrX:20-25}.
... ...
@@ -147,12 +150,19 @@ initializeGRN <- function(objectMetadata = list(),
147 150
 #' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_rna} below. 
148 151
 #' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
149 152
 #' @param normalization_rna Character. Default \code{quantile}. Normalization procedure for peak data. 
150
-#' Must be one of "DESeq_sizeFactor", "none", or "quantile". For "quantile", \code{limma::normalizeQuantiles} is used for normalization.
153
+#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}.
151 154
 #' @param idColumn_RNA Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs.
152 155
 #' @param sampleMetadata Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. 
153 156
 #' If provided, the @seealso [plotPCA_all()] function can then incorporate and plot it. Sample names must match with those from both peak and RNA-Seq data. The first column is expected to contain the sample IDs, the actual column name is irrelevant.
157
+#' @param additionalParams.l Named list. Default \code{list()}. Additional parameters for the chosen normalization method. 
158
+#' Currently, only the GC-aware normalization methods \code{EDASeq_GC_peaks} and \code{gcqn_peaks} are supported here. 
159
+#' Both support the parameters \code{roundResults} (logical flag, \code{TRUE} or \code{FALSE}) and \code{nBins} (Integer > 0), and \code{EDASeq_GC_peaks} supports three additional parameters:
160
+#' \code{withinLane_method} (one of: "loess","median","upper","full") and \code{betweenLane_method}  (one of: "median","upper","full"). 
161
+#' For more information, see the EDASeq vignette.
154 162
 #' @param allowOverlappingPeaks \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should overlapping peaks be allowed (then only a warning is issued 
155 163
 #' when overlapping peaks are found) or (the default) should an error be raised?
164
+#' @param keepOriginalReadCounts \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the original read counts as provided to the function be kept in addition to
165
+#' storing the rad counts after a (if any) normalization? This increases the memory footprint of the object because 2 additional count matrices have to be stored.
156 166
 #' @template forceRerun
157 167
 #' @return An updated \code{\linkS4class{GRN}} object, with added data from this function (e.g., slots \code{GRN@data$peaks} and \code{GRN@data$RNA})
158 168
 #' @seealso \code{\link{plotPCA_all}}
... ...
@@ -166,11 +176,11 @@ initializeGRN <- function(objectMetadata = list(),
166 176
 #' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise
167 177
 #' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE)
168 178
 
169
-# TODO: add a isSingleCell argument or something similar
170
-
171
-addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", idColumn_peaks = "peakID", 
179
+addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor", idColumn_peaks = "peakID", 
172 180
                     counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
181
+                    additionalParams.l = list(),
173 182
                     allowOverlappingPeaks= FALSE,
183
+                    keepOriginalReadCounts = FALSE,
174 184
                     forceRerun = FALSE) {
175 185
   
176 186
   start = Sys.time()
... ...
@@ -182,8 +192,20 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
182 192
   checkmate::assertDataFrame(counts_rna, min.rows = 1, min.cols = 2)
183 193
   checkmate::assertChoice(idColumn_peaks, colnames(counts_peaks))
184 194
   checkmate::assertChoice(idColumn_RNA, colnames(counts_rna))
185
-  checkmate::assertChoice(normalization_peaks, c("none", "DESeq_sizeFactor", "quantile"))
186
-  checkmate::assertChoice(normalization_rna, c("none", "DESeq_sizeFactor", "quantile"))  
195
+
196
+  checkmate::assertChoice(normalization_peaks, 
197
+                          choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", 
198
+                                      "csaw_cyclicLoess_orig", "csaw_TMM", 
199
+                                      "EDASeq_GC_peaks", "gcqn_peaks",
200
+                                      "DESeq2_sizeFactors", 
201
+                                      "none"))
202
+  checkmate::assertChoice(normalization_rna, 
203
+                          choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", 
204
+                                      "csaw_cyclicLoess_orig", "csaw_TMM", 
205
+                                      "DESeq2_sizeFactors", 
206
+                                      "none"))
207
+  
208
+  checkmate::assertFlag(keepOriginalReadCounts)
187 209
   checkmate::assertFlag(allowOverlappingPeaks)
188 210
   checkmate::assertFlag(forceRerun)
189 211
   
... ...
@@ -248,29 +270,49 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
248 270
       # dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5
249 271
     }
250 272
     
273
+    ## STORE COUNTS METADATA ##
251 274
     
252
-    # Normalize counts
253
-    countsPeaks.norm.df  = .normalizeCounts(counts_peaks, method = normalization_peaks, idColumn = "peakID")
254
-    countsRNA.norm.df  = .normalizeCounts(counts_rna, method = normalization_rna, idColumn = "ENSEMBL")
275
+    GRN@data$RNA$counts_metadata = tibble::tibble(ID = counts_rna$ENSEMBL, isFiltered = FALSE)
255 276
     
256
-    GRN@config$parameters$normalization_rna = normalization_rna
257
-    GRN@config$parameters$normalization_peaks = normalization_peaks
277
+    GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(counts_peaks$peakID) 
278
+    stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
258 279
     
259
-    # We have our first connection type, the default one; more may be added later
260
-    GRN@config$TF_peak_connectionTypes = "expression"
280
+    #  Calculate GC content of peaks which we need before any normalization
261 281
     
262
-    # Make sure ENSEMBL is the first column
263
-    countsRNA.norm.df = dplyr::select(countsRNA.norm.df, .data$ENSEMBL, tidyselect::everything())
264
-    countsPeaks.norm.df = dplyr::select(countsPeaks.norm.df, .data$peakID, tidyselect::everything())
282
+    if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) {
283
+        
284
+        # Now we need the genome annotation packages to calculate the GC content of the peak regions
285
+        .checkAndLoadPackagesGenomeAssembly(GRN@config$parameters$genomeAssembly)
286
+        
287
+        GC.data.df = .calcGCContentPeaks(GRN)
288
+        additionalParams.l[["GC_data"]] = GC.data.df
289
+    }
290
+   
265 291
     
266
-    samples_rna  = colnames(countsRNA.norm.df)
267
-    samples_peaks =  colnames(countsPeaks.norm.df)
268
-    allSamples =  unique(c(samples_rna, samples_peaks)) %>% setdiff(c("ENSEMBL", "isFiltered", "peakID"))
292
+    ## Normalize counts ##
293
+    countsPeaks.norm.df  = .normalizeCountMatrix(GRN, counts_peaks %>% tibble::column_to_rownames("peakID") %>% as.matrix(), 
294
+                                                 normalization = normalization_peaks,
295
+                                                 additionalParams = additionalParams.l
296
+                                            ) %>%
297
+                            as_tibble(rownames = "peakID") %>%
298
+                            dplyr::select(.data$peakID, tidyselect::everything())
269 299
     
270
-    # Subset data to retain only samples that appear in both RNA and peaks
271
-    data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df)
300
+    countsRNA.norm.df    = .normalizeCountMatrix(GRN, counts_rna %>% tibble::column_to_rownames("ENSEMBL") %>% as.matrix(), 
301
+                                                 normalization = normalization_rna,
302
+                                                 additionalParams = additionalParams.l) %>%
303
+                            as_tibble(rownames = "ENSEMBL") %>%
304
+                            dplyr::select(.data$ENSEMBL, tidyselect::everything())
272 305
     
273
-    # Generate metadata first to determine the nmberof shared samples etc
306
+
307
+    ## SAMPLE AND GRN METADATA ##
308
+    GRN@config$parameters$normalization_rna = normalization_rna
309
+    GRN@config$parameters$normalization_peaks = normalization_peaks
310
+    
311
+    samples_rna   = colnames(countsRNA.norm.df)
312
+    samples_peaks = colnames(countsPeaks.norm.df)
313
+    allSamples    = unique(c(samples_rna, samples_peaks)) %>% setdiff(c("ENSEMBL", "isFiltered", "peakID"))
314
+    
315
+    # Generate metadata first to determine the number of shared samples etc
274 316
     if (!is.null(sampleMetadata)) {
275 317
       
276 318
       futile.logger::flog.info("Parsing provided metadata...")
... ...
@@ -308,23 +350,30 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
308 350
       )
309 351
     
310 352
     GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, .data$has_both) %>% dplyr::pull(.data$sampleID) %>% as.character()
353
+    # We have our first connection type, the default one; more may be added later
354
+    GRN@config$TF_peak_connectionTypes = "expression"
311 355
     
312
-    ### COUNT MATRICES
313
-    # Store the matrices either as normal or sparse matrix
356
+    ## STORE FINAL COUNT MATRICES ##
357
+    
358
+    # Subset data to retain only samples that appear in both RNA and peaks
359
+    data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df)
314 360
     
361
+    # Store the matrices either as normal or sparse matrix
315 362
     GRN@data$peaks$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["peaks"]], ID_column = "peakID", slotName = "GRN@data$peaks$counts")
316
-
317
-    # includeFiltered = TRUE here as it doesnt make a difference and because getCounts requires counts_metadata$isFiltered to be set already
318
-    GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(rownames(GRN@data$peaks$counts)) 
319
-    stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
320 363
     
364
+    if (keepOriginalReadCounts) {
365
+        GRN@data$peaks$counts_raw = .storeAsMatrixOrSparseMatrix(GRN, df = counts_peaks %>% dplyr::select("peakID", tidyselect::one_of(GRN@config$sharedSamples)), 
366
+                                                                 ID_column = "peakID", slotName = "GRN@data$peaks$counts_raw")
367
+    }
368
+
321 369
     GRN@data$RNA$counts   = .storeAsMatrixOrSparseMatrix(GRN, df =  data.l[["RNA"]], ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts")
322
-      
323
-    GRN@data$RNA$counts_metadata = tibble::tibble(ID = data.l[["RNA"]]$ENSEMBL, isFiltered = FALSE)
324
-    
325
-    GRN@data$RNA$counts_permuted_index = sample.int(ncol(GRN@data$RNA$counts), ncol(GRN@data$RNA$counts))
326 370
     
371
+    if (keepOriginalReadCounts) {
372
+        GRN@data$RNA$counts_raw = .storeAsMatrixOrSparseMatrix(GRN, df = counts_rna %>% dplyr::select("ENSEMBL", tidyselect::one_of(GRN@config$sharedSamples)), 
373
+                                                                 ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts_raw")
374
+    }
327 375
     
376
+    GRN@data$RNA$counts_permuted_index = sample.int(ncol(GRN@data$RNA$counts), ncol(GRN@data$RNA$counts))
328 377
     
329 378
     futile.logger::flog.info(paste0( "Final dimensions of data:"))
330 379
     futile.logger::flog.info(paste0( " RNA  : ", nrow(countsRNA.norm.df)  , " x ", ncol(countsRNA.norm.df)   - 2, " (rows x columns)"))
... ...
@@ -332,45 +381,23 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
332 381
     # Create permutations for RNA
333 382
     futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts"))
334 383
     
384
+    futile.logger::flog.info(paste0("Check for overlapping peaks..."))
335 385
     
336
-    futile.logger::flog.info(paste0("Creating consensus peaks and check for overlapping peaks..."))
337
-    
338
-    # Consensus peaks
339
-
340
-    # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details
341
-    consensus.gr   = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE)
386
+    .checkOverlappingPeaks(GRN, allowOverlappingPeaks = allowOverlappingPeaks)
342 387
 
343
-    overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1)
388
+    ## PEAK AND GENE ANNOTATION ##
389
+    futile.logger::flog.info(paste0("Adding peak and gene annotation..."))
344 390
     
345
-    if (length(overlappingPeaks) > 0) {
346
-      
347
-      ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID
348
-      
349
-      messageAll = paste0(" ", length(overlappingPeaks), 
350
-                          " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","),
351
-                          ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. "
352
-      )
353
-      
354
-      
355
-      if (allowOverlappingPeaks) {
356
-        
357
-        message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.")
358
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
359
-      } else {
360
-        message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function")
361
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
362
-      }
363
-      
364
-    }
391
+    GRN = .populatePeakAnnotation(GRN)
392
+    GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
393
+    
394
+    # Additional GC statistics, not used at the moment currently
395
+    GRN = .calcAdditionalGCStatistics(GRN, GC.data.df)
396
+
397
+    GRN = .populateGeneAnnotation(GRN)
365 398
     
366 399
   }
367
-  futile.logger::flog.info(paste0("Adding peak and gene annotation..."))
368 400
   
369
-  # Add peak annotation once
370
-  GRN = .populatePeakAnnotation(GRN)
371
-  
372
-  # Add gene annotation once
373
-  GRN = .populateGeneAnnotation(GRN)
374 401
   
375 402
   .printExecutionTime(start, prefix = "")
376 403
   
... ...
@@ -378,6 +405,36 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
378 405
   
379 406
 }
380 407
 
408
+.checkOverlappingPeaks <- function (GRN, allowOverlappingPeaks) {
409
+    
410
+    # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details
411
+    consensus.gr   = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE)
412
+    
413
+    overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1)
414
+    
415
+    if (length(overlappingPeaks) > 0) {
416
+        
417
+        ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID
418
+        
419
+        messageAll = paste0(" ", length(overlappingPeaks), 
420
+                            " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","),
421
+                            ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. "
422
+        )
423
+        
424
+        
425
+        if (allowOverlappingPeaks) {
426
+            
427
+            message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.")
428
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
429
+        } else {
430
+            message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function")
431
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
432
+        }
433
+        
434
+    }
435
+}
436
+
437
+
381 438
 .storeAsMatrixOrSparseMatrix <- function (GRN, df, ID_column, slotName, threshold = 0.1) {
382 439
     
383 440
     stopifnot(identical(GRN@config$sharedSamples, colnames(df)[-1]))
... ...
@@ -497,11 +554,18 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
497 554
   
498 555
   GRN@annotation$peaks = metadata_peaks
499 556
 
500
-  if (!is.installed("ChIPseeker")) {
501
-      packageMessage = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", 
502
-                              " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
503
-      .checkPackageInstallation("ChIPseeker", packageMessage, isWarning = TRUE)
557
+  if (!is.installed("ChIPseeker") | !.checkAndLoadPackagesGenomeAssembly(GRN@config$parameters$genomeAssembly, returnLogical = TRUE)) {
558
+      if (!is.installed("ChIPseeker")) {
559
+          packageMessage = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", 
560
+                                  " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
561
+          .checkPackageInstallation("ChIPseeker", packageMessage, isWarning = TRUE)
562
+      } else {
563
+          
564
+          # annotation packages missing, message will already been thrown
565
+      }
566
+      
504 567
   } else {
568
+      
505 569
     
506 570
     futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))
507 571
     genomeAssembly = GRN@config$parameters$genomeAssembly
... ...
@@ -555,12 +619,6 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
555 619
     
556 620
   }
557 621
   
558
-  
559
-  
560
-  
561
-  # Also add GC content as annotation columns
562
-  GRN = .calcGCContentPeaks(GRN)
563
-  
564 622
   GRN
565 623
   
566 624
 }
... ...
@@ -600,7 +658,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
600 658
 }
601 659
 
602 660
 #' @importFrom rlang .data `:=`
603
-.calcGCContentPeaks <- function(GRN) {
661
+.calcGCContentPeaks <- function(GRN, nBins = 10) {
604 662
   
605 663
   futile.logger::flog.info(paste0("Calculate GC-content for peaks. This may take a while"))
606 664
   start = Sys.time()
... ...
@@ -616,58 +674,429 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
616 674
   # Get DNAStringSet object
617 675
   seqs_peaks = Biostrings::getSeq(genome, query)
618 676
   
619
-  GC_content.df = Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")
620
-  
621
-  GC_content.df = GC_content.df %>%
677
+  GC_content.df = (Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")) %>%
622 678
     tibble::as_tibble() %>%
623
-    dplyr::mutate(length = GenomicRanges::width(query),
679
+    dplyr::mutate(peak.width = GenomicRanges::width(query),
624 680
                   peak.ID = query$peakID,
625
-                  GC_class = cut(.data$`G|C`, breaks = seq(0,1,0.1), include.lowest = TRUE, ordered_result = TRUE))
626
-  
627
-  GC_classes.df = GC_content.df %>%
628
-    dplyr::group_by(.data$GC_class) %>%
629
-    dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(length), peak_width_sd = sd(length)) %>%
630
-    dplyr::ungroup() %>% 
631
-    tidyr::complete(.data$GC_class, fill = list(n = 0)) %>%
632
-    dplyr::mutate(n_rel = .data$n / length(query))
633
-  
634
-  # TODO: Put where
635
-  #ggplot2::ggplot(GC_content.df, ggplot2::aes(GC.class)) + geom_histogram(stat = "count") + ggplot2::theme_bw()
636
-  
637
-  #ggplot2::ggplot(GC_classes.df , ggplot2::aes(GC.class, n_rel)) + geom_bar(stat = "identity") + ggplot2::theme_bw()
638
-  
639
-  GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC_content.df, by = "peak.ID") %>%
640
-    dplyr::rename( peak.GC.perc    = .data$`G|C`,
641
-                   peak.width      = .data$length,
642
-                   peak.GC.class   = .data$GC_class)
643
-  
644
-  GRN@stats$peaks = list()
645
-  GRN@stats$peaks$GC = GC_classes.df
681
+                  peak.GC.class = cut(.data$`G|C`, breaks = seq(0,1,1/nBins), include.lowest = TRUE, ordered_result = TRUE)) %>%
682
+    dplyr::rename(peak.GC.perc    = .data$`G|C`) %>%
683
+    dplyr::select(peak.ID, everything())
646 684
   
685
+
647 686
   .printExecutionTime(start)
648 687
   
649
-  GRN
688
+  GC_content.df
689
+}
690
+
691
+.calcAdditionalGCStatistics <- function (GRN, GC.data) {
692
+    
693
+    GC_classes.df = GC.data %>%
694
+        dplyr::group_by(.data$peak.GC.class) %>%
695
+        dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(.data$peak.width), peak_width_sd = sd(.data$peak.width)) %>%
696
+        dplyr::ungroup() %>% 
697
+        tidyr::complete(.data$peak.GC.class, fill = list(n = 0)) %>%
698
+        dplyr::mutate(n_rel = .data$n / length(query))
699
+    
700
+    # TODO: Put where
701
+    #ggplot2::ggplot(GC.data, ggplot2::aes(GC.class)) + geom_histogram(stat = "count") + ggplot2::theme_bw()
702
+    
703
+    #ggplot2::ggplot(GC_classes.df , ggplot2::aes(GC.class, n_rel)) + geom_bar(stat = "identity") + ggplot2::theme_bw()
704
+    
705
+    
706
+    GRN@stats$peaks = list()
707
+    GRN@stats$peaks$GC = GC_classes.df
708
+    
709
+    GRN
650 710
 }
651 711
 
712
+
713
+####### FILTER AND NORMALIZE DATA ############
714
+
715
+.normalizeCountMatrix <- function(GRN, data, normalization, additionalParams =list()) {
716
+    
717
+    checkmate::assertMatrix(data)
718
+    checkmate::assertChoice(normalization, 
719
+                            choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", 
720
+                                        "csaw_cyclicLoess_orig", "csaw_TMM", 
721
+                                        "EDASeq_GC_peaks", "gcqn_peaks",
722
+                                        "DESeq2_sizeFactors",
723
+                                        "none"))
724
+    
725
+    # Create a DESeq2 object
726
+    if (normalization ==  "DESeq2_sizeFactors" | normalization == "csaw_cyclicLoess_orig" | normalization == "csaw_TMM") {
727
+
728
+        dd <- suppressMessages(DESeq2::DESeqDataSetFromMatrix(countData = data,
729
+                                             colData = data.frame( sampleID = colnames(data)),
730
+                                             design = stats::as.formula(" ~ 1")))
731
+    }
732
+    
733
+    # Common parameters
734
+    if (normalization ==  "EDASeq_GC_peaks" | normalization == "gcqn_peaks") {
735
+        
736
+        # After either within-lane or between-lane normalization, the expression values are not counts anymore. 
737
+        # However, their distribution still shows some typical features of counts distribution (e.g., the variance depends on the mean). 
738
+        # Hence, for most applications, it is useful to round the normalized values to recover count-like values, which we refer to as “pseudo-counts”.
739
+        # By default, both withinLaneNormalization and betweenLaneNormalization round the normalized values to the closest integer. 
740
+        # This behavior can be changed by specifying round=FALSE. This gives the user more flexibility and assures that rounding approximations do not affect subsequent computations (e.g., recovering the offset from the normalized counts).
741
+        if ("roundResults" %in% names(additionalParams)) {
742
+            roundResults = additionalParams$roundResults
743
+        } else {
744
+            roundResults = FALSE
745
+        }
746
+        
747
+        checkmate::assertFlag(roundResults)
748
+        
749
+        if ("GC_data" %in% names(additionalParams)) {
750
+            GC_data.df = additionalParams$GC_data
751
+        } else {
752
+            message = "GC_data is missing in additionalParams list."
753
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
754
+        }
755
+
756
+    }
757
+    
758
+    if (normalization == "limma_cyclicloess" | normalization == "limma_quantile" | normalization == "limma_scale") {
759
+        
760
+        futile.logger::flog.info(paste0(" Normalizing data using the package limma with the following method: ", normalization))
761
+
762
+        dataNorm = limma::normalizeBetweenArrays(data, method= sub("limma_", replacement = "", normalization))  
763
+        
764
+    } else if (normalization == "csaw_cyclicLoess_orig" | normalization == "csaw_TMM")   {
765
+        
766
+        futile.logger::flog.info(paste0(" Normalizing data using the package csaw with the following method: ", normalization))
767
+        packageMessage = paste0("The package csaw is currently not installed, but however needed for the normalization methods \"csaw_cyclicLoess_orig\" and \"csaw_TMM\"")
768
+        .checkPackageInstallation("csaw", packageMessage, isWarning = TRUE)
769
+        
770
+        if (packageVersion("csaw") <= "1.14.1") {
771
+            message = "The version of the csaw package is too old, install at least version 1.14.1 or change the normalization method"
772
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
773
+        }
774
+
775
+        object = SummarizedExperiment::SummarizedExperiment(list(counts=data))
776
+        object$totals = colSums(data)
777
+        
778
+        if (normalization == "csaw_cyclicLoess_orig") {
779
+            
780
+            # Perform a cyclic loess normalization
781
+            # We use a slighlty more complicated setup to derive size factors for library normalization
782
+            # Instead of just determining the size factors in DeSeq2 via cirtual samples, we use 
783
+            # a normalization from the csaw package (see https://www.rdocumentation.org/packages/csaw/versions/1.6.1/topics/normOffsets)
784
+            # and apply a non-linear normalization. 
785
+            # For each sample, a lowess curve is fitted to the log-counts against the log-average count. 
786
+            # The fitted value for each bin pair is used as the generalized linear model offset for that sample. 
787
+            # The use of the average count provides more stability than the average log-count when low counts are present for differentially bound regions.
788
+            
789
+            # since counts returns,by default, non-normalized counts, the following code should be fine and there is no need to also run estimateSizeFactors beforehand
790
+ 
791
+            normFacs  = csaw::normOffsets(object, se.out = FALSE)
792
+            
793
+            # the normalization factors matrix should not have 0's in it
794
+            # it should have geometric mean near 1 for each row
795
+            # exp(mean(log(normFacs[i,]))) for each row i
796
+            normFacs <- normFacs / exp(rowMeans(log(normFacs)))
797
+            
798
+            rownames(normFacs) = rownames(data)
799
+            colnames(normFacs) = colnames(data)
800
+            
801
+            futile.logger::flog.info(paste0("  Using the csaw-derived feature-specific normalization factors for DESeq, which will preempt sizeFactors"))
802
+            
803
+            
804
+            DESeq2::normalizationFactors(dd) <- normFacs
805
+            
806
+        
807
+        } else { # TMM
808
+            
809
+            # This function uses the trimmed mean of M-values (TMM) method to remove composition biases, typically in background regions of the genome. 
810
+            # The key difference from standard TMM is that precision weighting is turned off by default so as to avoid upweighting high-abundance regions. 
811
+            # These are more likely to be bound and thus more likely to be differentially bound. 
812
+            # Assigning excessive weight to such regions will defeat the purpose of trimming when normalizing the coverage of background regions.
813
+            sizeFactors  = csaw::normFactors(object, se.out = FALSE)
814
+            
815
+            futile.logger::flog.info(paste0("  Using the csaw-derived TMM-derived normalization factors as size factors, overriding the DESeq-default size factors."))
816
+            
817
+            sizeFactors(dd) <- sizeFactors
818
+        }
819
+        
820
+        dataNorm = DESeq2::counts(dd, normalized=TRUE)
821
+        
822
+    } else if (normalization == "EDASeq_GC_peaks") {
823
+        
824
+        packageMessage = paste0("The package EDASeq is currently not installed, but however needed for the normalization methods \"EDASeq_GC_peaks\"")
825
+        .checkPackageInstallation("EDASeq", packageMessage, isWarning = TRUE)
826
+        
827
+        futile.logger::flog.info(paste0(" Normalizing data using the package EDASeq with the following method: ", normalization, " with the GC content as covariate."))
828
+        # https://bioconductor.org/packages/release/bioc/vignettes/EDASeq/inst/doc/EDASeq.html#normalization
829
+
830
+        if ("nBins" %in% names(additionalParams)) {
831
+            nBins = additionalParams$nBins
832
+        } else {
833
+            nBins = 10 
834
+        }
835
+
836
+        # We implemented four within-lane normalization methods, namely: 
837
+        # 1. loess robust local regression of read counts (log) on a gene feature such as GC-content (loess),
838
+        # 2. global-scaling between feature strata using the median (median), 
839
+        # 3. global-scaling between feature strata using the upper-quartile (upper), 
840
+        # 4. and full-quantile normalization between feature strata (full). 
841
+        # For a discussion of these methods in context of GC-content normalization see (Risso et al. 2011).
842
+        if ("withinLane_method" %in% names(additionalParams)) {
843
+            withinLane_method = additionalParams$withinLane_method
844
+        } else {
845
+            withinLane_method = "full"
846
+        }
847
+        
848
+        # Regarding between-lane normalization, the package implements three of the methods introduced in (Bullard et al. 2010): 
849
+        # global-scaling using the median (median), global-scaling using the upper-quartile (upper), and full-quantile normalization (full).
850
+        if ("betweenLane_method" %in% names(additionalParams)) {
851
+            betweenLane_method = additionalParams$withinLane_method
852
+        } else {
853
+            betweenLane_method = "full"
854
+        }
855
+        
856
+        peaks_GC_fraction = GC_data.df$peak.GC.perc
857
+ 
858
+        
859
+        futile.logger::flog.info(paste0(" Using the following additional parameters for EDASeq: nBins = ", nBins, 
860
+                                        ", withinLane_method = ", withinLane_method, 
861
+                                        ", betweenLane_method = ", betweenLane_method,
862
+                                        ", roundResults = ", roundResults,
863
+                                        " as well as the automatically calculated peak GC content as covariate for withinLaneNormalization"))
864
+        
865
+        checkmate::assertNumeric(peaks_GC_fraction, lower = 0, upper = 1)
866
+        checkmate::assertIntegerish(nBins, lower = 1, upper = 100)
867
+        checkmate::assertChoice(withinLane_method, choices = c("loess","median","upper","full"))
868
+        checkmate::assertChoice(betweenLane_method, choices = c("median","upper","full"))
869
+        
870
+        # Following (Risso et al. 2011), we consider two main types of effects on gene-level counts: 
871
+        # (1) within-lane gene-specific (and possibly lane-specific) effects, e.g., related to gene length or GC-content, and 
872
+        # (2) effects related to between-lane distributional differences, e.g., sequencing depth. 
873
+        # Accordingly, withinLaneNormalization and betweenLaneNormalization adjust for the first and second type of effects, respectively. 
874
+        # We recommend to normalize for within-lane effects prior to between-lane normalization.
875
+        dataWithin <- withinLaneNormalization(data, y = peaks_GC_fraction, which= withinLane_method, num.bins = nBins, round = roundResults)
876
+        dataNorm <- betweenLaneNormalization(dataWithin, which=betweenLane_method, round = roundResults)
877
+        
878
+        
879
+    } else if (normalization == "gcqn_peaks") {
880
+       
881
+        futile.logger::flog.info(paste0(" Normalizing data using the GC-full-quantile (GC-FQ) normalization approach as described in Van den Berge et al. 2021 (https://doi.org/10.1101/2021.01.26.428252) using 50 bins."))
882
+
883
+        if ("nBins" %in% names(additionalParams)) {
884
+            nBins = additionalParams$nBins
885
+        } else {
886
+            # Use the recommended 50 bins as described in the paper as default
887
+            nBins = 50
888
+        }
889
+
890
+        peaks_GC_class = cut(GC_data.df$peak.GC.perc, breaks = seq(0,1,1/nBins), include.lowest = TRUE, ordered_result = TRUE)
891
+
892
+        dataNorm = .gcqn(data = data, 
893
+                         GC_class = peaks_GC_class,
894
+                         summary='mean', round = roundResults)
895
+        
896
+        
897
+    } else if (normalization == "DESeq2_sizeFactors") {
898
+        
899
+        futile.logger::flog.info(paste0(" Normalizing data using the package DESeq2 with a standard size factor normalization."))
900
+        
901
+        dd = DESeq2::estimateSizeFactors(dd)
902
+        dataNorm = DESeq2::counts(dd, normalized=TRUE)
903
+        
904
+    } else if (normalization == "none") {
905
+        dataNorm = data
906
+        futile.logger::flog.info(paste0(" Skip normalization."))
907
+        # Nothing to do, leave countsPeaks as they are
908
+    }
909
+    
910
+    dataNorm
911
+}
912
+
913
+
914
+#The following functions are taken from https://github.com/koenvandenberge/bulkATACGC/blob/master/methods/gcqn_validated.R
915
+
916
+### GCQN, first implementation
917
+FQnorm <- function(counts, type="mean"){
918
+    rk <- apply(counts,2,rank,ties.method='min')
919
+    counts.sort <- apply(counts,2,sort)
920
+    if(type=="mean"){
921
+        # refdist <- apply(counts.sort,1,mean)
922
+        refdist <- base::rowMeans(counts.sort)
923
+    } else if(type=="median"){
924
+        #refdist <- apply(counts.sort,1,median)
925
+        refdist <- matrixStats::rowMedians(counts.sort)
926
+    }
927
+    norm <- apply(rk,2,function(r){ refdist[r] })
928
+    rownames(norm) <- rownames(counts)
929
+    return(norm)
930
+}
931
+
932
+
933
+# GC-full-quantile (GC-FQ) normalization. GC-FQ is similar to FQ-FQ, but relies on the observation that, in
934
+# ATAC-seq, read count distributions are often more comparable between samples within a GC-content bin, than
935
+# between GC-content bins within a sample (Figure 2). It therefore applies between-sample FQ normalization for
936
+# each GC-content bin separately
937
+.gcqn <- function(data, GC_class, summary='mean', round=FALSE){
938
+    
939
+    gcBinNormCounts <- matrix(NA, nrow=nrow(data), ncol=ncol(data), dimnames=list(rownames(data),colnames(data)))
940
+    
941
+    for(ii in 1:nlevels(GC_class)){
942
+  
943
+        id <- which(GC_class==levels(GC_class)[ii])
944
+        if(length(id) == 0) next
945
+        if(length(id) == 1){
946
+            normCountBin <- data[id,]
947
+            if(round) normCountBin <- round(normCountBin)
948
+            gcBinNormCounts[id,] <- normCountBin
949
+            next
950
+        }
951
+        countBin <- data[id,,drop=FALSE]
952
+        if(summary=="mean"){
953
+            normCountBin <- FQnorm(countBin, type='mean')
954
+        } else if(summary=="median"){
955
+            normCountBin <- FQnorm(countBin, type='median')
956
+        }
957
+        if(round) normCountBin <- round(normCountBin)
958
+        normCountBin[normCountBin<0] <- 0
959
+        gcBinNormCounts[id,] <- normCountBin
960
+    }
961
+    return(gcBinNormCounts)
962
+}
963
+
964
+
965
+# peaksAnnotation = GRN@annotation$peaks
966
+# counts = getCounts(GRN, type = "peaks", asMatrix = TRUE, includeFiltered = TRUE)
967
+# Currently not applicable as we do not have different groups, qsmooth does not run with only one group
968
+# 
969
+# .gcqn_qsmooth_mod <- function(counts, peaksAnnotation){
970
+#     
971
+#     packageMessage = paste0("The package qsmooth is not installed, which is however needed for the chosen normalization method. Please install it and re-run this function or change the normalization method.")
972
+#     .checkPackageInstallation("qsmooth", packageMessage)
973
+#     
974
+#     groups_factors = factor(rep("A", length(GRN@config$sharedSamples)))
975
+#     
976
+#     gcBinNormCounts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts), dimnames=list(rownames(counts),colnames(counts)))
977
+#     for(ii in 1:nlevels(peaksAnnotation$peak.GC.class)){
978
+#         id <- which(peaksAnnotation$peak.GC.class==levels(peaksAnnotation$peak.GC.class)[ii])
979
+#         countBin <- counts[id,]
980
+#         qs <- qsmooth::qsmooth(countBin, group_factor=groups_factors)
981
+#         normCountBin <- qs@qsmoothData
982
+#         normCountBin <- round(normCountBin)
983
+#         normCountBin[normCountBin<0] <- 0
984
+#         gcBinNormCounts[id,] <- normCountBin
985
+#     }
986
+#     return(gcBinNormCounts)
987
+# }
988
+
989
+
990
+
991
+
992
+
993
+
994
+# 
995
+# # Needed
996
+# # Add DESeq2 normalization factors maybe? csaw stuff
997
+# .normalizeCounts <- function(rawCounts, method = "quantile", ) {
998
+#     
999
+#     checkmate::assertChoice(idColumn, colnames(rawCounts))
1000
+#     start = Sys.time()
1001
+#     
1002
+#     futile.logger::flog.info(paste0("Normalize counts. Method: ", method, ", ID column: ", idColumn))
1003
+#     
1004
+#     
1005
+#     if (method == "quantile") {
1006
+#         
1007
+#         if (length(rmCols) > 0) {
1008
+#             input = as.matrix(rawCounts[,-rmCols])
1009
+#         } else {
1010
+#             input = as.matrix(rawCounts)
1011
+#         }
1012
+#         
1013
+#         # We use limma for normalizing quantiles and NOT preprocessCore as before due to regression bugs for version >1.50
1014
+#         counts.norm = limma::normalizeQuantiles(input)
1015
+#         
1016
+#     } else if (method == "DESeq_sizeFactor") {
1017
+#         
1018
+#         if (length(rmCols) > 0) {
1019
+#             sampleData.df = data.frame( sampleID = colnames(rawCounts)[-rmCols], stringsAsFactors = FALSE)
1020
+#             countDataNew = as.data.frame(rawCounts[, -rmCols])
1021
+#         } else {
1022
+#             sampleData.df = data.frame( sampleID = colnames(rawCounts))
1023
+#             countDataNew = as.data.frame(rawCounts)
1024
+#         }
1025
+#         
1026
+#         rownames(countDataNew) = ids
1027
+#         
1028
+#         stopifnot(identical(sampleData.df$sampleID, colnames(countDataNew)))
1029
+#         
1030
+#         dd <- DESeq2::DESeqDataSetFromMatrix(countData = countDataNew,
1031
+#                                              colData = sampleData.df,
1032
+#                                              design = stats::as.formula(" ~ 1"))
1033
+#         
1034
+#         dd = DESeq2::estimateSizeFactors(dd)
1035
+#         counts.norm = DESeq2::counts(dd, normalized = TRUE)
1036
+#         
1037
+#         if (returnDESeqObj) {
1038
+#             return(dd)
1039
+#         }
1040
+#         
1041
+#         
1042
+#     } else if (method == "none") {
1043
+#         
1044
+#         if (length(rmCols) > 0) {
1045
+#             counts.norm = rawCounts[,-rmCols]
1046
+#         } else {
1047
+#             counts.norm = rawCounts
1048
+#         }
1049
+#         
1050
+#     } else  {
1051
+#         stop("Not implemented yet")
1052
+#     }
1053
+#     
1054
+#     .printExecutionTime(start)
1055
+#     
1056
+#     counts.norm = counts.norm %>% 
1057
+#         as.data.frame()  %>% 
1058
+#         tibble::as_tibble() %>% 
1059
+#         dplyr::mutate({{idColumn}} := ids) %>%
1060
+#         dplyr::select({{idColumn}}, tidyselect::everything()) 
1061
+#     
1062
+#     colnames(counts.norm) = c(idColumn, colnames_samples)
1063
+#     
1064
+#     counts.norm
1065
+#     
1066
+# }
1067
+
1068
+
1069
+
652 1070
 #' Filter RNA-seq and/or peak data from a \code{\linkS4class{GRN}} object
653 1071
 #' 
654
-#' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria. Filtered genes / peaks will then be 
655
-#' disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and  \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.}
1072
+#' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria and is based on the count data AFTER
1073
+#' potential normalization as chosen when using the \code{\link{addData}} function. Most of the filters may not be meaningful and useful anymore to apply
1074
+#' after using particular normalization schemes that can give rise to, for example, negative values such as cyclic loess normalization. If normalized counts do
1075
+#' not represents counts anymore but rather a deviation from a mean or something a like, the filtering critieria usually do not make sense anymore.
1076
+#' Filtered genes / peaks will then be disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and  \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.}
656 1077
 #' 
657 1078
 #' All this function does is setting (or modifying) the filtering flag in \code{GRN@data$peaks$counts_metadata} and \code{GRN@data$RNA$counts_metadata}, respectively.
658 1079
 #' 
659 1080
 #' @template GRN 
660 1081
 #' @param minNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default 5. Minimum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
1082
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
661 1083
 #' @param maxNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
1084
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
662 1085
 #' @param minNormalizedMeanRNA Numeric[0,] or \code{NULL}. Default 5. Minimum mean across all samples for a gene to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
1086
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
663 1087
 #' @param maxNormalizedMeanRNA Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum mean across all samples for a gene to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter.
1088
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
664 1089
 #' @param chrToKeep_peaks Character vector or \code{NULL}. Default \code{NULL}. Vector of chromosomes that peaks are allowed to come from. This filter can be used to filter sex chromosomes from the peaks, for example (e.g, \code{c(paste0("chr", 1:22), "chrX", "chrY")})
665
-#' @param minSize_peaks Integer[1,] or \code{NULL}. Default \code{NULL}. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.
1090
+#' @param minSize_peaks Integer[1,] or \code{NULL}. Default 20. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.
666 1091
 #' @param maxSize_peaks Integer[1,] or \code{NULL}. Default 10000. Maximum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.
667 1092
 #' @param minCV_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum CV (coefficient of variation, a unitless measure of variation) for a peak to be retained. Set to \code{NULL} for not applying the filter.
1093
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
668 1094
 #' @param maxCV_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum CV (coefficient of variation, a unitless measure of variation) for a peak to be retained. Set to \code{NULL} for not applying the filter.
1095
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
669 1096
 #' @param minCV_genes Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum CV (coefficient of variation, a unitless measure of variation) for a gene to be retained. Set to \code{NULL} for not applying the filter.
1097
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
670 1098
 #' @param maxCV_genes Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum CV (coefficient of variation, a unitless measure of variation) for a gene to be retained. Set to \code{NULL} for not applying the filter.
1099
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function.
671 1100
 #' @template forceRerun
672 1101
 #' @return An updated \code{\linkS4class{GRN}} object, with added data from this function.
673 1102
 #' @examples 
... ...
@@ -676,10 +1105,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
676 1105
 #' GRN = filterData(GRN, forceRerun = FALSE)
677 1106
 #' @export
678 1107
 filterData <- function (GRN, 
679
-                        minNormalizedMean_peaks = 5, maxNormalizedMean_peaks = NULL, 
680
-                        minNormalizedMeanRNA = 1,  maxNormalizedMeanRNA = NULL,
1108
+                        minNormalizedMean_peaks = NULL, maxNormalizedMean_peaks = NULL, 
1109
+                        minNormalizedMeanRNA = NULL,  maxNormalizedMeanRNA = NULL,
681 1110
                         chrToKeep_peaks = NULL,
682
-                        minSize_peaks = NULL, maxSize_peaks = 10000,
1111
+                        minSize_peaks = 20, maxSize_peaks = 10000,
683 1112
                         minCV_peaks = NULL, maxCV_peaks = NULL,
684 1113
                         minCV_genes = NULL, maxCV_genes = NULL,
685 1114
                         forceRerun = FALSE) {
... ...
@@ -761,13 +1190,13 @@ filterData <- function (GRN,
761 1190
         .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
762 1191
     }
763 1192
     
764
-    rowMeans = rowMeans(getCounts(GRN, type = "rna", asMatrix = TRUE, includeFiltered = TRUE))
765
-    GRN@data$RNA$counts_metadata$isFiltered = rowMeans < minNormalizedMeanRNA 
1193
+
1194
+    GRN@data$RNA$counts_metadata$isFiltered = ! GRN@data$RNA$counts_metadata$ID %in% genes.CV
766 1195
 
767 1196
     nRowsFlagged = length(which(GRN@data$RNA$counts_metadata$isFiltered))
768 1197
     
769 1198
     # Raw counts are left untouched and filtered where needed only
770
-    futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows because the row mean was smaller than ", minNormalizedMeanRNA))
1199
+    futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows due to filtering criteria"))
771 1200
     
772 1201
     GRN@config$isFiltered = TRUE
773 1202
   } 
... ...
@@ -793,7 +1222,7 @@ filterData <- function (GRN,
793 1222
   }
794 1223
   
795 1224
 
796
-  futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those smaller than : ", maxSize_peaks))
1225
+  futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those smaller than : ", maxSize_peaks, " and bigger than ", minSize_peaks))
797 1226
   futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$counts_metadata)))
798 1227
   
799 1228
   countsPeaks.clean = GRN@data$peaks$counts_metadata %>%
... ...
@@ -1366,7 +1795,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1366 1795
       tibble::as_tibble()
1367 1796
     
1368 1797
     # TODO replace by getter
1369
-    countsPeaks = .normalizeNewDataWrapper(GRN@data$peaks$counts_orig, normalization = normalization)
1798
+    countsPeaks = .normalizeCountMatrix(GRN@data$peaks$counts_orig, normalization = normalization)
1370 1799
     
1371 1800
     stopifnot(identical(nrow(countsPeaks), nrow(GRN@data$TFs$TF_peak_overlap)))
1372 1801
     
... ...
@@ -1403,9 +1832,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1403 1832
     
1404 1833
     # Store as data frame with both TF names and Ensembl IDs, in analogy to the other types of TF data that can be imported
1405 1834
     GRN@data$TFs[[name]] = TF.activity.m %>%
1406
-      as.data.frame() %>% 
1407
-      tibble::rownames_to_column("TF.name") %>%
1408
-      tibble::as_tibble() %>%
1835
+      tibble::as_tibble(rownames = "TF.name") %>%
1409 1836
       dplyr::left_join(GRN@annotation$TFs, by = "TF.name") %>%
1410 1837
       dplyr::select(.data$ENSEMBL, .data$TF.name, tidyselect::all_of(GRN@config$sharedSamples))
1411 1838
     
... ...
@@ -1425,77 +1852,6 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
1425 1852
   
1426 1853
 }
1427 1854
 
1428
-.normalizeNewDataWrapper <- function(data, normalization, idColumn = "peakID") {
1429
-  
1430
-  if (checkmate::testClass(data, "DESeqDataSet")) {
1431
-    
1432
-    counts_raw = DESeq2::counts(data, normalized = FALSE)
1433
-    
1434
-  } else {
1435
-    
1436
-    # Capture incompatible cases
1437
-    if (normalization == "cyclicLoess" | normalization == "sizeFactors") {
1438
-      message = paste0("Selected normalization method for TF activity (", normalization, ") cannot be performed because the provided counts are not integer only. Select either \"quantile\" or \"none\" as normalization method for TF activity.")
1439
-      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
1440
-    }
1441
-    
1442
-    counts_raw = data
1443
-  }
1444
-  
1445
-  
1446
-  if (normalization == "cyclicLoess") {
1447
-    
1448
-    futile.logger::flog.info(paste0(" Normalizing data using cyclic LOESS"))
1449
-      
1450
-      
1451
-    packageMessage = paste0("The package csaw is not installed but required for the cyclic LOESS normalization. Please install it and re-run this function or change the normalization type (if possible).")
1452
-    .checkPackageInstallation("csaw", packageMessage)   
1453
-      
1454
-    # Perform a cyclic loess normalization
1455
-    # We use a slighlty more complicated setup to derive size factors for library normalization
1456
-    # Instead of just determining the size factors in DeSeq2 via cirtual samples, we use 
1457
-    # a normalization from the csaw package (see https://www.rdocumentation.org/packages/csaw/versions/1.6.1/topics/normOffsets)
1458
-    # and apply a non-linear normalization. 
1459
-    # For each sample, a lowess curve is fitted to the log-counts against the log-average count. 
1460
-    # The fitted value for each bin pair is used as the generalized linear model offset for that sample. 
1461
-    # The use of the average count provides more stability than the average log-count when low counts are present for differentially bound regions.
1462
-    
1463
-    # since counts returns,by default, non-normalized counts, the following code should be fine and there is no need to also run estimateSizeFactors beforehand
1464
-    
1465
-    if (packageVersion("csaw") <= "1.14.1") {
1466
-      normFacs = exp((csaw::normOffsets(data, lib.sizes = colSums(data), type = "loess")))
1467
-    } else {
1468
-      object = SummarizedExperiment::SummarizedExperiment(list(counts=counts_raw))
1469
-      object$totals = colSums(counts_raw)
1470
-      normFacs  = exp(csaw::normOffsets(object, se.out = FALSE))
1471
-    }
1472
-    
1473
-    rownames(normFacs) = rownames(data)
1474
-    colnames(normFacs) = colnames(data)
1475
-    
1476
-    # We now provide gene-specific normalization factors for each sample as a matrix, which will preempt sizeFactors
1477
-    DESeq2::normalizationFactors(data) <- normFacs
1478
-    dataNorm = DESeq2::counts(data, normalized=TRUE)
1479
-    
1480
-  } else if (normalization == "sizeFactors") {
1481
-    
1482
-    futile.logger::flog.info(paste0(" Normalizing data using DESeq size factors"))
1483
-    data = DESeq2::estimateSizeFactors(data)
1484
-    dataNorm = DESeq2::counts(data, normalized=TRUE)
1485
-    
1486
-  } else if (normalization == "quantile") {
1487
-    
1488
-    futile.logger::flog.info(paste0(" Normalizing data using quantile normalization"))
1489
-    dataNorm = .normalizeCounts(data, method = "quantile",  idColumn = idColumn)
1490
-    
1491
-  } else if (normalization == "none") {
1492
-    dataNorm = data
1493
-    futile.logger::flog.info(paste0(" Skip normalization."))
1494
-    # Nothing to do, leave countsPeaks as they are
1495
-  }
1496
-  
1497
-  dataNorm
1498
-}
1499 1855
 
1500 1856
 #' Import externally derived TF Activity data. EXPERIMENTAL.
1501 1857
 #' 
... ...
@@ -1565,7 +1921,8 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF
1565 1921
     # data = dplyr::select(data, -tidyselect::one_of(nameColumn))
1566 1922
     
1567 1923
     # Only TF.names are unique
1568
-    countsNorm = .normalizeNewDataWrapper(data %>% dplyr::select(-.data$ENSEMBL), normalization = normalization, idColumn = "TF.name")
1924
+    # TODO fix
1925
+    countsNorm = .normalizeCountMatrix(data %>% dplyr::select(-.data$ENSEMBL), normalization = normalization)
1569 1926
     
1570 1927
     # Check overlap of ENSEMBL IDs
1571 1928
     countsNorm$ENSEMBL = data$ENSEMBL
Browse code

small change

Christian Arnold authored on 14/10/2022 15:47:58
Showing1 changed files
... ...
@@ -180,8 +180,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
180 180
 
181 181
   checkmate::assertDataFrame(counts_peaks, min.rows = 1, min.cols = 2)
182 182
   checkmate::assertDataFrame(counts_rna, min.rows = 1, min.cols = 2)
183
-  checkmate::assertCharacter(idColumn_peaks, min.chars = 1, len = 1)
184
-  checkmate::assertCharacter(idColumn_RNA, min.chars = 1, len = 1)
183
+  checkmate::assertChoice(idColumn_peaks, colnames(counts_peaks))
184
+  checkmate::assertChoice(idColumn_RNA, colnames(counts_rna))
185 185
   checkmate::assertChoice(normalization_peaks, c("none", "DESeq_sizeFactor", "quantile"))
186 186
   checkmate::assertChoice(normalization_rna, c("none", "DESeq_sizeFactor", "quantile"))  
187 187
   checkmate::assertFlag(allowOverlappingPeaks)
Browse code

Bugfix for .makeObjectCompatible

Christian Arnold authored on 14/10/2022 12:45:16
Showing1 changed files
... ...
@@ -315,14 +315,16 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
315 315
     GRN@data$peaks$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["peaks"]], ID_column = "peakID", slotName = "GRN@data$peaks$counts")
316 316
 
317 317
     # includeFiltered = TRUE here as it doesnt make a difference and because getCounts requires counts_metadata$isFiltered to be set already
318
-    GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = FALSE, includeFiltered = TRUE)) 
318
+    GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(rownames(GRN@data$peaks$counts)) 
319 319
     stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata))
320 320
     
321 321
     GRN@data$RNA$counts   = .storeAsMatrixOrSparseMatrix(GRN, df =  data.l[["RNA"]], ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts")
322 322
       
323 323
     GRN@data$RNA$counts_metadata = tibble::tibble(ID = data.l[["RNA"]]$ENSEMBL, isFiltered = FALSE)
324 324
     
325
-    GRN@data$RNA$counts_permuted_index = .shuffleColumns(countsRNA.norm.df %>% dplyr::select(-.data$ENSEMBL), .getMaxPermutation(GRN), returnUnshuffled = FALSE, returnAsList = FALSE)
325
+    GRN@data$RNA$counts_permuted_index = sample.int(ncol(GRN@data$RNA$counts), ncol(GRN@data$RNA$counts))
326
+    
327
+    
326 328
     
327 329
     futile.logger::flog.info(paste0( "Final dimensions of data:"))
328 330
     futile.logger::flog.info(paste0( " RNA  : ", nrow(countsRNA.norm.df)  , " x ", ncol(countsRNA.norm.df)   - 2, " (rows x columns)"))
... ...
@@ -402,11 +404,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
402 404
     
403 405
 }
404 406
 
405
-.createConsensusPeaksDF <- function(countsPeaks, idColumn = "peakID") {
407
+.createConsensusPeaksDF <- function(peakIDs) {
406 408
   
407
-  checkmate::assertChoice(idColumn, colnames(countsPeaks))
408
-  
409
-  ids.split = strsplit(countsPeaks %>% dplyr::pull(!!(idColumn)), split = "[:-]+")
409
+
410
+  ids.split = strsplit(peakIDs, split = "[:-]+")
410 411
   ids.split.length = sapply(ids.split, length)
411 412
   if (!all(ids.split.length == 3)) {
412 413
     message = paste0(" At least one of the IDs in the peaks data has an unsupported format. Make sure all peakIDs are in the format \"chr:start-end\"")
... ...
@@ -4182,8 +4183,8 @@ generateStatsSummary <- function(GRN,
4182 4183
 }
4183 4184
 
4184 4185
 
4186
+####### Retrieve GRN objects or parts of it #########
4185 4187
 
4186
-####### Misc functions #########
4187 4188
 
4188 4189
 #' Load example GRN dataset
4189 4190
 #' 
... ...
@@ -4199,58 +4200,58 @@ generateStatsSummary <- function(GRN,
4199 4200
 #' GRN = loadExampleObject()
4200 4201
 #' @return An small example \code{\linkS4class{GRN}} object
4201 4202
 loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://www.embl.de/download/zaugg/GRaNIE/GRN.rds") {
4202
-  
4203
-  checkmate::assertFlag(forceDownload)
4204
-  options(timeout=200)
4205 4203
     
4206
-  if (!is.installed("BiocFileCache")) {
4207
-      
4208
-      message = paste0("The package BiocFileCache is not installed, but recommended if you want to speed-up the retrieval of the GRN example object ",
4209
-      "via this function when using it multiple times. If not installed, the example object has to be downloaded anew every time you use this function.")
4210
-      .checkPackageInstallation("BiocFileCache", message, isWarning = TRUE)
4211
-      
4212
-      GRN = readRDS(url(fileURL))
4213
-
4214
-  } else {
4215
-      
4216
-      
4217
-      # Taken and modified from https://www.bioconductor.org/packages/release/bioc/vignettes/BiocFileCache/inst/doc/BiocFileCache.html
4218
-      
4219
-      bfc <- .get_cache()
4220
-      
4221
-      rid <- BiocFileCache::bfcquery(bfc, "GRaNIE_object_example")$rid
4222
-      if (!length(rid)) {
4223
-          rid <- names(BiocFileCache::bfcadd(bfc, "GRaNIE_object_example", fileURL))
4224
-      }
4225
-      if (!isFALSE(BiocFileCache::bfcneedsupdate(bfc, rid)) | forceDownload) {
4226
-          messageStr = paste0("Downloading GRaNIE example object from ", fileURL)
4227
-          message(messageStr)
4228
-          filePath = BiocFileCache::bfcdownload(bfc, rid, ask = FALSE)
4229
-      }
4230
-      
4231
-      
4232
-      filePath = BiocFileCache::bfcrpath(bfc, rids = rid)
4233
-      
4234
-      # Now we can read in the locally stored file
4235
-      GRN = readRDS(filePath)
4236
-  }
4237
-
4238
-  
4239
-  # Change the default path to the current working directory
4240
-  GRN@config$directories$outputRoot = "."
4241
-  GRN@config$directories$output_plots = "."
4242
-  GRN@config$directories$motifFolder = "."
4243
-  GRN@config$files$output_log = "GRN.log"
4244
-  
4245
-  GRN = .makeObjectCompatible(GRN)
4246
-  
4247
-  # TODO remove once done already in the object
4248
-  GRN = add_TF_gene_correlation(GRN)
4249
-  
4250
-  # GRN = add_featureVariation(GRN, formula = "auto", metadata = "mt_frac")
4251
-  
4252
-  GRN
4253
-  
4204
+    checkmate::assertFlag(forceDownload)
4205
+    options(timeout=200)
4206
+    
4207
+    if (!is.installed("BiocFileCache")) {
4208
+        
4209
+        message = paste0("The package BiocFileCache is not installed, but recommended if you want to speed-up the retrieval of the GRN example object ",
4210
+                         "via this function when using it multiple times. If not installed, the example object has to be downloaded anew every time you use this function.")
4211
+        .checkPackageInstallation("BiocFileCache", message, isWarning = TRUE)
4212
+        
4213
+        GRN = readRDS(url(fileURL))
4214
+        
4215
+    } else {
4216
+        
4217
+        
4218
+        # Taken and modified from https://www.bioconductor.org/packages/release/bioc/vignettes/BiocFileCache/inst/doc/BiocFileCache.html
4219
+        
4220
+        bfc <- .get_cache()
4221
+        
4222
+        rid <- BiocFileCache::bfcquery(bfc, "GRaNIE_object_example")$rid
4223
+        if (!length(rid)) {
4224
+            rid <- names(BiocFileCache::bfcadd(bfc, "GRaNIE_object_example", fileURL))
4225
+        }
4226
+        if (!isFALSE(BiocFileCache::bfcneedsupdate(bfc, rid)) | forceDownload) {
4227
+            messageStr = paste0("Downloading GRaNIE example object from ", fileURL)
4228
+            message(messageStr)
4229
+            filePath = BiocFileCache::bfcdownload(bfc, rid, ask = FALSE)
4230
+        }
4231
+        
4232
+        
4233
+        filePath = BiocFileCache::bfcrpath(bfc, rids = rid)
4234
+        
4235
+        # Now we can read in the locally stored file
4236
+        GRN = readRDS(filePath)
4237
+    }
4238
+    
4239
+    
4240
+    # Change the default path to the current working directory
4241
+    GRN@config$directories$outputRoot = "."
4242
+    GRN@config$directories$output_plots = "."
4243
+    GRN@config$directories$motifFolder = "."
4244
+    GRN@config$files$output_log = "GRN.log"
4245
+    
4246
+    GRN = .makeObjectCompatible(GRN)
4247
+    
4248
+    # TODO remove once done already in the object
4249
+    GRN = add_TF_gene_correlation(GRN)
4250
+    
4251
+    # GRN = add_featureVariation(GRN, formula = "auto", metadata = "mt_frac")
4252
+    
4253
+    GRN
4254
+    
4254 4255
 }
4255 4256
 
4256 4257
 
... ...
@@ -4273,90 +4274,90 @@ loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://www.embl
4273 4274
 #' counts.df = getCounts(GRN, type = "peaks", permuted = FALSE)
4274 4275
 #' @return Data frame of counts, with the type as indicated by the function parameters. This function does **NOT** return a \code{\linkS4class{GRN}} object.
4275 4276
 getCounts <- function(GRN, type,  permuted = FALSE, asMatrix = FALSE, includeIDColumn = TRUE, includeFiltered = FALSE) {
4276
-  
4277
-  checkmate::assertClass(GRN, "GRN")
4278
-  GRN = .addFunctionLogToObject(GRN)     
4279
-  
4280
-  GRN = .makeObjectCompatible(GRN)
4281
-  
4282
-  checkmate::assertChoice(type, c("peaks", "rna"))
4283
-  checkmate::assertFlag(permuted)
4284
-  
4285
-  permIndex = dplyr::if_else(permuted, "1", "0")
4286
-  
4287
-  if (type == "peaks") {
4288 4277
     
4289
-    if (permuted) {
4290
-      message = "Could not find permuted peak counts in GRN object. Peaks are not stored as permuted, set permuted = FALSE for type = \"peaks\""
4291
-      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4292
-    }
4278
+    checkmate::assertClass(GRN, "GRN")
4279
+    GRN = .addFunctionLogToObject(GRN)     
4293 4280
     
4294
-    classPeaks = class(GRN@data$peaks$counts)
4295
-    if ("matrix" %in% classPeaks) {
4296
-      result = GRN@data$peaks$counts
4297
-    } else if ("dgCMatrix" %in% classPeaks) {
4298
-      result = .asMatrixFromSparse(GRN@data$peaks$counts, convertZero_to_NA = FALSE)
4299
-    } else {
4300
-      message = paste0("Unsupported class for GRN@data$peaks$counts. Contact the authors.")
4301
-      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4302
-    }
4281
+    GRN = .makeObjectCompatible(GRN)
4303 4282
     
4283
+    checkmate::assertChoice(type, c("peaks", "rna"))
4284
+    checkmate::assertFlag(permuted)
4304 4285
     
4305
-  } else if (type == "rna") {
4286
+    permIndex = dplyr::if_else(permuted, "1", "0")
4306 4287
     
4307
-    classRNA = class(GRN@data$RNA$counts)
4308
-    if ("matrix" %in% classRNA) {
4309
-      result = GRN@data$RNA$counts
4310
-    } else if ("dgCMatrix" %in% classRNA) {
4311
-      result = .asMatrixFromSparse(GRN@data$RNA$counts, convertZero_to_NA = FALSE)
4312
-    } else {
4313
-      message = paste0("Unsupported class for GRN@data$RNA$counts. Contact the authors.")
4314
-      .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4288
+    if (type == "peaks") {
4289
+        
4290
+        if (permuted) {
4291
+            message = "Could not find permuted peak counts in GRN object. Peaks are not stored as permuted, set permuted = FALSE for type = \"peaks\""
4292
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4293
+        }
4294
+        
4295
+        classPeaks = class(GRN@data$peaks$counts)
4296
+        if ("matrix" %in% classPeaks) {
4297
+            result = GRN@data$peaks$counts
4298
+        } else if ("dgCMatrix" %in% classPeaks) {
4299
+            result = .asMatrixFromSparse(GRN@data$peaks$counts, convertZero_to_NA = FALSE)
4300
+        } else {
4301
+            message = paste0("Unsupported class for GRN@data$peaks$counts. Contact the authors.")
4302
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4303
+        }
4304
+        
4305
+        
4306
+    } else if (type == "rna") {
4307
+        
4308
+        classRNA = class(GRN@data$RNA$counts)
4309
+        if ("matrix" %in% classRNA) {
4310
+            result = GRN@data$RNA$counts
4311
+        } else if ("dgCMatrix" %in% classRNA) {
4312
+            result = .asMatrixFromSparse(GRN@data$RNA$counts, convertZero_to_NA = FALSE)
4313
+        } else {
4314
+            message = paste0("Unsupported class for GRN@data$RNA$counts. Contact the authors.")
4315
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
4316
+        }
4317
+        
4315 4318
     }
4316 4319
     
4317
-  }
4318
-  
4319
-  
4320
-  if (permuted) {
4321 4320
     
4322
-    if (type == "rna") {
4323
-      
4324
-      # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses
4325
-      colnames(result) = colnames(result)[GRN@data$RNA$counts_permuted_index]
4326
-    }
4327
-  } 
4328
-  
4329
-  if (!includeFiltered) {
4321
+    if (permuted) {
4322
+        
4323
+        if (type == "rna") {
4324
+            
4325
+            # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses
4326
+            colnames(result) = colnames(result)[GRN@data$RNA$counts_permuted_index]
4327
+        }
4328
+    } 
4330 4329
     
4331
-    if (type == "rna") {
4332
-      nonFiltered = which(! GRN@data$RNA$counts_metadata$isFiltered)
4333
-    } else {
4334
-      nonFiltered = which(! GRN@data$peaks$counts_metadata$isFiltered)
4330
+    if (!includeFiltered) {
4331
+        
4332
+        if (type == "rna") {
4333
+            nonFiltered = which(! GRN@data$RNA$counts_metadata$isFiltered)
4334
+        } else {
4335
+            nonFiltered = which(! GRN@data$peaks$counts_metadata$isFiltered)
4336
+        }
4337
+        
4338
+        result = result[nonFiltered,]
4335 4339
     }
4336 4340
     
4337
-    result = result[nonFiltered,]
4338
-  }
4339
-
4340
-  
4341
-  
4342
-  if (!asMatrix) {
4343
- 
4344
-    result.df =  result %>%
4345
-      as.data.frame()
4346 4341
     
4347
-    if (includeIDColumn)  {
4348
-      
4349
-      ID_column = dplyr::if_else(type == "rna", "ENSEMBL", "peakID")
4350
-      
4351
-      result.df = result.df %>%
4352
-        tibble::rownames_to_column(ID_column) 
4353
-    } 
4354 4342
     
4355
-    return(result.df)
4356
-
4357
-  }
4358
-  
4359
-  result
4343
+    if (!asMatrix) {
4344
+        
4345
+        result.df =  result %>%
4346
+            as.data.frame()
4347
+        
4348
+        if (includeIDColumn)  {
4349
+            
4350
+            ID_column = dplyr::if_else(type == "rna", "ENSEMBL", "peakID")
4351
+            
4352
+            result.df = result.df %>%
4353
+                tibble::rownames_to_column(ID_column) 
4354
+        } 
4355
+        
4356
+        return(result.df)
4357
+        
4358
+    }
4359
+    
4360
+    result
4360 4361
 }
4361 4362
 
4362 4363
 
... ...
@@ -4391,137 +4392,601 @@ getGRNConnections <- function(GRN, type = "all.filtered",  permuted = FALSE,
4391 4392
                               include_peakMetadata = FALSE,
4392 4393
                               include_geneMetadata = FALSE,
4393 4394
                               include_variancePartitionResults = FALSE) {
4394
-  
4395
-  checkmate::assertClass(GRN, "GRN")  
4396
-  GRN = .addFunctionLogToObject(GRN)
4397
-  
4398
-  GRN = .makeObjectCompatible(GRN)
4399
-   
4400
-  checkmate::assertChoice(type, c("TF_peaks", "peak_genes", "TF_genes", "all.filtered"))
4401
-  checkmate::assertFlag(permuted)
4402
-  #checkmate::assertIntegerish(permutation, lower = 0, upper = .getMaxPermutation(GRN))
4403
-  checkmate::assertFlag(include_TF_gene_correlations)
4404