... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
Package: GRaNIE |
2 | 2 |
Title: GRaNIE: Reconstruction cell type specific gene regulatory networks including enhancers using chromatin accessibility and RNA-seq data |
3 |
-Version: 1.1.21 |
|
3 |
+Version: 1.1.22 |
|
4 | 4 |
Encoding: UTF-8 |
5 | 5 |
Authors@R: c(person("Christian", "Arnold", email = |
6 | 6 |
"chrarnold@web.de", role = c("cre","aut")), |
... | ... |
@@ -36,7 +36,6 @@ initializeGRN <- function(objectMetadata = list(), |
36 | 36 |
checkmate::assert(checkmate::checkNull(objectMetadata), checkmate::checkList(objectMetadata)) |
37 | 37 |
checkmate::assertChoice(genomeAssembly, c("hg19","hg38", "mm9", "mm10")) |
38 | 38 |
|
39 |
- .checkAndLoadPackagesGenomeAssembly(genomeAssembly) |
|
40 | 39 |
|
41 | 40 |
# Create the folder first if not yet existing |
42 | 41 |
checkmate::assertCharacter(outputFolder, min.chars = 1, len = 1) |
... | ... |
@@ -123,7 +122,7 @@ initializeGRN <- function(objectMetadata = list(), |
123 | 122 |
GRN |
124 | 123 |
} |
125 | 124 |
|
126 |
-######## Add and filter data ######## |
|
125 |
+######## Add data and annotation ######## |
|
127 | 126 |
|
128 | 127 |
#' Add data to a \code{\linkS4class{GRN}} object. |
129 | 128 |
#' |
... | ... |
@@ -132,14 +131,18 @@ initializeGRN <- function(objectMetadata = list(), |
132 | 131 |
#' |
133 | 132 |
#' If the \code{ChIPseeker} package is installed, additional peak annotation is provided in the annotation slot and a peak annotation QC plot is produced as part of peak-gene QC. |
134 | 133 |
#' This is fully optional, however, and has no consequences for downstream functions. |
134 |
+#' Normalizing the data sensibly is very important. When \code{quantile}is chose, \code{limma::normalizeQuantiles} is used, which in essence does the following: |
|
135 |
+#' Each quantile of each column is set to the mean of that quantile across arrays. The intention is to make all the normalized columns have the same empirical distribution. |
|
136 |
+#' This will be exactly true if there are no missing values and no ties within the columns: the normalized columns are then simply permutations of one another. |
|
135 | 137 |
#' |
136 | 138 |
#' @export |
137 | 139 |
#' @template GRN |
138 | 140 |
#' @param counts_peaks Data frame. No default. Counts for the peaks, with raw or normalized counts for each peak (rows) across all samples (columns). |
139 | 141 |
#' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. |
140 | 142 |
#' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table. |
141 |
-#' @param normalization_peaks Character. Default \code{DESeq_sizeFactor}. |
|
142 |
-#' Normalization procedure for peak data. Must be one of \code{DESeq_sizeFactor}, \code{none}, or \code{quantile}. |
|
143 |
+#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactor}. Normalization procedure for peak data. |
|
144 |
+#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, |
|
145 |
+#' \code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}. |
|
143 | 146 |
#' @param idColumn_peaks Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. |
144 | 147 |
#' The required format must be {chr}:{start}-{end}", with {chr} denoting the abbreviated chromosome name, and {start} and {end} the begin and end |
145 | 148 |
#' of the peak coordinates, respectively. End must be bigger than start. Examples for valid peak IDs are \code{chr1:400-800} or \code{chrX:20-25}. |
... | ... |
@@ -147,12 +150,19 @@ initializeGRN <- function(objectMetadata = list(), |
147 | 150 |
#' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_rna} below. |
148 | 151 |
#' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table. |
149 | 152 |
#' @param normalization_rna Character. Default \code{quantile}. Normalization procedure for peak data. |
150 |
-#' Must be one of "DESeq_sizeFactor", "none", or "quantile". For "quantile", \code{limma::normalizeQuantiles} is used for normalization. |
|
153 |
+#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}. |
|
151 | 154 |
#' @param idColumn_RNA Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs. |
152 | 155 |
#' @param sampleMetadata Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. |
153 | 156 |
#' If provided, the @seealso [plotPCA_all()] function can then incorporate and plot it. Sample names must match with those from both peak and RNA-Seq data. The first column is expected to contain the sample IDs, the actual column name is irrelevant. |
157 |
+#' @param additionalParams.l Named list. Default \code{list()}. Additional parameters for the chosen normalization method. |
|
158 |
+#' Currently, only the GC-aware normalization methods \code{EDASeq_GC_peaks} and \code{gcqn_peaks} are supported here. |
|
159 |
+#' Both support the parameters \code{roundResults} (logical flag, \code{TRUE} or \code{FALSE}) and \code{nBins} (Integer > 0), and \code{EDASeq_GC_peaks} supports three additional parameters: |
|
160 |
+#' \code{withinLane_method} (one of: "loess","median","upper","full") and \code{betweenLane_method} (one of: "median","upper","full"). |
|
161 |
+#' For more information, see the EDASeq vignette. |
|
154 | 162 |
#' @param allowOverlappingPeaks \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should overlapping peaks be allowed (then only a warning is issued |
155 | 163 |
#' when overlapping peaks are found) or (the default) should an error be raised? |
164 |
+#' @param keepOriginalReadCounts \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the original read counts as provided to the function be kept in addition to |
|
165 |
+#' storing the rad counts after a (if any) normalization? This increases the memory footprint of the object because 2 additional count matrices have to be stored. |
|
156 | 166 |
#' @template forceRerun |
157 | 167 |
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function (e.g., slots \code{GRN@data$peaks} and \code{GRN@data$RNA}) |
158 | 168 |
#' @seealso \code{\link{plotPCA_all}} |
... | ... |
@@ -166,11 +176,11 @@ initializeGRN <- function(objectMetadata = list(), |
166 | 176 |
#' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise |
167 | 177 |
#' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE) |
168 | 178 |
|
169 |
-# TODO: add a isSingleCell argument or something similar |
|
170 |
- |
|
171 |
-addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", idColumn_peaks = "peakID", |
|
179 |
+addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor", idColumn_peaks = "peakID", |
|
172 | 180 |
counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL, |
181 |
+ additionalParams.l = list(), |
|
173 | 182 |
allowOverlappingPeaks= FALSE, |
183 |
+ keepOriginalReadCounts = FALSE, |
|
174 | 184 |
forceRerun = FALSE) { |
175 | 185 |
|
176 | 186 |
start = Sys.time() |
... | ... |
@@ -182,8 +192,20 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
182 | 192 |
checkmate::assertDataFrame(counts_rna, min.rows = 1, min.cols = 2) |
183 | 193 |
checkmate::assertChoice(idColumn_peaks, colnames(counts_peaks)) |
184 | 194 |
checkmate::assertChoice(idColumn_RNA, colnames(counts_rna)) |
185 |
- checkmate::assertChoice(normalization_peaks, c("none", "DESeq_sizeFactor", "quantile")) |
|
186 |
- checkmate::assertChoice(normalization_rna, c("none", "DESeq_sizeFactor", "quantile")) |
|
195 |
+ |
|
196 |
+ checkmate::assertChoice(normalization_peaks, |
|
197 |
+ choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", |
|
198 |
+ "csaw_cyclicLoess_orig", "csaw_TMM", |
|
199 |
+ "EDASeq_GC_peaks", "gcqn_peaks", |
|
200 |
+ "DESeq2_sizeFactors", |
|
201 |
+ "none")) |
|
202 |
+ checkmate::assertChoice(normalization_rna, |
|
203 |
+ choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", |
|
204 |
+ "csaw_cyclicLoess_orig", "csaw_TMM", |
|
205 |
+ "DESeq2_sizeFactors", |
|
206 |
+ "none")) |
|
207 |
+ |
|
208 |
+ checkmate::assertFlag(keepOriginalReadCounts) |
|
187 | 209 |
checkmate::assertFlag(allowOverlappingPeaks) |
188 | 210 |
checkmate::assertFlag(forceRerun) |
189 | 211 |
|
... | ... |
@@ -248,29 +270,49 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
248 | 270 |
# dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5 |
249 | 271 |
} |
250 | 272 |
|
273 |
+ ## STORE COUNTS METADATA ## |
|
251 | 274 |
|
252 |
- # Normalize counts |
|
253 |
- countsPeaks.norm.df = .normalizeCounts(counts_peaks, method = normalization_peaks, idColumn = "peakID") |
|
254 |
- countsRNA.norm.df = .normalizeCounts(counts_rna, method = normalization_rna, idColumn = "ENSEMBL") |
|
275 |
+ GRN@data$RNA$counts_metadata = tibble::tibble(ID = counts_rna$ENSEMBL, isFiltered = FALSE) |
|
255 | 276 |
|
256 |
- GRN@config$parameters$normalization_rna = normalization_rna |
|
257 |
- GRN@config$parameters$normalization_peaks = normalization_peaks |
|
277 |
+ GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(counts_peaks$peakID) |
|
278 |
+ stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata)) |
|
258 | 279 |
|
259 |
- # We have our first connection type, the default one; more may be added later |
|
260 |
- GRN@config$TF_peak_connectionTypes = "expression" |
|
280 |
+ # Calculate GC content of peaks which we need before any normalization |
|
261 | 281 |
|
262 |
- # Make sure ENSEMBL is the first column |
|
263 |
- countsRNA.norm.df = dplyr::select(countsRNA.norm.df, .data$ENSEMBL, tidyselect::everything()) |
|
264 |
- countsPeaks.norm.df = dplyr::select(countsPeaks.norm.df, .data$peakID, tidyselect::everything()) |
|
282 |
+ if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) { |
|
283 |
+ |
|
284 |
+ # Now we need the genome annotation packages to calculate the GC content of the peak regions |
|
285 |
+ .checkAndLoadPackagesGenomeAssembly(GRN@config$parameters$genomeAssembly) |
|
286 |
+ |
|
287 |
+ GC.data.df = .calcGCContentPeaks(GRN) |
|
288 |
+ additionalParams.l[["GC_data"]] = GC.data.df |
|
289 |
+ } |
|
290 |
+ |
|
265 | 291 |
|
266 |
- samples_rna = colnames(countsRNA.norm.df) |
|
267 |
- samples_peaks = colnames(countsPeaks.norm.df) |
|
268 |
- allSamples = unique(c(samples_rna, samples_peaks)) %>% setdiff(c("ENSEMBL", "isFiltered", "peakID")) |
|
292 |
+ ## Normalize counts ## |
|
293 |
+ countsPeaks.norm.df = .normalizeCountMatrix(GRN, counts_peaks %>% tibble::column_to_rownames("peakID") %>% as.matrix(), |
|
294 |
+ normalization = normalization_peaks, |
|
295 |
+ additionalParams = additionalParams.l |
|
296 |
+ ) %>% |
|
297 |
+ as_tibble(rownames = "peakID") %>% |
|
298 |
+ dplyr::select(.data$peakID, tidyselect::everything()) |
|
269 | 299 |
|
270 |
- # Subset data to retain only samples that appear in both RNA and peaks |
|
271 |
- data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df) |
|
300 |
+ countsRNA.norm.df = .normalizeCountMatrix(GRN, counts_rna %>% tibble::column_to_rownames("ENSEMBL") %>% as.matrix(), |
|
301 |
+ normalization = normalization_rna, |
|
302 |
+ additionalParams = additionalParams.l) %>% |
|
303 |
+ as_tibble(rownames = "ENSEMBL") %>% |
|
304 |
+ dplyr::select(.data$ENSEMBL, tidyselect::everything()) |
|
272 | 305 |
|
273 |
- # Generate metadata first to determine the nmberof shared samples etc |
|
306 |
+ |
|
307 |
+ ## SAMPLE AND GRN METADATA ## |
|
308 |
+ GRN@config$parameters$normalization_rna = normalization_rna |
|
309 |
+ GRN@config$parameters$normalization_peaks = normalization_peaks |
|
310 |
+ |
|
311 |
+ samples_rna = colnames(countsRNA.norm.df) |
|
312 |
+ samples_peaks = colnames(countsPeaks.norm.df) |
|
313 |
+ allSamples = unique(c(samples_rna, samples_peaks)) %>% setdiff(c("ENSEMBL", "isFiltered", "peakID")) |
|
314 |
+ |
|
315 |
+ # Generate metadata first to determine the number of shared samples etc |
|
274 | 316 |
if (!is.null(sampleMetadata)) { |
275 | 317 |
|
276 | 318 |
futile.logger::flog.info("Parsing provided metadata...") |
... | ... |
@@ -308,23 +350,30 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
308 | 350 |
) |
309 | 351 |
|
310 | 352 |
GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, .data$has_both) %>% dplyr::pull(.data$sampleID) %>% as.character() |
353 |
+ # We have our first connection type, the default one; more may be added later |
|
354 |
+ GRN@config$TF_peak_connectionTypes = "expression" |
|
311 | 355 |
|
312 |
- ### COUNT MATRICES |
|
313 |
- # Store the matrices either as normal or sparse matrix |
|
356 |
+ ## STORE FINAL COUNT MATRICES ## |
|
357 |
+ |
|
358 |
+ # Subset data to retain only samples that appear in both RNA and peaks |
|
359 |
+ data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df) |
|
314 | 360 |
|
361 |
+ # Store the matrices either as normal or sparse matrix |
|
315 | 362 |
GRN@data$peaks$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["peaks"]], ID_column = "peakID", slotName = "GRN@data$peaks$counts") |
316 |
- |
|
317 |
- # includeFiltered = TRUE here as it doesnt make a difference and because getCounts requires counts_metadata$isFiltered to be set already |
|
318 |
- GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(rownames(GRN@data$peaks$counts)) |
|
319 |
- stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata)) |
|
320 | 363 |
|
364 |
+ if (keepOriginalReadCounts) { |
|
365 |
+ GRN@data$peaks$counts_raw = .storeAsMatrixOrSparseMatrix(GRN, df = counts_peaks %>% dplyr::select("peakID", tidyselect::one_of(GRN@config$sharedSamples)), |
|
366 |
+ ID_column = "peakID", slotName = "GRN@data$peaks$counts_raw") |
|
367 |
+ } |
|
368 |
+ |
|
321 | 369 |
GRN@data$RNA$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["RNA"]], ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts") |
322 |
- |
|
323 |
- GRN@data$RNA$counts_metadata = tibble::tibble(ID = data.l[["RNA"]]$ENSEMBL, isFiltered = FALSE) |
|
324 |
- |
|
325 |
- GRN@data$RNA$counts_permuted_index = sample.int(ncol(GRN@data$RNA$counts), ncol(GRN@data$RNA$counts)) |
|
326 | 370 |
|
371 |
+ if (keepOriginalReadCounts) { |
|
372 |
+ GRN@data$RNA$counts_raw = .storeAsMatrixOrSparseMatrix(GRN, df = counts_rna %>% dplyr::select("ENSEMBL", tidyselect::one_of(GRN@config$sharedSamples)), |
|
373 |
+ ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts_raw") |
|
374 |
+ } |
|
327 | 375 |
|
376 |
+ GRN@data$RNA$counts_permuted_index = sample.int(ncol(GRN@data$RNA$counts), ncol(GRN@data$RNA$counts)) |
|
328 | 377 |
|
329 | 378 |
futile.logger::flog.info(paste0( "Final dimensions of data:")) |
330 | 379 |
futile.logger::flog.info(paste0( " RNA : ", nrow(countsRNA.norm.df) , " x ", ncol(countsRNA.norm.df) - 2, " (rows x columns)")) |
... | ... |
@@ -332,45 +381,23 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
332 | 381 |
# Create permutations for RNA |
333 | 382 |
futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts")) |
334 | 383 |
|
384 |
+ futile.logger::flog.info(paste0("Check for overlapping peaks...")) |
|
335 | 385 |
|
336 |
- futile.logger::flog.info(paste0("Creating consensus peaks and check for overlapping peaks...")) |
|
337 |
- |
|
338 |
- # Consensus peaks |
|
339 |
- |
|
340 |
- # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details |
|
341 |
- consensus.gr = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE) |
|
386 |
+ .checkOverlappingPeaks(GRN, allowOverlappingPeaks = allowOverlappingPeaks) |
|
342 | 387 |
|
343 |
- overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1) |
|
388 |
+ ## PEAK AND GENE ANNOTATION ## |
|
389 |
+ futile.logger::flog.info(paste0("Adding peak and gene annotation...")) |
|
344 | 390 |
|
345 |
- if (length(overlappingPeaks) > 0) { |
|
346 |
- |
|
347 |
- ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID |
|
348 |
- |
|
349 |
- messageAll = paste0(" ", length(overlappingPeaks), |
|
350 |
- " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","), |
|
351 |
- ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. " |
|
352 |
- ) |
|
353 |
- |
|
354 |
- |
|
355 |
- if (allowOverlappingPeaks) { |
|
356 |
- |
|
357 |
- message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.") |
|
358 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) |
|
359 |
- } else { |
|
360 |
- message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function") |
|
361 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
362 |
- } |
|
363 |
- |
|
364 |
- } |
|
391 |
+ GRN = .populatePeakAnnotation(GRN) |
|
392 |
+ GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") |
|
393 |
+ |
|
394 |
+ # Additional GC statistics, not used at the moment currently |
|
395 |
+ GRN = .calcAdditionalGCStatistics(GRN, GC.data.df) |
|
396 |
+ |
|
397 |
+ GRN = .populateGeneAnnotation(GRN) |
|
365 | 398 |
|
366 | 399 |
} |
367 |
- futile.logger::flog.info(paste0("Adding peak and gene annotation...")) |
|
368 | 400 |
|
369 |
- # Add peak annotation once |
|
370 |
- GRN = .populatePeakAnnotation(GRN) |
|
371 |
- |
|
372 |
- # Add gene annotation once |
|
373 |
- GRN = .populateGeneAnnotation(GRN) |
|
374 | 401 |
|
375 | 402 |
.printExecutionTime(start, prefix = "") |
376 | 403 |
|
... | ... |
@@ -378,6 +405,36 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
378 | 405 |
|
379 | 406 |
} |
380 | 407 |
|
408 |
+.checkOverlappingPeaks <- function (GRN, allowOverlappingPeaks) { |
|
409 |
+ |
|
410 |
+ # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details |
|
411 |
+ consensus.gr = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE) |
|
412 |
+ |
|
413 |
+ overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1) |
|
414 |
+ |
|
415 |
+ if (length(overlappingPeaks) > 0) { |
|
416 |
+ |
|
417 |
+ ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID |
|
418 |
+ |
|
419 |
+ messageAll = paste0(" ", length(overlappingPeaks), |
|
420 |
+ " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","), |
|
421 |
+ ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. " |
|
422 |
+ ) |
|
423 |
+ |
|
424 |
+ |
|
425 |
+ if (allowOverlappingPeaks) { |
|
426 |
+ |
|
427 |
+ message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.") |
|
428 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) |
|
429 |
+ } else { |
|
430 |
+ message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function") |
|
431 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
432 |
+ } |
|
433 |
+ |
|
434 |
+ } |
|
435 |
+} |
|
436 |
+ |
|
437 |
+ |
|
381 | 438 |
.storeAsMatrixOrSparseMatrix <- function (GRN, df, ID_column, slotName, threshold = 0.1) { |
382 | 439 |
|
383 | 440 |
stopifnot(identical(GRN@config$sharedSamples, colnames(df)[-1])) |
... | ... |
@@ -497,11 +554,18 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
497 | 554 |
|
498 | 555 |
GRN@annotation$peaks = metadata_peaks |
499 | 556 |
|
500 |
- if (!is.installed("ChIPseeker")) { |
|
501 |
- packageMessage = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", |
|
502 |
- " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.") |
|
503 |
- .checkPackageInstallation("ChIPseeker", packageMessage, isWarning = TRUE) |
|
557 |
+ if (!is.installed("ChIPseeker") | !.checkAndLoadPackagesGenomeAssembly(GRN@config$parameters$genomeAssembly, returnLogical = TRUE)) { |
|
558 |
+ if (!is.installed("ChIPseeker")) { |
|
559 |
+ packageMessage = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", |
|
560 |
+ " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.") |
|
561 |
+ .checkPackageInstallation("ChIPseeker", packageMessage, isWarning = TRUE) |
|
562 |
+ } else { |
|
563 |
+ |
|
564 |
+ # annotation packages missing, message will already been thrown |
|
565 |
+ } |
|
566 |
+ |
|
504 | 567 |
} else { |
568 |
+ |
|
505 | 569 |
|
506 | 570 |
futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while")) |
507 | 571 |
genomeAssembly = GRN@config$parameters$genomeAssembly |
... | ... |
@@ -555,12 +619,6 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
555 | 619 |
|
556 | 620 |
} |
557 | 621 |
|
558 |
- |
|
559 |
- |
|
560 |
- |
|
561 |
- # Also add GC content as annotation columns |
|
562 |
- GRN = .calcGCContentPeaks(GRN) |
|
563 |
- |
|
564 | 622 |
GRN |
565 | 623 |
|
566 | 624 |
} |
... | ... |
@@ -600,7 +658,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
600 | 658 |
} |
601 | 659 |
|
602 | 660 |
#' @importFrom rlang .data `:=` |
603 |
-.calcGCContentPeaks <- function(GRN) { |
|
661 |
+.calcGCContentPeaks <- function(GRN, nBins = 10) { |
|
604 | 662 |
|
605 | 663 |
futile.logger::flog.info(paste0("Calculate GC-content for peaks. This may take a while")) |
606 | 664 |
start = Sys.time() |
... | ... |
@@ -616,58 +674,429 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
616 | 674 |
# Get DNAStringSet object |
617 | 675 |
seqs_peaks = Biostrings::getSeq(genome, query) |
618 | 676 |
|
619 |
- GC_content.df = Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT") |
|
620 |
- |
|
621 |
- GC_content.df = GC_content.df %>% |
|
677 |
+ GC_content.df = (Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")) %>% |
|
622 | 678 |
tibble::as_tibble() %>% |
623 |
- dplyr::mutate(length = GenomicRanges::width(query), |
|
679 |
+ dplyr::mutate(peak.width = GenomicRanges::width(query), |
|
624 | 680 |
peak.ID = query$peakID, |
625 |
- GC_class = cut(.data$`G|C`, breaks = seq(0,1,0.1), include.lowest = TRUE, ordered_result = TRUE)) |
|
626 |
- |
|
627 |
- GC_classes.df = GC_content.df %>% |
|
628 |
- dplyr::group_by(.data$GC_class) %>% |
|
629 |
- dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(length), peak_width_sd = sd(length)) %>% |
|
630 |
- dplyr::ungroup() %>% |
|
631 |
- tidyr::complete(.data$GC_class, fill = list(n = 0)) %>% |
|
632 |
- dplyr::mutate(n_rel = .data$n / length(query)) |
|
633 |
- |
|
634 |
- # TODO: Put where |
|
635 |
- #ggplot2::ggplot(GC_content.df, ggplot2::aes(GC.class)) + geom_histogram(stat = "count") + ggplot2::theme_bw() |
|
636 |
- |
|
637 |
- #ggplot2::ggplot(GC_classes.df , ggplot2::aes(GC.class, n_rel)) + geom_bar(stat = "identity") + ggplot2::theme_bw() |
|
638 |
- |
|
639 |
- GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC_content.df, by = "peak.ID") %>% |
|
640 |
- dplyr::rename( peak.GC.perc = .data$`G|C`, |
|
641 |
- peak.width = .data$length, |
|
642 |
- peak.GC.class = .data$GC_class) |
|
643 |
- |
|
644 |
- GRN@stats$peaks = list() |
|
645 |
- GRN@stats$peaks$GC = GC_classes.df |
|
681 |
+ peak.GC.class = cut(.data$`G|C`, breaks = seq(0,1,1/nBins), include.lowest = TRUE, ordered_result = TRUE)) %>% |
|
682 |
+ dplyr::rename(peak.GC.perc = .data$`G|C`) %>% |
|
683 |
+ dplyr::select(peak.ID, everything()) |
|
646 | 684 |
|
685 |
+ |
|
647 | 686 |
.printExecutionTime(start) |
648 | 687 |
|
649 |
- GRN |
|
688 |
+ GC_content.df |
|
689 |
+} |
|
690 |
+ |
|
691 |
+.calcAdditionalGCStatistics <- function (GRN, GC.data) { |
|
692 |
+ |
|
693 |
+ GC_classes.df = GC.data %>% |
|
694 |
+ dplyr::group_by(.data$peak.GC.class) %>% |
|
695 |
+ dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(.data$peak.width), peak_width_sd = sd(.data$peak.width)) %>% |
|
696 |
+ dplyr::ungroup() %>% |
|
697 |
+ tidyr::complete(.data$peak.GC.class, fill = list(n = 0)) %>% |
|
698 |
+ dplyr::mutate(n_rel = .data$n / length(query)) |
|
699 |
+ |
|
700 |
+ # TODO: Put where |
|
701 |
+ #ggplot2::ggplot(GC.data, ggplot2::aes(GC.class)) + geom_histogram(stat = "count") + ggplot2::theme_bw() |
|
702 |
+ |
|
703 |
+ #ggplot2::ggplot(GC_classes.df , ggplot2::aes(GC.class, n_rel)) + geom_bar(stat = "identity") + ggplot2::theme_bw() |
|
704 |
+ |
|
705 |
+ |
|
706 |
+ GRN@stats$peaks = list() |
|
707 |
+ GRN@stats$peaks$GC = GC_classes.df |
|
708 |
+ |
|
709 |
+ GRN |
|
650 | 710 |
} |
651 | 711 |
|
712 |
+ |
|
713 |
+####### FILTER AND NORMALIZE DATA ############ |
|
714 |
+ |
|
715 |
+.normalizeCountMatrix <- function(GRN, data, normalization, additionalParams =list()) { |
|
716 |
+ |
|
717 |
+ checkmate::assertMatrix(data) |
|
718 |
+ checkmate::assertChoice(normalization, |
|
719 |
+ choices = c("limma_cyclicloess", "limma_quantile", "limma_scale", |
|
720 |
+ "csaw_cyclicLoess_orig", "csaw_TMM", |
|
721 |
+ "EDASeq_GC_peaks", "gcqn_peaks", |
|
722 |
+ "DESeq2_sizeFactors", |
|
723 |
+ "none")) |
|
724 |
+ |
|
725 |
+ # Create a DESeq2 object |
|
726 |
+ if (normalization == "DESeq2_sizeFactors" | normalization == "csaw_cyclicLoess_orig" | normalization == "csaw_TMM") { |
|
727 |
+ |
|
728 |
+ dd <- suppressMessages(DESeq2::DESeqDataSetFromMatrix(countData = data, |
|
729 |
+ colData = data.frame( sampleID = colnames(data)), |
|
730 |
+ design = stats::as.formula(" ~ 1"))) |
|
731 |
+ } |
|
732 |
+ |
|
733 |
+ # Common parameters |
|
734 |
+ if (normalization == "EDASeq_GC_peaks" | normalization == "gcqn_peaks") { |
|
735 |
+ |
|
736 |
+ # After either within-lane or between-lane normalization, the expression values are not counts anymore. |
|
737 |
+ # However, their distribution still shows some typical features of counts distribution (e.g., the variance depends on the mean). |
|
738 |
+ # Hence, for most applications, it is useful to round the normalized values to recover count-like values, which we refer to as “pseudo-counts”. |
|
739 |
+ # By default, both withinLaneNormalization and betweenLaneNormalization round the normalized values to the closest integer. |
|
740 |
+ # This behavior can be changed by specifying round=FALSE. This gives the user more flexibility and assures that rounding approximations do not affect subsequent computations (e.g., recovering the offset from the normalized counts). |
|
741 |
+ if ("roundResults" %in% names(additionalParams)) { |
|
742 |
+ roundResults = additionalParams$roundResults |
|
743 |
+ } else { |
|
744 |
+ roundResults = FALSE |
|
745 |
+ } |
|
746 |
+ |
|
747 |
+ checkmate::assertFlag(roundResults) |
|
748 |
+ |
|
749 |
+ if ("GC_data" %in% names(additionalParams)) { |
|
750 |
+ GC_data.df = additionalParams$GC_data |
|
751 |
+ } else { |
|
752 |
+ message = "GC_data is missing in additionalParams list." |
|
753 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
754 |
+ } |
|
755 |
+ |
|
756 |
+ } |
|
757 |
+ |
|
758 |
+ if (normalization == "limma_cyclicloess" | normalization == "limma_quantile" | normalization == "limma_scale") { |
|
759 |
+ |
|
760 |
+ futile.logger::flog.info(paste0(" Normalizing data using the package limma with the following method: ", normalization)) |
|
761 |
+ |
|
762 |
+ dataNorm = limma::normalizeBetweenArrays(data, method= sub("limma_", replacement = "", normalization)) |
|
763 |
+ |
|
764 |
+ } else if (normalization == "csaw_cyclicLoess_orig" | normalization == "csaw_TMM") { |
|
765 |
+ |
|
766 |
+ futile.logger::flog.info(paste0(" Normalizing data using the package csaw with the following method: ", normalization)) |
|
767 |
+ packageMessage = paste0("The package csaw is currently not installed, but however needed for the normalization methods \"csaw_cyclicLoess_orig\" and \"csaw_TMM\"") |
|
768 |
+ .checkPackageInstallation("csaw", packageMessage, isWarning = TRUE) |
|
769 |
+ |
|
770 |
+ if (packageVersion("csaw") <= "1.14.1") { |
|
771 |
+ message = "The version of the csaw package is too old, install at least version 1.14.1 or change the normalization method" |
|
772 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
773 |
+ } |
|
774 |
+ |
|
775 |
+ object = SummarizedExperiment::SummarizedExperiment(list(counts=data)) |
|
776 |
+ object$totals = colSums(data) |
|
777 |
+ |
|
778 |
+ if (normalization == "csaw_cyclicLoess_orig") { |
|
779 |
+ |
|
780 |
+ # Perform a cyclic loess normalization |
|
781 |
+ # We use a slighlty more complicated setup to derive size factors for library normalization |
|
782 |
+ # Instead of just determining the size factors in DeSeq2 via cirtual samples, we use |
|
783 |
+ # a normalization from the csaw package (see https://www.rdocumentation.org/packages/csaw/versions/1.6.1/topics/normOffsets) |
|
784 |
+ # and apply a non-linear normalization. |
|
785 |
+ # For each sample, a lowess curve is fitted to the log-counts against the log-average count. |
|
786 |
+ # The fitted value for each bin pair is used as the generalized linear model offset for that sample. |
|
787 |
+ # The use of the average count provides more stability than the average log-count when low counts are present for differentially bound regions. |
|
788 |
+ |
|
789 |
+ # since counts returns,by default, non-normalized counts, the following code should be fine and there is no need to also run estimateSizeFactors beforehand |
|
790 |
+ |
|
791 |
+ normFacs = csaw::normOffsets(object, se.out = FALSE) |
|
792 |
+ |
|
793 |
+ # the normalization factors matrix should not have 0's in it |
|
794 |
+ # it should have geometric mean near 1 for each row |
|
795 |
+ # exp(mean(log(normFacs[i,]))) for each row i |
|
796 |
+ normFacs <- normFacs / exp(rowMeans(log(normFacs))) |
|
797 |
+ |
|
798 |
+ rownames(normFacs) = rownames(data) |
|
799 |
+ colnames(normFacs) = colnames(data) |
|
800 |
+ |
|
801 |
+ futile.logger::flog.info(paste0(" Using the csaw-derived feature-specific normalization factors for DESeq, which will preempt sizeFactors")) |
|
802 |
+ |
|
803 |
+ |
|
804 |
+ DESeq2::normalizationFactors(dd) <- normFacs |
|
805 |
+ |
|
806 |
+ |
|
807 |
+ } else { # TMM |
|
808 |
+ |
|
809 |
+ # This function uses the trimmed mean of M-values (TMM) method to remove composition biases, typically in background regions of the genome. |
|
810 |
+ # The key difference from standard TMM is that precision weighting is turned off by default so as to avoid upweighting high-abundance regions. |
|
811 |
+ # These are more likely to be bound and thus more likely to be differentially bound. |
|
812 |
+ # Assigning excessive weight to such regions will defeat the purpose of trimming when normalizing the coverage of background regions. |
|
813 |
+ sizeFactors = csaw::normFactors(object, se.out = FALSE) |
|
814 |
+ |
|
815 |
+ futile.logger::flog.info(paste0(" Using the csaw-derived TMM-derived normalization factors as size factors, overriding the DESeq-default size factors.")) |
|
816 |
+ |
|
817 |
+ sizeFactors(dd) <- sizeFactors |
|
818 |
+ } |
|
819 |
+ |
|
820 |
+ dataNorm = DESeq2::counts(dd, normalized=TRUE) |
|
821 |
+ |
|
822 |
+ } else if (normalization == "EDASeq_GC_peaks") { |
|
823 |
+ |
|
824 |
+ packageMessage = paste0("The package EDASeq is currently not installed, but however needed for the normalization methods \"EDASeq_GC_peaks\"") |
|
825 |
+ .checkPackageInstallation("EDASeq", packageMessage, isWarning = TRUE) |
|
826 |
+ |
|
827 |
+ futile.logger::flog.info(paste0(" Normalizing data using the package EDASeq with the following method: ", normalization, " with the GC content as covariate.")) |
|
828 |
+ # https://bioconductor.org/packages/release/bioc/vignettes/EDASeq/inst/doc/EDASeq.html#normalization |
|
829 |
+ |
|
830 |
+ if ("nBins" %in% names(additionalParams)) { |
|
831 |
+ nBins = additionalParams$nBins |
|
832 |
+ } else { |
|
833 |
+ nBins = 10 |
|
834 |
+ } |
|
835 |
+ |
|
836 |
+ # We implemented four within-lane normalization methods, namely: |
|
837 |
+ # 1. loess robust local regression of read counts (log) on a gene feature such as GC-content (loess), |
|
838 |
+ # 2. global-scaling between feature strata using the median (median), |
|
839 |
+ # 3. global-scaling between feature strata using the upper-quartile (upper), |
|
840 |
+ # 4. and full-quantile normalization between feature strata (full). |
|
841 |
+ # For a discussion of these methods in context of GC-content normalization see (Risso et al. 2011). |
|
842 |
+ if ("withinLane_method" %in% names(additionalParams)) { |
|
843 |
+ withinLane_method = additionalParams$withinLane_method |
|
844 |
+ } else { |
|
845 |
+ withinLane_method = "full" |
|
846 |
+ } |
|
847 |
+ |
|
848 |
+ # Regarding between-lane normalization, the package implements three of the methods introduced in (Bullard et al. 2010): |
|
849 |
+ # global-scaling using the median (median), global-scaling using the upper-quartile (upper), and full-quantile normalization (full). |
|
850 |
+ if ("betweenLane_method" %in% names(additionalParams)) { |
|
851 |
+ betweenLane_method = additionalParams$withinLane_method |
|
852 |
+ } else { |
|
853 |
+ betweenLane_method = "full" |
|
854 |
+ } |
|
855 |
+ |
|
856 |
+ peaks_GC_fraction = GC_data.df$peak.GC.perc |
|
857 |
+ |
|
858 |
+ |
|
859 |
+ futile.logger::flog.info(paste0(" Using the following additional parameters for EDASeq: nBins = ", nBins, |
|
860 |
+ ", withinLane_method = ", withinLane_method, |
|
861 |
+ ", betweenLane_method = ", betweenLane_method, |
|
862 |
+ ", roundResults = ", roundResults, |
|
863 |
+ " as well as the automatically calculated peak GC content as covariate for withinLaneNormalization")) |
|
864 |
+ |
|
865 |
+ checkmate::assertNumeric(peaks_GC_fraction, lower = 0, upper = 1) |
|
866 |
+ checkmate::assertIntegerish(nBins, lower = 1, upper = 100) |
|
867 |
+ checkmate::assertChoice(withinLane_method, choices = c("loess","median","upper","full")) |
|
868 |
+ checkmate::assertChoice(betweenLane_method, choices = c("median","upper","full")) |
|
869 |
+ |
|
870 |
+ # Following (Risso et al. 2011), we consider two main types of effects on gene-level counts: |
|
871 |
+ # (1) within-lane gene-specific (and possibly lane-specific) effects, e.g., related to gene length or GC-content, and |
|
872 |
+ # (2) effects related to between-lane distributional differences, e.g., sequencing depth. |
|
873 |
+ # Accordingly, withinLaneNormalization and betweenLaneNormalization adjust for the first and second type of effects, respectively. |
|
874 |
+ # We recommend to normalize for within-lane effects prior to between-lane normalization. |
|
875 |
+ dataWithin <- withinLaneNormalization(data, y = peaks_GC_fraction, which= withinLane_method, num.bins = nBins, round = roundResults) |
|
876 |
+ dataNorm <- betweenLaneNormalization(dataWithin, which=betweenLane_method, round = roundResults) |
|
877 |
+ |
|
878 |
+ |
|
879 |
+ } else if (normalization == "gcqn_peaks") { |
|
880 |
+ |
|
881 |
+ futile.logger::flog.info(paste0(" Normalizing data using the GC-full-quantile (GC-FQ) normalization approach as described in Van den Berge et al. 2021 (https://doi.org/10.1101/2021.01.26.428252) using 50 bins.")) |
|
882 |
+ |
|
883 |
+ if ("nBins" %in% names(additionalParams)) { |
|
884 |
+ nBins = additionalParams$nBins |
|
885 |
+ } else { |
|
886 |
+ # Use the recommended 50 bins as described in the paper as default |
|
887 |
+ nBins = 50 |
|
888 |
+ } |
|
889 |
+ |
|
890 |
+ peaks_GC_class = cut(GC_data.df$peak.GC.perc, breaks = seq(0,1,1/nBins), include.lowest = TRUE, ordered_result = TRUE) |
|
891 |
+ |
|
892 |
+ dataNorm = .gcqn(data = data, |
|
893 |
+ GC_class = peaks_GC_class, |
|
894 |
+ summary='mean', round = roundResults) |
|
895 |
+ |
|
896 |
+ |
|
897 |
+ } else if (normalization == "DESeq2_sizeFactors") { |
|
898 |
+ |
|
899 |
+ futile.logger::flog.info(paste0(" Normalizing data using the package DESeq2 with a standard size factor normalization.")) |
|
900 |
+ |
|
901 |
+ dd = DESeq2::estimateSizeFactors(dd) |
|
902 |
+ dataNorm = DESeq2::counts(dd, normalized=TRUE) |
|
903 |
+ |
|
904 |
+ } else if (normalization == "none") { |
|
905 |
+ dataNorm = data |
|
906 |
+ futile.logger::flog.info(paste0(" Skip normalization.")) |
|
907 |
+ # Nothing to do, leave countsPeaks as they are |
|
908 |
+ } |
|
909 |
+ |
|
910 |
+ dataNorm |
|
911 |
+} |
|
912 |
+ |
|
913 |
+ |
|
914 |
+#The following functions are taken from https://github.com/koenvandenberge/bulkATACGC/blob/master/methods/gcqn_validated.R |
|
915 |
+ |
|
916 |
+### GCQN, first implementation |
|
917 |
+FQnorm <- function(counts, type="mean"){ |
|
918 |
+ rk <- apply(counts,2,rank,ties.method='min') |
|
919 |
+ counts.sort <- apply(counts,2,sort) |
|
920 |
+ if(type=="mean"){ |
|
921 |
+ # refdist <- apply(counts.sort,1,mean) |
|
922 |
+ refdist <- base::rowMeans(counts.sort) |
|
923 |
+ } else if(type=="median"){ |
|
924 |
+ #refdist <- apply(counts.sort,1,median) |
|
925 |
+ refdist <- matrixStats::rowMedians(counts.sort) |
|
926 |
+ } |
|
927 |
+ norm <- apply(rk,2,function(r){ refdist[r] }) |
|
928 |
+ rownames(norm) <- rownames(counts) |
|
929 |
+ return(norm) |
|
930 |
+} |
|
931 |
+ |
|
932 |
+ |
|
933 |
+# GC-full-quantile (GC-FQ) normalization. GC-FQ is similar to FQ-FQ, but relies on the observation that, in |
|
934 |
+# ATAC-seq, read count distributions are often more comparable between samples within a GC-content bin, than |
|
935 |
+# between GC-content bins within a sample (Figure 2). It therefore applies between-sample FQ normalization for |
|
936 |
+# each GC-content bin separately |
|
937 |
+.gcqn <- function(data, GC_class, summary='mean', round=FALSE){ |
|
938 |
+ |
|
939 |
+ gcBinNormCounts <- matrix(NA, nrow=nrow(data), ncol=ncol(data), dimnames=list(rownames(data),colnames(data))) |
|
940 |
+ |
|
941 |
+ for(ii in 1:nlevels(GC_class)){ |
|
942 |
+ |
|
943 |
+ id <- which(GC_class==levels(GC_class)[ii]) |
|
944 |
+ if(length(id) == 0) next |
|
945 |
+ if(length(id) == 1){ |
|
946 |
+ normCountBin <- data[id,] |
|
947 |
+ if(round) normCountBin <- round(normCountBin) |
|
948 |
+ gcBinNormCounts[id,] <- normCountBin |
|
949 |
+ next |
|
950 |
+ } |
|
951 |
+ countBin <- data[id,,drop=FALSE] |
|
952 |
+ if(summary=="mean"){ |
|
953 |
+ normCountBin <- FQnorm(countBin, type='mean') |
|
954 |
+ } else if(summary=="median"){ |
|
955 |
+ normCountBin <- FQnorm(countBin, type='median') |
|
956 |
+ } |
|
957 |
+ if(round) normCountBin <- round(normCountBin) |
|
958 |
+ normCountBin[normCountBin<0] <- 0 |
|
959 |
+ gcBinNormCounts[id,] <- normCountBin |
|
960 |
+ } |
|
961 |
+ return(gcBinNormCounts) |
|
962 |
+} |
|
963 |
+ |
|
964 |
+ |
|
965 |
+# peaksAnnotation = GRN@annotation$peaks |
|
966 |
+# counts = getCounts(GRN, type = "peaks", asMatrix = TRUE, includeFiltered = TRUE) |
|
967 |
+# Currently not applicable as we do not have different groups, qsmooth does not run with only one group |
|
968 |
+# |
|
969 |
+# .gcqn_qsmooth_mod <- function(counts, peaksAnnotation){ |
|
970 |
+# |
|
971 |
+# packageMessage = paste0("The package qsmooth is not installed, which is however needed for the chosen normalization method. Please install it and re-run this function or change the normalization method.") |
|
972 |
+# .checkPackageInstallation("qsmooth", packageMessage) |
|
973 |
+# |
|
974 |
+# groups_factors = factor(rep("A", length(GRN@config$sharedSamples))) |
|
975 |
+# |
|
976 |
+# gcBinNormCounts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts), dimnames=list(rownames(counts),colnames(counts))) |
|
977 |
+# for(ii in 1:nlevels(peaksAnnotation$peak.GC.class)){ |
|
978 |
+# id <- which(peaksAnnotation$peak.GC.class==levels(peaksAnnotation$peak.GC.class)[ii]) |
|
979 |
+# countBin <- counts[id,] |
|
980 |
+# qs <- qsmooth::qsmooth(countBin, group_factor=groups_factors) |
|
981 |
+# normCountBin <- qs@qsmoothData |
|
982 |
+# normCountBin <- round(normCountBin) |
|
983 |
+# normCountBin[normCountBin<0] <- 0 |
|
984 |
+# gcBinNormCounts[id,] <- normCountBin |
|
985 |
+# } |
|
986 |
+# return(gcBinNormCounts) |
|
987 |
+# } |
|
988 |
+ |
|
989 |
+ |
|
990 |
+ |
|
991 |
+ |
|
992 |
+ |
|
993 |
+ |
|
994 |
+# |
|
995 |
+# # Needed |
|
996 |
+# # Add DESeq2 normalization factors maybe? csaw stuff |
|
997 |
+# .normalizeCounts <- function(rawCounts, method = "quantile", ) { |
|
998 |
+# |
|
999 |
+# checkmate::assertChoice(idColumn, colnames(rawCounts)) |
|
1000 |
+# start = Sys.time() |
|
1001 |
+# |
|
1002 |
+# futile.logger::flog.info(paste0("Normalize counts. Method: ", method, ", ID column: ", idColumn)) |
|
1003 |
+# |
|
1004 |
+# |
|
1005 |
+# if (method == "quantile") { |
|
1006 |
+# |
|
1007 |
+# if (length(rmCols) > 0) { |
|
1008 |
+# input = as.matrix(rawCounts[,-rmCols]) |
|
1009 |
+# } else { |
|
1010 |
+# input = as.matrix(rawCounts) |
|
1011 |
+# } |
|
1012 |
+# |
|
1013 |
+# # We use limma for normalizing quantiles and NOT preprocessCore as before due to regression bugs for version >1.50 |
|
1014 |
+# counts.norm = limma::normalizeQuantiles(input) |
|
1015 |
+# |
|
1016 |
+# } else if (method == "DESeq_sizeFactor") { |
|
1017 |
+# |
|
1018 |
+# if (length(rmCols) > 0) { |
|
1019 |
+# sampleData.df = data.frame( sampleID = colnames(rawCounts)[-rmCols], stringsAsFactors = FALSE) |
|
1020 |
+# countDataNew = as.data.frame(rawCounts[, -rmCols]) |
|
1021 |
+# } else { |
|
1022 |
+# sampleData.df = data.frame( sampleID = colnames(rawCounts)) |
|
1023 |
+# countDataNew = as.data.frame(rawCounts) |
|
1024 |
+# } |
|
1025 |
+# |
|
1026 |
+# rownames(countDataNew) = ids |
|
1027 |
+# |
|
1028 |
+# stopifnot(identical(sampleData.df$sampleID, colnames(countDataNew))) |
|
1029 |
+# |
|
1030 |
+# dd <- DESeq2::DESeqDataSetFromMatrix(countData = countDataNew, |
|
1031 |
+# colData = sampleData.df, |
|
1032 |
+# design = stats::as.formula(" ~ 1")) |
|
1033 |
+# |
|
1034 |
+# dd = DESeq2::estimateSizeFactors(dd) |
|
1035 |
+# counts.norm = DESeq2::counts(dd, normalized = TRUE) |
|
1036 |
+# |
|
1037 |
+# if (returnDESeqObj) { |
|
1038 |
+# return(dd) |
|
1039 |
+# } |
|
1040 |
+# |
|
1041 |
+# |
|
1042 |
+# } else if (method == "none") { |
|
1043 |
+# |
|
1044 |
+# if (length(rmCols) > 0) { |
|
1045 |
+# counts.norm = rawCounts[,-rmCols] |
|
1046 |
+# } else { |
|
1047 |
+# counts.norm = rawCounts |
|
1048 |
+# } |
|
1049 |
+# |
|
1050 |
+# } else { |
|
1051 |
+# stop("Not implemented yet") |
|
1052 |
+# } |
|
1053 |
+# |
|
1054 |
+# .printExecutionTime(start) |
|
1055 |
+# |
|
1056 |
+# counts.norm = counts.norm %>% |
|
1057 |
+# as.data.frame() %>% |
|
1058 |
+# tibble::as_tibble() %>% |
|
1059 |
+# dplyr::mutate({{idColumn}} := ids) %>% |
|
1060 |
+# dplyr::select({{idColumn}}, tidyselect::everything()) |
|
1061 |
+# |
|
1062 |
+# colnames(counts.norm) = c(idColumn, colnames_samples) |
|
1063 |
+# |
|
1064 |
+# counts.norm |
|
1065 |
+# |
|
1066 |
+# } |
|
1067 |
+ |
|
1068 |
+ |
|
1069 |
+ |
|
652 | 1070 |
#' Filter RNA-seq and/or peak data from a \code{\linkS4class{GRN}} object |
653 | 1071 |
#' |
654 |
-#' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria. Filtered genes / peaks will then be |
|
655 |
-#' disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.} |
|
1072 |
+#' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria and is based on the count data AFTER |
|
1073 |
+#' potential normalization as chosen when using the \code{\link{addData}} function. Most of the filters may not be meaningful and useful anymore to apply |
|
1074 |
+#' after using particular normalization schemes that can give rise to, for example, negative values such as cyclic loess normalization. If normalized counts do |
|
1075 |
+#' not represents counts anymore but rather a deviation from a mean or something a like, the filtering critieria usually do not make sense anymore. |
|
1076 |
+#' Filtered genes / peaks will then be disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.} |
|
656 | 1077 |
#' |
657 | 1078 |
#' All this function does is setting (or modifying) the filtering flag in \code{GRN@data$peaks$counts_metadata} and \code{GRN@data$RNA$counts_metadata}, respectively. |
658 | 1079 |
#' |
659 | 1080 |
#' @template GRN |
660 | 1081 |
#' @param minNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default 5. Minimum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter. |
1082 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
661 | 1083 |
#' @param maxNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter. |
1084 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
662 | 1085 |
#' @param minNormalizedMeanRNA Numeric[0,] or \code{NULL}. Default 5. Minimum mean across all samples for a gene to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter. |
1086 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
663 | 1087 |
#' @param maxNormalizedMeanRNA Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum mean across all samples for a gene to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter. |
1088 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
664 | 1089 |
#' @param chrToKeep_peaks Character vector or \code{NULL}. Default \code{NULL}. Vector of chromosomes that peaks are allowed to come from. This filter can be used to filter sex chromosomes from the peaks, for example (e.g, \code{c(paste0("chr", 1:22), "chrX", "chrY")}) |
665 |
-#' @param minSize_peaks Integer[1,] or \code{NULL}. Default \code{NULL}. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter. |
|
1090 |
+#' @param minSize_peaks Integer[1,] or \code{NULL}. Default 20. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter. |
|
666 | 1091 |
#' @param maxSize_peaks Integer[1,] or \code{NULL}. Default 10000. Maximum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter. |
667 | 1092 |
#' @param minCV_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum CV (coefficient of variation, a unitless measure of variation) for a peak to be retained. Set to \code{NULL} for not applying the filter. |
1093 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
668 | 1094 |
#' @param maxCV_peaks Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum CV (coefficient of variation, a unitless measure of variation) for a peak to be retained. Set to \code{NULL} for not applying the filter. |
1095 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
669 | 1096 |
#' @param minCV_genes Numeric[0,] or \code{NULL}. Default \code{NULL}. Minimum CV (coefficient of variation, a unitless measure of variation) for a gene to be retained. Set to \code{NULL} for not applying the filter. |
1097 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
670 | 1098 |
#' @param maxCV_genes Numeric[0,] or \code{NULL}. Default \code{NULL}. Maximum CV (coefficient of variation, a unitless measure of variation) for a gene to be retained. Set to \code{NULL} for not applying the filter. |
1099 |
+#' Be aware that depending on the chosen normalization, this filter may not make sense and should NOT be applied. See the notes for this function. |
|
671 | 1100 |
#' @template forceRerun |
672 | 1101 |
#' @return An updated \code{\linkS4class{GRN}} object, with added data from this function. |
673 | 1102 |
#' @examples |
... | ... |
@@ -676,10 +1105,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
676 | 1105 |
#' GRN = filterData(GRN, forceRerun = FALSE) |
677 | 1106 |
#' @export |
678 | 1107 |
filterData <- function (GRN, |
679 |
- minNormalizedMean_peaks = 5, maxNormalizedMean_peaks = NULL, |
|
680 |
- minNormalizedMeanRNA = 1, maxNormalizedMeanRNA = NULL, |
|
1108 |
+ minNormalizedMean_peaks = NULL, maxNormalizedMean_peaks = NULL, |
|
1109 |
+ minNormalizedMeanRNA = NULL, maxNormalizedMeanRNA = NULL, |
|
681 | 1110 |
chrToKeep_peaks = NULL, |
682 |
- minSize_peaks = NULL, maxSize_peaks = 10000, |
|
1111 |
+ minSize_peaks = 20, maxSize_peaks = 10000, |
|
683 | 1112 |
minCV_peaks = NULL, maxCV_peaks = NULL, |
684 | 1113 |
minCV_genes = NULL, maxCV_genes = NULL, |
685 | 1114 |
forceRerun = FALSE) { |
... | ... |
@@ -761,13 +1190,13 @@ filterData <- function (GRN, |
761 | 1190 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
762 | 1191 |
} |
763 | 1192 |
|
764 |
- rowMeans = rowMeans(getCounts(GRN, type = "rna", asMatrix = TRUE, includeFiltered = TRUE)) |
|
765 |
- GRN@data$RNA$counts_metadata$isFiltered = rowMeans < minNormalizedMeanRNA |
|
1193 |
+ |
|
1194 |
+ GRN@data$RNA$counts_metadata$isFiltered = ! GRN@data$RNA$counts_metadata$ID %in% genes.CV |
|
766 | 1195 |
|
767 | 1196 |
nRowsFlagged = length(which(GRN@data$RNA$counts_metadata$isFiltered)) |
768 | 1197 |
|
769 | 1198 |
# Raw counts are left untouched and filtered where needed only |
770 |
- futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows because the row mean was smaller than ", minNormalizedMeanRNA)) |
|
1199 |
+ futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows due to filtering criteria")) |
|
771 | 1200 |
|
772 | 1201 |
GRN@config$isFiltered = TRUE |
773 | 1202 |
} |
... | ... |
@@ -793,7 +1222,7 @@ filterData <- function (GRN, |
793 | 1222 |
} |
794 | 1223 |
|
795 | 1224 |
|
796 |
- futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those smaller than : ", maxSize_peaks)) |
|
1225 |
+ futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those smaller than : ", maxSize_peaks, " and bigger than ", minSize_peaks)) |
|
797 | 1226 |
futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$counts_metadata))) |
798 | 1227 |
|
799 | 1228 |
countsPeaks.clean = GRN@data$peaks$counts_metadata %>% |
... | ... |
@@ -1366,7 +1795,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac |
1366 | 1795 |
tibble::as_tibble() |
1367 | 1796 |
|
1368 | 1797 |
# TODO replace by getter |
1369 |
- countsPeaks = .normalizeNewDataWrapper(GRN@data$peaks$counts_orig, normalization = normalization) |
|
1798 |
+ countsPeaks = .normalizeCountMatrix(GRN@data$peaks$counts_orig, normalization = normalization) |
|
1370 | 1799 |
|
1371 | 1800 |
stopifnot(identical(nrow(countsPeaks), nrow(GRN@data$TFs$TF_peak_overlap))) |
1372 | 1801 |
|
... | ... |
@@ -1403,9 +1832,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac |
1403 | 1832 |
|
1404 | 1833 |
# Store as data frame with both TF names and Ensembl IDs, in analogy to the other types of TF data that can be imported |
1405 | 1834 |
GRN@data$TFs[[name]] = TF.activity.m %>% |
1406 |
- as.data.frame() %>% |
|
1407 |
- tibble::rownames_to_column("TF.name") %>% |
|
1408 |
- tibble::as_tibble() %>% |
|
1835 |
+ tibble::as_tibble(rownames = "TF.name") %>% |
|
1409 | 1836 |
dplyr::left_join(GRN@annotation$TFs, by = "TF.name") %>% |
1410 | 1837 |
dplyr::select(.data$ENSEMBL, .data$TF.name, tidyselect::all_of(GRN@config$sharedSamples)) |
1411 | 1838 |
|
... | ... |
@@ -1425,77 +1852,6 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac |
1425 | 1852 |
|
1426 | 1853 |
} |
1427 | 1854 |
|
1428 |
-.normalizeNewDataWrapper <- function(data, normalization, idColumn = "peakID") { |
|
1429 |
- |
|
1430 |
- if (checkmate::testClass(data, "DESeqDataSet")) { |
|
1431 |
- |
|
1432 |
- counts_raw = DESeq2::counts(data, normalized = FALSE) |
|
1433 |
- |
|
1434 |
- } else { |
|
1435 |
- |
|
1436 |
- # Capture incompatible cases |
|
1437 |
- if (normalization == "cyclicLoess" | normalization == "sizeFactors") { |
|
1438 |
- message = paste0("Selected normalization method for TF activity (", normalization, ") cannot be performed because the provided counts are not integer only. Select either \"quantile\" or \"none\" as normalization method for TF activity.") |
|
1439 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
1440 |
- } |
|
1441 |
- |
|
1442 |
- counts_raw = data |
|
1443 |
- } |
|
1444 |
- |
|
1445 |
- |
|
1446 |
- if (normalization == "cyclicLoess") { |
|
1447 |
- |
|
1448 |
- futile.logger::flog.info(paste0(" Normalizing data using cyclic LOESS")) |
|
1449 |
- |
|
1450 |
- |
|
1451 |
- packageMessage = paste0("The package csaw is not installed but required for the cyclic LOESS normalization. Please install it and re-run this function or change the normalization type (if possible).") |
|
1452 |
- .checkPackageInstallation("csaw", packageMessage) |
|
1453 |
- |
|
1454 |
- # Perform a cyclic loess normalization |
|
1455 |
- # We use a slighlty more complicated setup to derive size factors for library normalization |
|
1456 |
- # Instead of just determining the size factors in DeSeq2 via cirtual samples, we use |
|
1457 |
- # a normalization from the csaw package (see https://www.rdocumentation.org/packages/csaw/versions/1.6.1/topics/normOffsets) |
|
1458 |
- # and apply a non-linear normalization. |
|
1459 |
- # For each sample, a lowess curve is fitted to the log-counts against the log-average count. |
|
1460 |
- # The fitted value for each bin pair is used as the generalized linear model offset for that sample. |
|
1461 |
- # The use of the average count provides more stability than the average log-count when low counts are present for differentially bound regions. |
|
1462 |
- |
|
1463 |
- # since counts returns,by default, non-normalized counts, the following code should be fine and there is no need to also run estimateSizeFactors beforehand |
|
1464 |
- |
|
1465 |
- if (packageVersion("csaw") <= "1.14.1") { |
|
1466 |
- normFacs = exp((csaw::normOffsets(data, lib.sizes = colSums(data), type = "loess"))) |
|
1467 |
- } else { |
|
1468 |
- object = SummarizedExperiment::SummarizedExperiment(list(counts=counts_raw)) |
|
1469 |
- object$totals = colSums(counts_raw) |
|
1470 |
- normFacs = exp(csaw::normOffsets(object, se.out = FALSE)) |
|
1471 |
- } |
|
1472 |
- |
|
1473 |
- rownames(normFacs) = rownames(data) |
|
1474 |
- colnames(normFacs) = colnames(data) |
|
1475 |
- |
|
1476 |
- # We now provide gene-specific normalization factors for each sample as a matrix, which will preempt sizeFactors |
|
1477 |
- DESeq2::normalizationFactors(data) <- normFacs |
|
1478 |
- dataNorm = DESeq2::counts(data, normalized=TRUE) |
|
1479 |
- |
|
1480 |
- } else if (normalization == "sizeFactors") { |
|
1481 |
- |
|
1482 |
- futile.logger::flog.info(paste0(" Normalizing data using DESeq size factors")) |
|
1483 |
- data = DESeq2::estimateSizeFactors(data) |
|
1484 |
- dataNorm = DESeq2::counts(data, normalized=TRUE) |
|
1485 |
- |
|
1486 |
- } else if (normalization == "quantile") { |
|
1487 |
- |
|
1488 |
- futile.logger::flog.info(paste0(" Normalizing data using quantile normalization")) |
|
1489 |
- dataNorm = .normalizeCounts(data, method = "quantile", idColumn = idColumn) |
|
1490 |
- |
|
1491 |
- } else if (normalization == "none") { |
|
1492 |
- dataNorm = data |
|
1493 |
- futile.logger::flog.info(paste0(" Skip normalization.")) |
|
1494 |
- # Nothing to do, leave countsPeaks as they are |
|
1495 |
- } |
|
1496 |
- |
|
1497 |
- dataNorm |
|
1498 |
-} |
|
1499 | 1855 |
|
1500 | 1856 |
#' Import externally derived TF Activity data. EXPERIMENTAL. |
1501 | 1857 |
#' |
... | ... |
@@ -1565,7 +1921,8 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF |
1565 | 1921 |
# data = dplyr::select(data, -tidyselect::one_of(nameColumn)) |
1566 | 1922 |
|
1567 | 1923 |
# Only TF.names are unique |
1568 |
- countsNorm = .normalizeNewDataWrapper(data %>% dplyr::select(-.data$ENSEMBL), normalization = normalization, idColumn = "TF.name") |
|
1924 |
+ # TODO fix |
|
1925 |
+ countsNorm = .normalizeCountMatrix(data %>% dplyr::select(-.data$ENSEMBL), normalization = normalization) |
|
1569 | 1926 |
|
1570 | 1927 |
# Check overlap of ENSEMBL IDs |
1571 | 1928 |
countsNorm$ENSEMBL = data$ENSEMBL |
... | ... |
@@ -37,106 +37,6 @@ |
37 | 37 |
} |
38 | 38 |
|
39 | 39 |
|
40 |
-# Needed |
|
41 |
-.normalizeCounts <- function(rawCounts, method = "quantile", idColumn, removeCols = c(), returnDESeqObj = FALSE) { |
|
42 |
- |
|
43 |
- start = Sys.time() |
|
44 |
- |
|
45 |
- futile.logger::flog.info(paste0("Normalize counts. Method: ", method, ", ID column: ", idColumn)) |
|
46 |
- |
|
47 |
- if (is.null(idColumn)) { |
|
48 |
- rawCounts = as.data.frame(rawCounts) %>% |
|
49 |
- tibble::rownames_to_column("ENSEMBL") |
|
50 |
- idColumn = "ENSEMBL" |
|
51 |
- } |
|
52 |
- |
|
53 |
- ids = as.character(unlist(rawCounts[,idColumn])) |
|
54 |
- |
|
55 |
- # Test for additional character columns |
|
56 |
- colTypes = sapply(rawCounts, class) |
|
57 |
- colTypes_rm = colTypes[which(colTypes == "character" | colTypes == "factor" | |
|
58 |
- names(colTypes) %in% removeCols)] |
|
59 |
- |
|
60 |
- if (length(colTypes_rm) > 1) { |
|
61 |
- colnames_rm = names(colTypes_rm)[which(names(colTypes_rm) != idColumn)] |
|
62 |
- futile.logger::flog.info(paste0("Remove the following columns because they do not represent counts: ", paste0(colnames_rm, collapse = ","))) |
|
63 |
- colnames_rm = c(colnames_rm, idColumn) |
|
64 |
- } else { |
|
65 |
- colnames_rm = c(idColumn) |
|
66 |
- } |
|
67 |
- |
|
68 |
- if (length(colnames_rm) > 0) { |
|
69 |
- colnames_samples = colnames(rawCounts)[-which(colnames(rawCounts) %in% colnames_rm)] |
|
70 |
- } else { |
|
71 |
- colnames_samples = colnames(rawCounts) |
|
72 |
- } |
|
73 |
- |
|
74 |
- rmCols = which(colnames(rawCounts) %in% colnames_rm) |
|
75 |
- |
|
76 |
- if (method == "quantile") { |
|
77 |
- |
|
78 |
- if (length(rmCols) > 0) { |
|
79 |
- input = as.matrix(rawCounts[,-rmCols]) |
|
80 |
- } else { |
|
81 |
- input = as.matrix(rawCounts) |
|
82 |
- } |
|
83 |
- |
|
84 |
- # We use limma for normalizing quantiles and NOT preprocessCore as before due to regression bugs for version >1.50 |
|
85 |
- counts.norm = limma::normalizeQuantiles(input) |
|
86 |
- |
|
87 |
- } else if (method == "DESeq_sizeFactor") { |
|
88 |
- |
|
89 |
- if (length(rmCols) > 0) { |
|
90 |
- sampleData.df = data.frame( sampleID = colnames(rawCounts)[-rmCols], stringsAsFactors = FALSE) |
|
91 |
- countDataNew = as.data.frame(rawCounts[, -rmCols]) |
|
92 |
- } else { |
|
93 |
- sampleData.df = data.frame( sampleID = colnames(rawCounts)) |
|
94 |
- countDataNew = as.data.frame(rawCounts) |
|
95 |
- } |
|
96 |
- |
|
97 |
- rownames(countDataNew) = ids |
|
98 |
- |
|
99 |
- stopifnot(identical(sampleData.df$sampleID, colnames(countDataNew))) |
|
100 |
- |
|
101 |
- dd <- DESeq2::DESeqDataSetFromMatrix(countData = countDataNew, |
|
102 |
- colData = sampleData.df, |
|
103 |
- design = stats::as.formula(" ~ 1")) |
|
104 |
- |
|
105 |
- dd = DESeq2::estimateSizeFactors(dd) |
|
106 |
- counts.norm = DESeq2::counts(dd, normalized = TRUE) |
|
107 |
- |
|
108 |
- if (returnDESeqObj) { |
|
109 |
- return(dd) |
|
110 |
- } |
|
111 |
- |
|
112 |
- |
|
113 |
- } else if (method == "none") { |
|
114 |
- |
|
115 |
- if (length(rmCols) > 0) { |
|
116 |
- counts.norm = rawCounts[,-rmCols] |
|
117 |
- } else { |
|
118 |
- counts.norm = rawCounts |
|
119 |
- } |
|
120 |
- |
|
121 |
- } else { |
|
122 |
- stop("Not implemented yet") |
|
123 |
- } |
|
124 |
- |
|
125 |
- .printExecutionTime(start) |
|
126 |
- |
|
127 |
- counts.norm = counts.norm %>% |
|
128 |
- as.data.frame() %>% |
|
129 |
- tibble::as_tibble() %>% |
|
130 |
- dplyr::mutate({{idColumn}} := ids) %>% |
|
131 |
- dplyr::select({{idColumn}}, tidyselect::everything()) |
|
132 |
- |
|
133 |
- colnames(counts.norm) = c(idColumn, colnames_samples) |
|
134 |
- |
|
135 |
- counts.norm |
|
136 |
- |
|
137 |
-} |
|
138 |
- |
|
139 |
- |
|
140 | 40 |
# Needed |
141 | 41 |
.intersectData <- function(countsRNA, countsPeaks, idColumn_RNA = "ENSEMBL", idColumn_peaks = "peakID") { |
142 | 42 |
|
... | ... |
@@ -105,7 +105,7 @@ |
105 | 105 |
# PACKAGE LOADING AND DETACHING FUNCTIONS # |
106 | 106 |
########################################### |
107 | 107 |
|
108 |
-.checkPackageInstallation <- function(pkg, message, isWarning = FALSE) { |
|
108 |
+.checkPackageInstallation <- function(pkg, message, isWarning = FALSE, returnLogical = FALSE) { |
|
109 | 109 |
|
110 | 110 |
pkgMissing = c() |
111 | 111 |
for (packageCur in pkg) { |
... | ... |
@@ -118,26 +118,40 @@ |
118 | 118 |
message = paste0(message, "\n\nExecute the following line in R to install the missing package(s): `BiocManager::install(c(\"", |
119 | 119 |
paste0(pkgMissing, collapse = "\",\""), |
120 | 120 |
"\"))`") |
121 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = isWarning) |
|
121 |
+ |
|
122 |
+ # Normal behavior |
|
123 |
+ if (!returnLogical) { |
|
124 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = isWarning) |
|
125 |
+ } else { |
|
126 |
+ |
|
127 |
+ # Make it just a warning but return FALSE |
|
128 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) |
|
129 |
+ return(FALSE) |
|
130 |
+ } |
|
131 |
+ |
|
132 |
+ } else { |
|
133 |
+ if (returnLogical) return(TRUE) |
|
122 | 134 |
} |
123 | 135 |
|
124 | 136 |
|
125 | 137 |
} |
126 | 138 |
|
127 | 139 |
|
128 |
-.checkAndLoadPackagesGenomeAssembly <- function(genomeAssembly) { |
|
140 |
+.checkAndLoadPackagesGenomeAssembly <- function(genomeAssembly, returnLogical = FALSE) { |
|
129 | 141 |
|
130 |
- baseMessage = paste0("For the chosen genome assembly version, particular genome annotation packages are required but not installed. Please install and re-run this function.") |
|
142 |
+ baseMessage = paste0("For the chosen genome assembly version and package function, particular genome annotation packages are required but not installed. Please install and re-run this function.") |
|
131 | 143 |
|
132 | 144 |
if (genomeAssembly == "hg38") { |
133 |
- .checkPackageInstallation(c("org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene", "BSgenome.Hsapiens.UCSC.hg38"), baseMessage) |
|
145 |
+ res = .checkPackageInstallation(c("org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene", "BSgenome.Hsapiens.UCSC.hg38"), baseMessage, returnLogical = returnLogical) |
|
134 | 146 |
} else if (genomeAssembly == "hg19") { |
135 |
- .checkPackageInstallation(c("org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene", "BSgenome.Hsapiens.UCSC.hg19"), baseMessage) |
|
147 |
+ res = .checkPackageInstallation(c("org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene", "BSgenome.Hsapiens.UCSC.hg19"), baseMessage, returnLogical = returnLogical) |
|
136 | 148 |
} else if (genomeAssembly == "mm10") { |
137 |
- .checkPackageInstallation(c("org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene", "BSgenome.Mmusculus.UCSC.mm10"), baseMessage) |
|
149 |
+ res = .checkPackageInstallation(c("org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene", "BSgenome.Mmusculus.UCSC.mm10"), baseMessage, returnLogical = returnLogical) |
|
138 | 150 |
} else if (genomeAssembly == "mm9") { |
139 |
- .checkPackageInstallation(c("org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene", "BSgenome.Mmusculus.UCSC.mm9"), baseMessage) |
|
151 |
+ res = .checkPackageInstallation(c("org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene", "BSgenome.Mmusculus.UCSC.mm9"), baseMessage, returnLogical = returnLogical) |
|
140 | 152 |
} |
153 |
+ |
|
154 |
+ if (returnLogical) return(res) |
|
141 | 155 |
|
142 | 156 |
} |
143 | 157 |
|
... | ... |
@@ -415,7 +429,7 @@ |
415 | 429 |
} |
416 | 430 |
|
417 | 431 |
|
418 |
- |
|
432 |
+# Only needed here: get CG content peaks (can be made optional), and .populatePeakAnnotation (within ChipSeeker) |
|
419 | 433 |
.getGenomeObject <- function(genomeAssembly, type = "txbd") { |
420 | 434 |
|
421 | 435 |
checkmate::assertChoice(type, c("txbd", "BSgenome", "packageName")) |
... | ... |
@@ -467,9 +481,22 @@ |
467 | 481 |
obj |
468 | 482 |
} |
469 | 483 |
|
484 |
+# .getChrLengths <- function(genomeAssembly) { |
|
485 |
+# txdb = .getGenomeObject(genomeAssembly) |
|
486 |
+# GenomeInfoDb::seqlengths(txdb) |
|
487 |
+# } |
|
488 |
+ |
|
489 |
+# Try an approach that is lighter and doesnt require large annotation packages |
|
490 |
+ |
|
491 |
+#' @import GenomeInfoDb |
|
470 | 492 |
.getChrLengths <- function(genomeAssembly) { |
471 |
- txdb = .getGenomeObject(genomeAssembly) |
|
472 |
- GenomeInfoDb::seqlengths(txdb) |
|
493 |
+ |
|
494 |
+ chrSizes = GenomeInfoDb::getChromInfoFromUCSC(genomeAssembly) |
|
495 |
+ |
|
496 |
+ chrSizes.vec = chrSizes$size |
|
497 |
+ names(chrSizes.vec) = chrSizes$chrom |
|
498 |
+ |
|
499 |
+ chrSizes.vec |
|
473 | 500 |
} |
474 | 501 |
|
475 | 502 |
|
... | ... |
@@ -608,7 +608,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
608 | 608 |
# maxGSSize = maxGSSize, |
609 | 609 |
# pAdjustMethod = pAdjustMethod) |
610 | 610 |
|
611 |
- # go.res.new = .createEnichmentTable(go_enrichment) |
|
611 |
+ # go.res.new = .createEnrichmentTable(go_enrichment) |
|
612 | 612 |
|
613 | 613 |
# The need of p-value adjustment: https://bioconductor.org/packages/devel/bioc/vignettes/topGO/inst/doc/topGO.pdf |
614 | 614 |
|
... | ... |
@@ -679,7 +679,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
679 | 679 |
} |
680 | 680 |
) |
681 | 681 |
|
682 |
- result.list[["results"]] = .createEnichmentTable(kegg_enrichment) |
|
682 |
+ result.list[["results"]] = .createEnrichmentTable(kegg_enrichment) |
|
683 | 683 |
|
684 | 684 |
} |
685 | 685 |
|
... | ... |
@@ -706,7 +706,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
706 | 706 |
} |
707 | 707 |
) |
708 | 708 |
|
709 |
- result.list[["results"]] = .createEnichmentTable(reactome_enrichment) |
|
709 |
+ result.list[["results"]] = .createEnrichmentTable(reactome_enrichment) |
|
710 | 710 |
|
711 | 711 |
} |
712 | 712 |
|
... | ... |
@@ -734,7 +734,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
734 | 734 |
} |
735 | 735 |
) |
736 | 736 |
|
737 |
- result.list[["results"]] = .createEnichmentTable(DO_enrichment) |
|
737 |
+ result.list[["results"]] = .createEnrichmentTable(DO_enrichment) |
|
738 | 738 |
|
739 | 739 |
} |
740 | 740 |
|
... | ... |
@@ -760,7 +760,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
760 | 760 |
} |
761 | 761 |
|
762 | 762 |
|
763 |
-.createEnichmentTable <- function (enrichmentObj) { |
|
763 |
+.createEnrichmentTable <- function (enrichmentObj) { |
|
764 | 764 |
|
765 | 765 |
if (!is.null(enrichmentObj)) { |
766 | 766 |
|
767 | 767 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,172 @@ |
1 |
+ |
|
2 |
+plotCorrelations <- function(GRN, TF_peak.fdr.threshold = 0.2, TF_peak.r.abs.threshold = 0.3, randomized = TRUE, |
|
3 |
+ topn = 100, maxAll = 30000, TF.names = NULL, peak.IDs = NULL, |
|
4 |
+ corMethod = "pearson", filePDF) { |
|
5 |
+ |
|
6 |
+ checkmate::assertChoice(sortby, colnames(GRN@connections$TF_peaks$`0`$main)) |
|
7 |
+ |
|
8 |
+ con.filt = GRN@connections$TF_peaks$`0`$main %>% |
|
9 |
+ dplyr::mutate(TF_peak.r_abs = abs(TF_peak.r)) %>% |
|
10 |
+ dplyr::filter(.data$TF_peak.fdr <= TF_peak.fdr.threshold, |
|
11 |
+ .data$TF_peak.r_abs >= TF_peak.r.abs.threshold) |
|
12 |
+ |
|
13 |
+ |
|
14 |
+ if (!is.null(TF.names)) { |
|
15 |
+ con.filt = dplyr::filter(con.filt, .data$Tf.name %in% TF.names) |
|
16 |
+ } |
|
17 |
+ |
|
18 |
+ if (!is.null(peak.IDs)) { |
|
19 |
+ con.filt = dplyr::filter(con.filt, .data$peak.ID %in% peak.IDs) |
|
20 |
+ } |
|
21 |
+ |
|
22 |
+ if (randomized) { |
|
23 |
+ con.filt = dplyr::slice_sample(con.filt, n = topn) |
|
24 |
+ } else { |
|
25 |
+ con.filt = con.filt %>% |
|
26 |
+ dplyr::arrange(desc(TF_peak.r_abs)) %>% |
|
27 |
+ dplyr::slice_head(con.filt, n = topn) |
|
28 |
+ } |
|
29 |
+ |
|
30 |
+ pdf(filePDF, height = 10) |
|
31 |
+ |
|
32 |
+ for (i in seq_len(topn)) { |
|
33 |
+ |
|
34 |
+ TF.name = con.filt$TF.name[i] |
|
35 |
+ TF.ENSEMBL = GRN@annotation$TFs$TF.ENSEMBL[which(GRN@annotation$TFs$TF.name == TF.name)] |
|
36 |
+ peak.peakID = con.filt$peak.ID[i] |
|
37 |
+ corCalc = con.filt$TF_peak.r[i] |
|
38 |
+ fdrCur = con.filt$TF_peak.fdr[i] |
|
39 |
+ |
|
40 |
+ cor_cur = cor(GRN@data$RNA$counts[TF.ENSEMBL, ], GRN@data$peaks$counts[peak.peakID,], method = corMethod) |
|
41 |
+ |
|
42 |
+ data.df = tibble(TF.exp.norm = GRN@data$RNA$counts[TF.ENSEMBL, ], peak.acc.norm = GRN@data$peaks$counts[peak.peakID,]) |
|
43 |
+ |
|
44 |
+ # For testing only |
|
45 |
+ stopifnot(abs(cor_cur - corCalc) < 0.01) |
|
46 |
+ |
|
47 |
+ colorscale = scale_fill_gradientn( |
|
48 |
+ colors = RColorBrewer::brewer.pal(9, "YlGnBu"), |
|
49 |
+ values = c(0, exp(seq(-5, 0, length.out = 100)))) |
|
50 |
+ |
|
51 |
+ g1 = ggplot(data.df, aes(.data$TF.exp.norm, .data$peak.acc.norm)) + |
|
52 |
+ geom_smooth(method = "lm", formula = "y ~ x", col = "black") + |
|
53 |
+ geom_hex(bins = 50) + colorscale + |
|
54 |
+ xlim(-0.01,NA) + ylim(-0.01,NA) + |
|
55 |
+ ggtitle(paste0("n = ", nrow(data.df), " (all), Cor = ", round(corCalc,2))) |
|
56 |
+ |
|
57 |
+ # Remove entries with pure zeroes |
|
58 |
+ data.filt.df = dplyr::filter(data.df, .data$TF.exp.norm > 0, .data$peak.acc.norm > 0) |
|
59 |
+ |
|
60 |
+ cor_cur = cor(dplyr::pull(data.filt.df, .data$TF.exp.norm), dplyr::pull(data.filt.df, .data$peak.acc.norm), method = corMethod) |
|
61 |
+ |
|
62 |
+ g2 = ggplot(data.filt.df, aes(TF.exp.norm, peak.acc.norm)) + |
|
63 |
+ geom_smooth(method = "lm", formula = "y ~ x", col = "black") + |
|
64 |
+ geom_hex(bins = 50) + colorscale + |
|
65 |
+ xlim(-0.01,NA) + ylim(-0.01,NA) + |
|
66 |
+ ggtitle(paste0("n = ", nrow(data.filt.df), " (only >0 for both TF expr. & peak acc.), Cor = ", round(cor_cur,2))) |
|
67 |
+ |
|
68 |
+ data.filt2.df = dplyr::filter(data.df, TF.exp.norm > 0 | peak.acc.norm > 0) |
|
69 |
+ |
|
70 |
+ cor_cur = cor(dplyr::pull(data.filt2.df, .data$TF.exp.norm), dplyr::pull(data.filt2.df, .data$peak.acc.norm), method = corMethod) |
|
71 |
+ |
|
72 |
+ g3 = ggplot(data.filt2.df, aes(TF.exp.norm, peak.acc.norm)) + |
|
73 |
+ geom_smooth(method = "lm", formula = "y ~ x", col = "black") + |
|
74 |
+ geom_hex(bins = 50) + colorscale + |
|
75 |
+ xlim(-0.01,NA) + ylim(-0.01,NA) + |
|
76 |
+ ggtitle(paste0("n = ", nrow(data.filt2.df), " (only excluding (0,0) for both TF expr. & peak acc.), Cor = ", round(cor_cur,2))) |
|
77 |
+ |
|
78 |
+ |
|
79 |
+ mainTitle = paste0("TF: ", TF.name, "(", TF.ENSEMBL, "), peak: ", peak.peakID, " (FDR: ", round(fdrCur, 2), ")") |
|
80 |
+ |
|
81 |
+ plots_all = g1 / g2 / g3 + |
|
82 |
+ patchwork::plot_annotation(title = mainTitle, theme = ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))) |
|
83 |
+ plot(plots_all) |
|
84 |
+ # |
|
85 |
+ # r1 = rnorm(1000) |
|
86 |
+ # r2 = rnorm(1000, sd = 1) |
|
87 |
+ # plot(r1,r2, main = paste0("Cor = ", round(cor(r1,r2, method = corMethod), 2))) |
|
88 |
+ # |
|
89 |
+ # r1 = c(r1, rep(0,5000)) |
|
90 |
+ # r2 = c(r2, rep(0,5000)) |
|
91 |
+ # plot(r1,r2, main = paste0("Cor = ", round(cor(r1,r2, method = corMethod), 2))) |
|
92 |
+ } |
|
93 |
+ |
|
94 |
+ if (maxAll > 0) { |
|
95 |
+ |
|
96 |
+ con.filt = GRN@connections$TF_peaks$`0`$main |
|
97 |
+ nRowsReal = min(maxAll, nrow(con.filt)) |
|
98 |
+ |
|
99 |
+ randomRows = sample.int(nrow(con.filt), nRowsReal) |
|
100 |
+ |
|
101 |
+ res.df = data.frame(matrix(NA, nrow = nRowsReal, ncol = 5)) |
|
102 |
+ colnames(res.df) = c("TF.name", "peak.peakID", "cor_all", "cor_nonZeroAll", "cor_nonZero") |
|
103 |
+ #res.df = tribble(~TF.name, ~peak.peakID, ~cor_all, ~cor_nonZeroAll, ~cor_nonZero) |
|
104 |
+ for (i in 1:nRowsReal) { |
|
105 |
+ |
|
106 |
+ if (i %% 1000 == 1) futile.logger::flog.info(paste0("Row ", i, " out of ", nRowsReal)) |
|
107 |
+ index =randomRows[i] |
|
108 |
+ TF.name = con.filt$TF.name[index] |
|
109 |
+ TF.ENSEMBL = GRN@annotation$TFs$TF.ENSEMBL[which(GRN@annotation$TFs$TF.name == TF.name)] |
|
110 |
+ peak.peakID = con.filt$peak.ID[index] |
|
111 |
+ corCalc = con.filt$TF_peak.r[index] |
|
112 |
+ |
|
113 |
+ |
|
114 |
+ #cor_cur1 = cor(GRN@data$RNA$counts[TF.ENSEMBL, ], GRN@data$peaks$counts[peak.peakID,], method = corMethod) |
|
115 |
+ x_orig = GRN@data$RNA$counts[TF.ENSEMBL, ] |
|
116 |
+ y_orig = GRN@data$peaks$counts[peak.peakID,] |
|
117 |
+ keep = which(x_orig > 0 & y_orig > 0) |
|
118 |
+ x = x_orig[keep] |
|
119 |
+ y = y_orig[keep] |
|
120 |
+ |
|
121 |
+ #data.df = tibble(TF.exp.norm = GRN@data$RNA$counts[TF.ENSEMBL, ], peak.acc.norm = GRN@data$peaks$counts[peak.peakID,]) |
|
122 |
+ |
|
123 |
+ #data.filt.df = dplyr::filter(data.df, .data$TF.exp.norm > 0, .data$peak.acc.norm > 0) |
|
124 |
+ |
|
125 |
+ #cor_cur2 = cor(dplyr::pull(data.filt.df, .data$TF.exp.norm), dplyr::pull(data.filt.df, .data$peak.acc.norm), method = corMethod) |
|
126 |
+ cor_cur2 = cor(x, y, method = corMethod) |
|
127 |
+ |
|
128 |
+ keep = which(x_orig > 0 | y_orig > 0) |
|
129 |
+ x = x_orig[keep] |
|
130 |
+ y = y_orig[keep] |
|
131 |
+ |
|
132 |
+ # data.filt2.df = dplyr::filter(data.df, .data$TF.exp.norm > 0 | peak.acc.norm > 0) |
|
133 |
+ cor_cur3 = cor(x, y, method = corMethod) |
|
134 |
+ # cor_cur3 = cor(dplyr::pull(data.filt2.df, .data$TF.exp.norm), dplyr::pull(data.filt2.df, .data$peak.acc.norm), method = corMethod) |
|
135 |
+ res.df[i,] = c(as.character(TF.name), as.character(peak.peakID), as.numeric(corCalc), as.numeric(cor_cur2), as.numeric(cor_cur3)) |
|
136 |
+ |
|
137 |
+ # res.df = add_row(res.df, |
|
138 |
+ # TF.name = TF.name, peak.peakID = peak.peakID, |
|
139 |
+ # cor_all = corCalc, cor_nonZeroAll = cor_cur2, cor_nonZero = cor_cur3) |
|
140 |
+ } |
|
141 |
+ |
|
142 |
+ res.df$cor_all= as.numeric(res.df$cor_all) |
|
143 |
+ res.df$cor_nonZeroAll= as.numeric(res.df$cor_nonZeroAll) |
|
144 |
+ res.df$cor_nonZero= as.numeric(res.df$cor_nonZero) |
|
145 |
+ |
|
146 |
+ cor(res.df$cor_all, res.df$cor_nonZeroAll) |
|
147 |
+ cor(res.df$cor_all, res.df$cor_nonZero) |
|
148 |
+ |
|
149 |
+ res.df = res.df %>% |
|
150 |
+ dplyr::mutate(diff_all_nonZeroAll = cor_all - cor_nonZeroAll, |
|
151 |
+ diff_all_nonZero = cor_all - cor_nonZero) |
|
152 |
+ |
|
153 |
+ plot(ggplot(res.df, aes(.data$diff_all_nonZeroAll)) + |
|
154 |
+ geom_histogram(binwidth = 0.05) + |
|
155 |
+ ggtitle(paste0("No. rows: ", nRowsReal)) + |
|
156 |
+ xlim(-1,1) + xlab("Only pairs for which both values are > 0") |
|
157 |
+ ) |
|
158 |
+ |
|
159 |
+ |
|
160 |
+ plot(ggplot(res.df, aes(.data$diff_all_nonZero)) + |
|
161 |
+ geom_histogram(binwidth = 0.05) + |
|
162 |
+ ggtitle(paste0("No. rows: ", nRowsReal)) + |
|
163 |
+ xlim(-1,1) + xlab("Only pairs for which at least one point is > 0") |
|
164 |
+ ) |
|
165 |
+ |
|
166 |
+ } |
|
167 |
+ |
|
168 |
+ dev.off() |
|
169 |
+ |
|
170 |
+ GRN |
|
171 |
+ |
|
172 |
+} |
... | ... |
@@ -7,13 +7,15 @@ |
7 | 7 |
addData( |
8 | 8 |
GRN, |
9 | 9 |
counts_peaks, |
10 |
- normalization_peaks = "DESeq_sizeFactor", |
|
10 |
+ normalization_peaks = "DESeq2_sizeFactor", |
|
11 | 11 |
idColumn_peaks = "peakID", |
12 | 12 |
counts_rna, |
13 | 13 |
normalization_rna = "quantile", |
14 | 14 |
idColumn_RNA = "ENSEMBL", |
15 | 15 |
sampleMetadata = NULL, |
16 |
+ additionalParams.l = list(), |
|
16 | 17 |
allowOverlappingPeaks = FALSE, |
18 |
+ keepOriginalReadCounts = FALSE, |
|
17 | 19 |
forceRerun = FALSE |
18 | 20 |
) |
19 | 21 |
} |
... | ... |
@@ -24,8 +26,9 @@ addData( |
24 | 26 |
In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. |
25 | 27 |
Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.} |
26 | 28 |
|
27 |
-\item{normalization_peaks}{Character. Default \code{DESeq_sizeFactor}. |
|
28 |
-Normalization procedure for peak data. Must be one of \code{DESeq_sizeFactor}, \code{none}, or \code{quantile}.} |
|
29 |
+\item{normalization_peaks}{Character. Default \code{DESeq2_sizeFactor}. Normalization procedure for peak data. |
|
30 |
+Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, |
|
31 |
+\code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}.} |
|
29 | 32 |
|
30 | 33 |
\item{idColumn_peaks}{Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. |
31 | 34 |
The required format must be {chr}:{start}-{end}", with {chr} denoting the abbreviated chromosome name, and {start} and {end} the begin and end |
... | ... |
@@ -36,16 +39,25 @@ In addition to the count data, it must also contain one ID column with a particu |
36 | 39 |
Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.} |
37 | 40 |
|
38 | 41 |
\item{normalization_rna}{Character. Default \code{quantile}. Normalization procedure for peak data. |
39 |
-Must be one of "DESeq_sizeFactor", "none", or "quantile". For "quantile", \code{limma::normalizeQuantiles} is used for normalization.} |
|
42 |
+Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}.} |
|
40 | 43 |
|
41 | 44 |
\item{idColumn_RNA}{Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs.} |
42 | 45 |
|
43 | 46 |
\item{sampleMetadata}{Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. |
44 | 47 |
If provided, the @seealso [plotPCA_all()] function can then incorporate and plot it. Sample names must match with those from both peak and RNA-Seq data. The first column is expected to contain the sample IDs, the actual column name is irrelevant.} |
45 | 48 |
|
49 |
+\item{additionalParams.l}{Named list. Default \code{list()}. Additional parameters for the chosen normalization method. |
|
50 |
+Currently, only the GC-aware normalization methods \code{EDASeq_GC_peaks} and \code{gcqn_peaks} are supported here. |
|
51 |
+Both support the parameters \code{roundResults} (logical flag, \code{TRUE} or \code{FALSE}) and \code{nBins} (Integer > 0), and \code{EDASeq_GC_peaks} supports three additional parameters: |
|
52 |
+\code{withinLane_method} (one of: "loess","median","upper","full") and \code{betweenLane_method} (one of: "median","upper","full"). |
|
53 |
+For more information, see the EDASeq vignette.} |
|
54 |
+ |
|
46 | 55 |
\item{allowOverlappingPeaks}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should overlapping peaks be allowed (then only a warning is issued |
47 | 56 |
when overlapping peaks are found) or (the default) should an error be raised?} |
48 | 57 |
|
58 |
+\item{keepOriginalReadCounts}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should the original read counts as provided to the function be kept in addition to |
|
59 |
+storing the rad counts after a (if any) normalization? This increases the memory footprint of the object because 2 additional count matrices have to be stored.} |
|
60 |
+ |
|
49 | 61 |
\item{forceRerun}{\code{TRUE} or \code{FALSE}. Default \code{FALSE}. Force execution, even if the GRN object already contains the result. Overwrites the old results.} |
50 | 62 |
} |
51 | 63 |
\value{ |
... | ... |
@@ -58,6 +70,9 @@ In addition, and highly recommended, sample metadata can be optionally provided. |
58 | 70 |
\details{ |
59 | 71 |
If the \code{ChIPseeker} package is installed, additional peak annotation is provided in the annotation slot and a peak annotation QC plot is produced as part of peak-gene QC. |
60 | 72 |
This is fully optional, however, and has no consequences for downstream functions. |
73 |
+Normalizing the data sensibly is very important. When \code{quantile}is chose, \code{limma::normalizeQuantiles} is used, which in essence does the following: |
|
74 |
+Each quantile of each column is set to the mean of that quantile across arrays. The intention is to make all the normalized columns have the same empirical distribution. |
|
75 |
+This will be exactly true if there are no missing values and no ties within the columns: the normalized columns are then simply permutations of one another. |
|
61 | 76 |
} |
62 | 77 |
\examples{ |
63 | 78 |
# See the Workflow vignette on the GRaNIE website for examples |
... | ... |
@@ -11,7 +11,7 @@ filterData( |
11 | 11 |
minNormalizedMeanRNA = 1, |
12 | 12 |
maxNormalizedMeanRNA = NULL, |
13 | 13 |
chrToKeep_peaks = NULL, |
14 |
- minSize_peaks = NULL, |
|
14 |
+ minSize_peaks = 20, |
|
15 | 15 |
maxSize_peaks = 10000, |
16 | 16 |
minCV_peaks = NULL, |
17 | 17 |
maxCV_peaks = NULL, |
... | ... |
@@ -33,7 +33,7 @@ filterData( |
33 | 33 |
|
34 | 34 |
\item{chrToKeep_peaks}{Character vector or \code{NULL}. Default \code{NULL}. Vector of chromosomes that peaks are allowed to come from. This filter can be used to filter sex chromosomes from the peaks, for example (e.g, \code{c(paste0("chr", 1:22), "chrX", "chrY")})} |
35 | 35 |
|
36 |
-\item{minSize_peaks}{Integer[1,] or \code{NULL}. Default \code{NULL}. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.} |
|
36 |
+\item{minSize_peaks}{Integer[1,] or \code{NULL}. Default 20. Minimum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.} |
|
37 | 37 |
|
38 | 38 |
\item{maxSize_peaks}{Integer[1,] or \code{NULL}. Default 10000. Maximum peak size (width, end - start) for a peak to be retained. Set to \code{NULL} for not applying the filter.} |
39 | 39 |
|
... | ... |
@@ -55,7 +55,7 @@ Overall, we tried to minimize the installation burden and only require packages |
55 | 55 |
|
56 | 56 |
1. Use `BiocManager::install("GRaNIE", dependencies = TRUE)` which however installs all annotation packages for all supported genomes, which may take a longer time due to the overall download size of multiple Gb. |
57 | 57 |
2. Install only the required annotation packages, along with all other optional packages. You may use the following for it: |
58 |
- - Genome annotation packages (one of the four is required to run *GRaNIE*, which of them depends on your genome assembly version): |
|
58 |
+ - Genome annotation packages (only one of the four is optionally needed for some functions, see section below, which of them depends on your genome assembly version): |
|
59 | 59 |
|
60 | 60 |
- hg38: `BiocManager::install(c("org.Hs.eg.db", "BSgenome.Hsapiens.UCSC.hg38", "TxDb.Hsapiens.UCSC.hg38.knownGene"))` |
61 | 61 |
- hg19: `BiocManager::install(c("org.Hs.eg.db", "BSgenome.Hsapiens.UCSC.hg19", "TxDb.Hsapiens.UCSC.hg19.knownGene"))` |
... | ... |
@@ -69,7 +69,9 @@ Overall, we tried to minimize the installation burden and only require packages |
69 | 69 |
|
70 | 70 |
*GRaNIE* has a number of packages that enhance the functionality for particular cases, may be required depending on certain parameters, only when using a particular functionality or that generally speed-up the computation. In detail, the following packages are currently optional and may be needed context-specifically: |
71 | 71 |
|
72 |
-- **Genome assembly and annotation packages** (one of the four is required to run *GRaNIE*, which of them depends on your genome assembly version) |
|
72 |
+- **Genome assembly and annotation packages** (only one of the four is optionally needed, which of them depends on your genome assembly version) |
|
73 |
+ |
|
74 |
+ - all of the following packages are ONLY needed for either additional peak annotation in combination with `ChIPseeker` (see below) or if the chosen peak normalization method is GC based: `EDASeq_GC_peaks`, `gcqn_peaks` |
|
73 | 75 |
- `org.Hs.eg.db`: Needed only when genome assembly version is `hg19` or `hg38` |
74 | 76 |
- `org.Mm.eg.db`: Needed only when genome assembly version is `mm9` or `mm10` |
75 | 77 |
- `BSgenome.Hsapiens.UCSC.hg19`, `TxDb.Hsapiens.UCSC.hg19.knownGene`: Needed only when genome assembly version is `hg19` |