... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
Package: GRaNIE |
2 | 2 |
Title: GRaNIE: Reconstruction cell type specific gene regulatory networks including enhancers using chromatin accessibility and RNA-seq data |
3 |
-Version: 1.1.17 |
|
3 |
+Version: 1.1.19 |
|
4 | 4 |
Encoding: UTF-8 |
5 | 5 |
Authors@R: c(person("Christian", "Arnold", email = |
6 | 6 |
"chrarnold@web.de", role = c("cre","aut")), |
... | ... |
@@ -1,4 +1,15 @@ |
1 |
+# GRaNIE 1.1.14 to 1.1.19 (2022-12-13) |
|
1 | 2 |
|
3 |
+## Major changes |
|
4 |
+- major object changes and optimizations, particularly related to storing the count matrices in an optimized and simpler format. In short, the count matrices are now stored either as normal or sparse matrices, depending on the amount of zeros present. In addition, only the counts after normalization are saved, the raw counts before applying normalization are not stored anymore. If no normalization is wished by the user, as before, the "normalized" counts are equal to the raw counts. `GRaNIE` is now more readily applicable for larger analyses and single-cell analysis even though we just started actively optimizing for it, so we cannot yet recommend applying our framework in a single-cell manner. Older GRN objects are automatically changed internally when executing the major functions upon the first invocation. |
|
5 |
+- various Documentation and R help updates |
|
6 |
+- the function `generateStatsSummary` now doesnt alter the stored filtered connections in the object anymore. This makes its usage more intuitive and it can be used anywhere in the workflow. |
|
7 |
+- removed redundant `biomaRt` calls in the code. This saves time and makes the code less vulnerable to timeout issues caused by remote services |
|
8 |
+- due to the changes described above, the function `plotPCA_all` now can only plot the normalized counts and not the raw counts anymore (except when no normalization is wanted) |
|
9 |
+- the GO enrichments are now also storing, for each GO term, the ENSEMBL IDs of the genes that were found in the foreground. This facilitates further exploration of the enrichment results. |
|
10 |
+ |
|
11 |
+## Minor changes |
|
12 |
+- many small changes in the code |
|
2 | 13 |
|
3 | 14 |
# GRaNIE 1.1.12 and 1.1.13 (2022-09-13) |
4 | 15 |
|
... | ... |
@@ -9,27 +9,26 @@ |
9 | 9 |
#' @section Accessors: |
10 | 10 |
#' In the following code snippets, \code{GRN} is a \code{\linkS4class{GRN}} object. |
11 | 11 |
#' |
12 |
-#'# Get general annotation of a GRaNIE object |
|
12 |
+#'# Get general annotation of a GRN object from the GRaNIE package |
|
13 | 13 |
#' |
14 |
-#' \code{nPeaks(GRN))} and \code{nGenes(GRN))}: Retrieve the number of peaks and genes, respectively, that have been added to the object (both before and after filtering) |
|
14 |
+#' \code{nPeaks(GRN))}, \code{nTFs(GRN))} and \code{nGenes(GRN))}: Retrieve the number of peaks, TFs and genes, respectively, that have been added to the object (both before and after filtering) |
|
15 | 15 |
#' |
16 | 16 |
#' @slot data Currently stores 4 different types of data:\cr |
17 | 17 |
#' \itemize{ |
18 | 18 |
#' \item \code{peaks}: |
19 | 19 |
#' \itemize{ |
20 |
-#' \item \code{counts_norm}: |
|
21 |
-#' \item \code{counts_orig}: |
|
22 |
-#' \item \code{consensusPeaks}: |
|
20 |
+#' \item \code{counts}: |
|
21 |
+#' \item \code{counts_metadata}: |
|
23 | 22 |
#' } |
24 | 23 |
#' \item \code{RNA}: |
25 | 24 |
#' \itemize{ |
26 |
-#' \item \code{counts_norm.l}: |
|
27 |
-#' \item \code{counts_orig}: |
|
25 |
+#' \item \code{counts}: |
|
26 |
+#' \item \code{counts_metadata}: |
|
27 |
+#' \item \code{counts_permuted_index}: |
|
28 | 28 |
#' } |
29 |
-#' \item \code{metadata}: |
|
30 | 29 |
#' \item \code{TFs}: |
31 | 30 |
#' \itemize{ |
32 |
-#' \item \code{translationTable}: |
|
31 |
+#' \item \code{TF_activity}: |
|
33 | 32 |
#' \item \code{TF_peak_overlap}: |
34 | 33 |
#' \item \code{classification}: |
35 | 34 |
#' } |
... | ... |
@@ -43,9 +42,8 @@ |
43 | 42 |
#' |
44 | 43 |
#' @slot stats Stores statistical and summary information for a \code{GRN} network. Currently, connection details are stored here. |
45 | 44 |
#' |
46 |
-#' @slot visualization Stores visualization results, currently always empty. Feature in development. |
|
47 |
-#' @slot isDev Flag whether this is an object from the development version of the package |
|
48 |
- |
|
45 |
+#' @slot graph Stores the eGRN graph related information and data structures |
|
46 |
+#' |
|
49 | 47 |
#' @keywords GRN-class, GRN |
50 | 48 |
#' @name GRN-class |
51 | 49 |
#' @aliases GRN-class |
... | ... |
@@ -56,9 +54,8 @@ setClass("GRN", representation = representation( |
56 | 54 |
connections = "list", |
57 | 55 |
data = "list", |
58 | 56 |
stats = "list", |
59 |
- visualization = "list", |
|
60 | 57 |
graph = "list", |
61 |
- isDev = "logical" |
|
58 |
+ visualization = "list" |
|
62 | 59 |
) |
63 | 60 |
) |
64 | 61 |
|
... | ... |
@@ -116,7 +113,7 @@ setMethod("show", |
116 | 113 |
#nGenes = nGenes(GRN) |
117 | 114 |
|
118 | 115 |
cat("Data summary:\n") |
119 |
- if (!is.null(GRN@data$peaks$consensusPeaks)) { |
|
116 |
+ if (!is.null(GRN@data$peaks$counts_metadata)) { |
|
120 | 117 |
nPeaks_filt = nPeaks(GRN, filter = TRUE) |
121 | 118 |
nPeaks_all = nPeaks(GRN, filter = FALSE) |
122 | 119 |
cat(" # peaks (filtered, all): ", nPeaks_filt, ", ", nPeaks_all, "\n", sep = "") |
... | ... |
@@ -124,25 +121,20 @@ setMethod("show", |
124 | 121 |
cat (" # peaks: No peak data found.\n") |
125 | 122 |
} |
126 | 123 |
|
127 |
- if (!is.null(GRN@data$RNA$counts_orig)) { |
|
124 |
+ if (!is.null(GRN@data$RNA$counts_metadata)) { |
|
128 | 125 |
|
129 | 126 |
nGenes_filt = nGenes(GRN, filter = TRUE) |
130 | 127 |
nGenes_all = nGenes(GRN, filter = FALSE) |
131 |
- |
|
132 |
- if (checkmate::testClass(GRN@data$RNA$counts_orig, "DESeqDataSet")) { |
|
133 |
- |
|
134 |
- } else { |
|
135 |
- |
|
136 |
- } |
|
137 |
- |
|
128 |
+ |
|
138 | 129 |
cat(" # genes (filtered, all): ", nGenes_filt, ", ", nGenes_all, "\n", sep = "") |
130 |
+ |
|
139 | 131 |
} else { |
140 | 132 |
cat (" # genes: No RNA-seq data found.\n") |
141 | 133 |
} |
142 | 134 |
|
143 |
- if (!is.null(GRN@data$RNA$counts_orig) & !is.null(GRN@data$peaks$consensusPeaks)) { |
|
135 |
+ if (!is.null(GRN@data$RNA$counts_metadata) & !is.null(GRN@data$peaks$counts_metadata)) { |
|
144 | 136 |
|
145 |
- cat(" # shared samples: ", nrow(GRN@data$metadata), "\n", sep = "") |
|
137 |
+ cat(" # shared samples: ", length(GRN@config$sharedSamples), "\n", sep = "") |
|
146 | 138 |
} |
147 | 139 |
|
148 | 140 |
if (!is.null(GRN@annotation$TFs)) { |
... | ... |
@@ -223,14 +215,14 @@ setMethod("show", |
223 | 215 |
} |
224 | 216 |
|
225 | 217 |
cat(" Enrichment (TF-gene):\n") |
226 |
- if (!is.null(GRN@stats$Enrichment$byTF)) { |
|
218 |
+ if (!is.null(GRN@stats$Enrichment$general)) { |
|
227 | 219 |
ontologies = names(GRN@stats$Enrichment$general) |
228 | 220 |
cat(" Overall network: ", paste0(ontologies, collapse = " & "), "\n",sep = "") |
229 | 221 |
} else { |
230 | 222 |
cat(" Overall network: no enrichment results found\n") |
231 | 223 |
} |
232 | 224 |
|
233 |
- if (!is.null(GRN@stats$Enrichment$byTF)) { |
|
225 |
+ if (!is.null(GRN@stats$Enrichment$byCommunity)) { |
|
234 | 226 |
ontologies = names(GRN@stats$Enrichment$byCommunity[[1]]) |
235 | 227 |
cat(" Communities: ", paste0(ontologies, collapse = " & "), "\n",sep = "") |
236 | 228 |
} else { |
... | ... |
@@ -274,14 +266,14 @@ nPeaks <- function(GRN, filter = TRUE) { |
274 | 266 |
checkmate::assertClass(GRN, "GRN") |
275 | 267 |
checkmate::assertFlag(filter) |
276 | 268 |
|
277 |
- if (is.null(GRN@data$peaks$consensusPeaks)) { |
|
269 |
+ if (is.null(GRN@data$peaks$counts_metadata)) { |
|
278 | 270 |
return(NA) |
279 | 271 |
} |
280 | 272 |
|
281 | 273 |
if (filter) { |
282 |
- nPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered) %>% nrow() |
|
274 |
+ nPeaks = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered) %>% nrow() |
|
283 | 275 |
} else { |
284 |
- nPeaks = GRN@data$peaks$consensusPeaks %>% nrow() |
|
276 |
+ nPeaks = GRN@data$peaks$counts_metadata %>% nrow() |
|
285 | 277 |
} |
286 | 278 |
|
287 | 279 |
nPeaks |
... | ... |
@@ -310,14 +302,14 @@ nGenes <- function(GRN, filter = TRUE) { |
310 | 302 |
checkmate::assertClass(GRN, "GRN") |
311 | 303 |
checkmate::assertFlag(filter) |
312 | 304 |
|
313 |
- if (is.null(GRN@data$RNA$counts_norm.l)) { |
|
305 |
+ if (is.null(GRN@data$RNA$counts_metadata)) { |
|
314 | 306 |
return(NA) |
315 | 307 |
} |
316 | 308 |
|
317 | 309 |
if (filter) { |
318 |
- nGenes = GRN@data$RNA$counts_norm.l[["0"]] %>% dplyr::filter(!.data$isFiltered) %>% nrow() |
|
310 |
+ nGenes = GRN@data$RNA$counts_metadata %>% dplyr::filter(!.data$isFiltered) %>% nrow() |
|
319 | 311 |
} else { |
320 |
- nGenes = GRN@data$RNA$counts_norm.l[["0"]] %>% nrow() |
|
312 |
+ nGenes = GRN@data$RNA$counts_metadata %>% nrow() |
|
321 | 313 |
} |
322 | 314 |
|
323 | 315 |
nGenes |
... | ... |
@@ -344,10 +336,10 @@ nTFs <- function(GRN) { |
344 | 336 |
|
345 | 337 |
checkmate::assertClass(GRN, "GRN") |
346 | 338 |
|
347 |
- if (is.null(GRN@data$TFs$translationTable)) { |
|
339 |
+ if (is.null(GRN@annotation$TFs)) { |
|
348 | 340 |
return(NA) |
349 | 341 |
} |
350 | 342 |
|
351 |
- nrow(GRN@data$TFs$translationTable) |
|
343 |
+ nrow(GRN@annotation$TFs) |
|
352 | 344 |
|
353 | 345 |
} |
354 | 346 |
\ No newline at end of file |
... | ... |
@@ -166,6 +166,8 @@ initializeGRN <- function(objectMetadata = list(), |
166 | 166 |
#' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise |
167 | 167 |
#' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE) |
168 | 168 |
|
169 |
+# TODO: add a isSingleCell argument or something similar |
|
170 |
+ |
|
169 | 171 |
addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", idColumn_peaks = "peakID", |
170 | 172 |
counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL, |
171 | 173 |
allowOverlappingPeaks= FALSE, |
... | ... |
@@ -185,14 +187,12 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
185 | 187 |
checkmate::assertFlag(allowOverlappingPeaks) |
186 | 188 |
checkmate::assertFlag(forceRerun) |
187 | 189 |
|
188 |
- if (is.null(GRN@data$peaks$counts_norm) | |
|
189 |
- is.null(GRN@data$RNA$counts_norm.l) | |
|
190 |
+ if (is.null(GRN@data$peaks$counts) | |
|
191 |
+ is.null(GRN@data$peaks$counts_metadata) | |
|
192 |
+ is.null(GRN@data$RNA$counts) | |
|
193 |
+ is.null(GRN@data$RNA$counts_metadata) | |
|
190 | 194 |
forceRerun) { |
191 |
- |
|
192 |
- # Store raw peaks counts as DESeq object |
|
193 |
- |
|
194 |
- #GRN@data$peaks$counts_raw = counts_peaks %>% dplyr::mutate(isFiltered = FALSE) |
|
195 |
- |
|
195 |
+ |
|
196 | 196 |
# Normalize ID column names |
197 | 197 |
if (idColumn_peaks != "peakID") { |
198 | 198 |
counts_peaks = dplyr::rename(counts_peaks, peakID = !!(idColumn_peaks)) |
... | ... |
@@ -269,16 +269,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
269 | 269 |
|
270 | 270 |
# Subset data to retain only samples that appear in both RNA and peaks |
271 | 271 |
data.l = .intersectData(countsRNA.norm.df, countsPeaks.norm.df) |
272 |
- GRN@data$peaks$counts_norm = data.l[["peaks"]] %>% dplyr::mutate(isFiltered = FALSE) |
|
273 |
- countsRNA.norm.df = data.l[["RNA"]] %>% dplyr::mutate(isFiltered = FALSE) |
|
274 |
- |
|
275 |
- futile.logger::flog.info(paste0( "Final dimensions of data:")) |
|
276 |
- futile.logger::flog.info(paste0( " RNA : ", nrow(countsRNA.norm.df) , " x ", ncol(countsRNA.norm.df) - 2, " (rows x columns)")) |
|
277 |
- futile.logger::flog.info(paste0( " peaks: ", nrow(countsPeaks.norm.df), " x ", ncol(countsPeaks.norm.df) - 1, " (rows x columns)")) |
|
278 |
- # Create permutations for RNA |
|
279 |
- futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts")) |
|
280 |
- GRN@data$RNA$counts_norm.l = .shuffleColumns(countsRNA.norm.df, .getMaxPermutation(GRN), returnUnshuffled = TRUE, returnAsList = TRUE) |
|
281 | 272 |
|
273 |
+ # Generate metadata first to determine the nmberof shared samples etc |
|
282 | 274 |
if (!is.null(sampleMetadata)) { |
283 | 275 |
|
284 | 276 |
futile.logger::flog.info("Parsing provided metadata...") |
... | ... |
@@ -317,72 +309,35 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
317 | 309 |
|
318 | 310 |
GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, .data$has_both) %>% dplyr::pull(.data$sampleID) %>% as.character() |
319 | 311 |
|
320 |
- counts_peaks = as.data.frame(counts_peaks) |
|
321 |
- counts_rna = as.data.frame(counts_rna) |
|
322 |
- rownames(counts_peaks) = counts_peaks$peakID |
|
323 |
- rownames(counts_rna) = counts_rna$ENSEMBL |
|
324 |
- |
|
312 |
+ ### COUNT MATRICES |
|
313 |
+ # Store the matrices either as normal or sparse matrix |
|
325 | 314 |
|
326 |
- futile.logger::flog.info("Storing count matrices...") |
|
327 |
- # Store the raw peaks data efficiently as DESeq object only if it contains only integers, otherwise store as normal matrix |
|
315 |
+ GRN@data$peaks$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["peaks"]], ID_column = "peakID", slotName = "GRN@data$peaks$counts") |
|
316 |
+ |
|
317 |
+ # includeFiltered = TRUE here as it doesnt make a difference and because getCounts requires counts_metadata$isFiltered to be set already |
|
318 |
+ GRN@data$peaks$counts_metadata = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = FALSE, includeFiltered = TRUE)) |
|
319 |
+ stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata)) |
|
328 | 320 |
|
329 |
- if (isIntegerMatrix(counts_peaks[, GRN@config$sharedSamples])) { |
|
330 |
- GRN@data$peaks$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_peaks[, GRN@config$sharedSamples], |
|
331 |
- colData = dplyr::filter(GRN@data$metadata, .data$has_both) %>% tibble::column_to_rownames("sampleID"), |
|
332 |
- design = ~1) |
|
333 |
- } else { |
|
334 |
- |
|
335 |
- # Store as sparse matrix if enough 0s |
|
336 |
- rowSample = min(100, nrow(counts_peaks)) |
|
337 |
- zeroEntries = length(which(counts_peaks[1:rowSample, GRN@config$sharedSamples] %>% unlist(use.names = FALSE) == 0)) |
|
338 |
- fractionZero = zeroEntries / (rowSample * length(GRN@config$sharedSamples)) |
|
339 |
- if (fractionZero> 0.1) { |
|
340 |
- futile.logger::flog.info(paste0("Storing GRN@data$peaks$counts_orig matrix as sparse matrix because fraction of 0s is > 0.1 (", fractionZero, ")")) |
|
341 |
- GRN@data$peaks$counts_orig = .asSparseMatrix(as.matrix(counts_peaks %>% dplyr::select(-.data$peakID)), |
|
342 |
- convertNA_to_zero = FALSE, |
|
343 |
- dimnames = list(counts_peaks$peakID, colnames(GRN@config$sharedSamples))) |
|
344 |
- } else { |
|
345 |
- GRN@data$peaks$counts_orig = as.matrix(counts_peaks[, GRN@config$sharedSamples]) |
|
346 |
- } |
|
321 |
+ GRN@data$RNA$counts = .storeAsMatrixOrSparseMatrix(GRN, df = data.l[["RNA"]], ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts") |
|
347 | 322 |
|
348 |
- |
|
349 |
- } |
|
323 |
+ GRN@data$RNA$counts_metadata = tibble::tibble(ID = data.l[["RNA"]]$ENSEMBL, isFiltered = FALSE) |
|
350 | 324 |
|
351 |
- if (isIntegerMatrix(counts_rna[, GRN@config$sharedSamples])) { |
|
352 |
- GRN@data$RNA$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_rna[, GRN@config$sharedSamples], |
|
353 |
- colData = dplyr::filter(GRN@data$metadata, .data$has_both) %>% tibble::column_to_rownames("sampleID"), |
|
354 |
- design = ~1) |
|
355 |
- } else { |
|
356 |
- |
|
357 |
- |
|
358 |
- # Store as sparse matrix if enough 0s |
|
359 |
- rowSample = min(100, nrow(counts_rna)) |
|
360 |
- zeroEntries = length(which(counts_rna[1:rowSample, GRN@config$sharedSamples] %>% unlist(use.names = FALSE) == 0)) |
|
361 |
- fractionZero = zeroEntries / (rowSample * length(GRN@config$sharedSamples)) |
|
362 |
- if (fractionZero> 0.1) { |
|
363 |
- futile.logger::flog.info(paste0("Storing GRN@data$RNA$counts_orig matrix as sparse matrix because fraction of 0s is > 0.1 (", fractionZero, ")")) |
|
364 |
- GRN@data$RNA$counts_orig = .asSparseMatrix(as.matrix(counts_rna %>% dplyr::select(-.data$ENSEMBL)), |
|
365 |
- convertNA_to_zero = FALSE, |
|
366 |
- dimnames = list(counts_rna$ENSEMBL, colnames(GRN@config$sharedSamples))) |
|
367 |
- } else { |
|
368 |
- GRN@data$RNA$counts_orig = as.matrix(counts_rna[, GRN@config$sharedSamples]) |
|
369 |
- } |
|
370 |
- } |
|
325 |
+ GRN@data$RNA$counts_permuted_index = .shuffleColumns(countsRNA.norm.df %>% dplyr::select(-.data$ENSEMBL), .getMaxPermutation(GRN), returnUnshuffled = FALSE, returnAsList = FALSE) |
|
371 | 326 |
|
372 |
- futile.logger::flog.info(paste0("Creating consensus peaks and check for overlapping peaks...")) |
|
373 |
- |
|
374 |
- # Consensus peaks |
|
375 |
- GRN@data$peaks$consensusPeaks = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)) |
|
376 |
- GRN@data$peaks$consensusPeaks = GRN@data$peaks$consensusPeaks[match(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID, GRN@data$peaks$consensusPeaks$peakID), ] |
|
377 |
- stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$consensusPeaks)) |
|
327 |
+ futile.logger::flog.info(paste0( "Final dimensions of data:")) |
|
328 |
+ futile.logger::flog.info(paste0( " RNA : ", nrow(countsRNA.norm.df) , " x ", ncol(countsRNA.norm.df) - 2, " (rows x columns)")) |
|
329 |
+ futile.logger::flog.info(paste0( " peaks: ", nrow(countsPeaks.norm.df), " x ", ncol(countsPeaks.norm.df) - 1, " (rows x columns)")) |
|
330 |
+ # Create permutations for RNA |
|
331 |
+ futile.logger::flog.info(paste0( "Generate ", .getMaxPermutation(GRN), " permutations of RNA-counts")) |
|
378 | 332 |
|
379 | 333 |
|
334 |
+ futile.logger::flog.info(paste0("Creating consensus peaks and check for overlapping peaks...")) |
|
380 | 335 |
|
336 |
+ # Consensus peaks |
|
337 |
+ |
|
381 | 338 |
# Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details |
382 |
- consensus.gr = .constructGRanges(GRN@data$peaks$consensusPeaks, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE) |
|
383 |
- |
|
384 |
- |
|
385 |
- |
|
339 |
+ consensus.gr = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE) |
|
340 |
+ |
|
386 | 341 |
overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1) |
387 | 342 |
|
388 | 343 |
if (length(overlappingPeaks) > 0) { |
... | ... |
@@ -421,7 +376,31 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
421 | 376 |
|
422 | 377 |
} |
423 | 378 |
|
379 |
+.storeAsMatrixOrSparseMatrix <- function (GRN, df, ID_column, slotName, threshold = 0.1) { |
|
380 |
+ |
|
381 |
+ stopifnot(identical(GRN@config$sharedSamples, colnames(df)[-1])) |
|
382 |
+ |
|
383 |
+ # Store as sparse matrix if enough 0s |
|
384 |
+ checkmate::assertIntegerish(length(GRN@config$sharedSamples), lower = 1) |
|
424 | 385 |
|
386 |
+ df.m = df %>% |
|
387 |
+ dplyr::select(tidyselect::one_of(ID_column, GRN@config$sharedSamples)) %>% |
|
388 |
+ tibble::column_to_rownames(ID_column) %>% |
|
389 |
+ as.matrix() |
|
390 |
+ |
|
391 |
+ # Determine sparsity |
|
392 |
+ fractionZero = (length(df.m) - Matrix::nnzero(df.m)) / length(df.m) |
|
393 |
+ |
|
394 |
+ |
|
395 |
+ if (fractionZero > threshold) { |
|
396 |
+ futile.logger::flog.info(paste0("Storing ", slotName, " matrix as sparse matrix because fraction of 0s is > ", threshold, " (", fractionZero, ")")) |
|
397 |
+ df.m = .asSparseMatrix(df.m, convertNA_to_zero = FALSE, |
|
398 |
+ dimnames = list(df[, ID_column] %>% unlist(use.names = FALSE), GRN@config$sharedSamples)) |
|
399 |
+ } |
|
400 |
+ |
|
401 |
+ df.m |
|
402 |
+ |
|
403 |
+} |
|
425 | 404 |
|
426 | 405 |
.createConsensusPeaksDF <- function(countsPeaks, idColumn = "peakID") { |
427 | 406 |
|
... | ... |
@@ -439,11 +418,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
439 | 418 |
start = as.numeric(sapply(ids.split, "[[", 2)), |
440 | 419 |
end = as.numeric(sapply(ids.split, "[[", 3)), |
441 | 420 |
peakID = paste0(.data$chr, ":", .data$start, "-", .data$end), |
442 |
- isFiltered = FALSE) %>% |
|
443 |
- dplyr::arrange(.data$chr, .data$start) |
|
444 |
- |
|
445 |
- # The sorting is necessary here for subsequent bedtools to run faster or at all |
|
446 |
- |
|
421 |
+ isFiltered = FALSE) |
|
447 | 422 |
consensus.df |
448 | 423 |
} |
449 | 424 |
|
... | ... |
@@ -503,52 +478,23 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
503 | 478 |
|
504 | 479 |
} |
505 | 480 |
|
506 |
-.getAllGeneTypesAndFrequencies <- function(genomeAssembly, verbose = TRUE) { |
|
507 |
- |
|
508 |
- geneAnnotation.df = .retrieveAnnotationData(genomeAssembly) |
|
509 |
- return(table(geneAnnotation.df$gene.type)) |
|
510 |
- |
|
511 |
-} |
|
512 |
- |
|
513 |
-.getKnownGeneAnnotationNew <- function(GRN, gene.types, extendRegions = NULL) { |
|
514 |
- |
|
515 |
- checkmate::assertCharacter(GRN@config$parameters$genomeAssembly, len = 1) |
|
516 |
- |
|
517 |
- if (!is.null(extendRegions)) { |
|
518 |
- stop("Not yet implemented") |
|
519 |
- } |
|
520 |
- |
|
521 |
- genes.filt.df = .retrieveAnnotationData(GRN@config$parameters$genomeAssembly) |
|
522 |
- |
|
523 |
- if (!is.null(gene.types)) { |
|
524 |
- if (! "all" %in% gene.types) { |
|
525 |
- genes.filt.df = dplyr::filter(genes.filt.df, .data$gene.type %in% gene.types) |
|
526 |
- } |
|
527 |
- } |
|
528 |
- |
|
529 |
- |
|
530 |
- genes.filt.df |
|
531 |
- |
|
532 |
-} |
|
533 | 481 |
|
534 | 482 |
.populatePeakAnnotation <- function (GRN) { |
535 | 483 |
|
536 |
- countsPeaks.clean = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE) |
|
484 |
+ countsPeaks.clean = getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = TRUE, includeFiltered = TRUE) |
|
537 | 485 |
|
538 | 486 |
futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)")) |
539 | 487 |
|
540 |
- countsPeaks.m = as.matrix(dplyr::select(countsPeaks.clean, -.data$peakID)) |
|
541 |
- |
|
542 |
- rowMeans_peaks = rowMeans(countsPeaks.m) |
|
543 |
- rowMedians_peaks = matrixStats::rowMedians(countsPeaks.m) |
|
544 |
- CV_peaks = matrixStats::rowSds(countsPeaks.m) / rowMeans_peaks |
|
488 |
+ rowMeans_peaks = rowMeans(countsPeaks.clean) |
|
489 |
+ rowMedians_peaks = matrixStats::rowMedians(countsPeaks.clean) |
|
490 |
+ CV_peaks = matrixStats::rowSds(countsPeaks.clean) / rowMeans_peaks |
|
545 | 491 |
|
546 |
- metadata_peaks = tibble::tibble(peak.ID = countsPeaks.clean$peakID, |
|
492 |
+ metadata_peaks = tibble::tibble(peak.ID = rownames(countsPeaks.clean), |
|
547 | 493 |
peak.mean = rowMeans_peaks, |
548 | 494 |
peak.median = rowMedians_peaks, |
549 | 495 |
peak.CV = CV_peaks) |
550 | 496 |
|
551 |
- GRN@annotation$consensusPeaks = metadata_peaks |
|
497 |
+ GRN@annotation$peaks = metadata_peaks |
|
552 | 498 |
|
553 | 499 |
if (!is.installed("ChIPseeker")) { |
554 | 500 |
packageMessage = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ", |
... | ... |
@@ -558,8 +504,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
558 | 504 |
|
559 | 505 |
futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while")) |
560 | 506 |
genomeAssembly = GRN@config$parameters$genomeAssembly |
561 |
- consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered) |
|
562 |
- consensusPeaks.gr = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly) |
|
507 |
+ consensusPeaks = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered) |
|
508 |
+ consensusPeaks.gr = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly) |
|
563 | 509 |
|
564 | 510 |
# Add ChIPSeeker anotation |
565 | 511 |
peaks.annotated = suppressMessages(ChIPseeker::annotatePeak( |
... | ... |
@@ -579,13 +525,13 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
579 | 525 |
verbose = TRUE # the default |
580 | 526 |
)) |
581 | 527 |
|
582 |
- GRN@annotation$consensusPeaks_obj = peaks.annotated |
|
528 |
+ GRN@annotation$peaks_obj = peaks.annotated |
|
583 | 529 |
|
584 | 530 |
peaks.annotated.df = as.data.frame(peaks.annotated) |
585 | 531 |
peaks.annotated.df$annotation[grepl("Exon", peaks.annotated.df$annotation)] = "Exon" |
586 | 532 |
peaks.annotated.df$annotation[grepl("Intron", peaks.annotated.df$annotation)] = "Intron" |
587 | 533 |
|
588 |
- GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, |
|
534 |
+ GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, |
|
589 | 535 |
peaks.annotated.df %>% |
590 | 536 |
dplyr::select(.data$peakID, .data$annotation, tidyselect::starts_with("gene"), -.data$geneId, .data$distanceToTSS, .data$ENSEMBL, .data$SYMBOL, .data$GENENAME) %>% |
591 | 537 |
dplyr::mutate(annotation = as.factor(.data$annotation), |
... | ... |
@@ -621,11 +567,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
621 | 567 |
.populateGeneAnnotation <- function (GRN) { |
622 | 568 |
|
623 | 569 |
|
624 |
- countsRNA.clean = getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE) |
|
570 |
+ countsRNA.m = getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE, includeFiltered = TRUE) |
|
625 | 571 |
|
626 |
- futile.logger::flog.info(paste0(" Calculate statistics for each of the ", nrow(countsRNA.clean), " genes that were provided with the RNA-seq data (mean and CV)")) |
|
572 |
+ futile.logger::flog.info(paste0(" Calculate statistics for each of the ", nrow(countsRNA.m), " genes that were provided with the RNA-seq data (mean and CV)")) |
|
627 | 573 |
|
628 |
- countsRNA.m = as.matrix(dplyr::select(countsRNA.clean, -.data$ENSEMBL)) |
|
629 | 574 |
|
630 | 575 |
rowMeans_rna = rowMeans(countsRNA.m) |
631 | 576 |
rowMedians_rna = matrixStats::rowMedians(countsRNA.m) |
... | ... |
@@ -633,7 +578,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
633 | 578 |
|
634 | 579 |
genomeAnnotation.df = .retrieveAnnotationData(GRN@config$parameters$genomeAssembly) |
635 | 580 |
|
636 |
- metadata_rna = tibble::tibble(gene.ENSEMBL = countsRNA.clean$ENSEMBL, |
|
581 |
+ metadata_rna = tibble::tibble(gene.ENSEMBL = rownames(countsRNA.m), |
|
637 | 582 |
gene.mean = rowMeans_rna, |
638 | 583 |
gene.median = rowMedians_rna, |
639 | 584 |
gene.CV = CV_rna) %>% |
... | ... |
@@ -663,7 +608,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
663 | 608 |
genome = .getGenomeObject(genomeAssembly, type = "BSgenome") |
664 | 609 |
|
665 | 610 |
# Get peaks as GRanges object |
666 |
- query = .constructGRanges(GRN@data$peaks$consensusPeaks, |
|
611 |
+ query = .constructGRanges(GRN@data$peaks$counts_metadata, |
|
667 | 612 |
seqlengths = .getChrLengths(genomeAssembly), |
668 | 613 |
genomeAssembly) |
669 | 614 |
|
... | ... |
@@ -690,7 +635,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
690 | 635 |
|
691 | 636 |
#ggplot2::ggplot(GC_classes.df , ggplot2::aes(GC.class, n_rel)) + geom_bar(stat = "identity") + ggplot2::theme_bw() |
692 | 637 |
|
693 |
- GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, GC_content.df, by = "peak.ID") %>% |
|
638 |
+ GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC_content.df, by = "peak.ID") %>% |
|
694 | 639 |
dplyr::rename( peak.GC.perc = .data$`G|C`, |
695 | 640 |
peak.width = .data$length, |
696 | 641 |
peak.GC.class = .data$GC_class) |
... | ... |
@@ -708,7 +653,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor", |
708 | 653 |
#' This function marks genes and/or peaks as \code{filtered} depending on the chosen filtering criteria. Filtered genes / peaks will then be |
709 | 654 |
#' disregarded when adding connections in subsequent steps via \code{\link{addConnections_TF_peak}} and \code{\link{addConnections_peak_gene}}. \strong{This function does NOT (re)filter existing connections when the \code{\linkS4class{GRN}} object already contains connections. Thus, upon re-execution of this function with different filtering criteria, all downstream steps have to be re-run.} |
710 | 655 |
#' |
711 |
-#' All this function does is setting (or modifying) the filtering flag in \code{GRN@data$peaks$counts_norm} and \code{GRN@data$RNA$counts_norm.l}, respectively. |
|
656 |
+#' All this function does is setting (or modifying) the filtering flag in \code{GRN@data$peaks$counts_metadata} and \code{GRN@data$RNA$counts_metadata}, respectively. |
|
712 | 657 |
#' |
713 | 658 |
#' @template GRN |
714 | 659 |
#' @param minNormalizedMean_peaks Numeric[0,] or \code{NULL}. Default 5. Minimum mean across all samples for a peak to be retained for the normalized counts table. Set to \code{NULL} for not applying the filter. |
... | ... |
@@ -750,7 +695,7 @@ filterData <- function (GRN, |
750 | 695 |
checkmate::assertNumber(maxNormalizedMean_peaks, lower = minNormalizedMean_peaks , null.ok = TRUE) |
751 | 696 |
checkmate::assertNumber(maxNormalizedMeanRNA, lower = minNormalizedMeanRNA, null.ok = TRUE) |
752 | 697 |
checkmate::assertCharacter(chrToKeep_peaks, min.len = 1, any.missing = FALSE, null.ok = TRUE) |
753 |
- checkmate::assertSubset(chrToKeep_peaks, GRN@data$peaks$consensusPeaks %>% dplyr::pull(.data$chr) %>% unique() %>% as.character()) |
|
698 |
+ checkmate::assertSubset(chrToKeep_peaks, GRN@data$peaks$counts_metadata %>% dplyr::pull(.data$chr) %>% unique() %>% as.character()) |
|
754 | 699 |
|
755 | 700 |
checkmate::assertIntegerish(minSize_peaks, lower = 1, null.ok = TRUE) |
756 | 701 |
checkmate::assertIntegerish(maxSize_peaks, lower = dplyr::if_else(is.null(minSize_peaks), 1, minSize_peaks), null.ok = TRUE) |
... | ... |
@@ -762,8 +707,7 @@ filterData <- function (GRN, |
762 | 707 |
|
763 | 708 |
if (!GRN@config$isFiltered | forceRerun) { |
764 | 709 |
|
765 |
- GRN@data$peaks$consensusPeaks$isFiltered = FALSE |
|
766 |
- GRN@data$peaks$counts_norm$isFiltered = FALSE |
|
710 |
+ GRN@data$peaks$counts_metadata$isFiltered = FALSE |
|
767 | 711 |
|
768 | 712 |
if(!is.null(GRN@data$TFs$TF_peak_overlap)) { |
769 | 713 |
GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = 0 |
... | ... |
@@ -782,7 +726,7 @@ filterData <- function (GRN, |
782 | 726 |
chrToKeep_peaks, |
783 | 727 |
minSize_peaks = minSize_peaks, maxSize_peaks = maxSize_peaks) |
784 | 728 |
|
785 |
- nPeaksBefore = nrow(GRN@data$peaks$consensusPeaks) |
|
729 |
+ nPeaksBefore = nrow(GRN@data$peaks$counts_metadata) |
|
786 | 730 |
peaks_toKeep = intersect(peakIDs.chr, peakIDs.CV) |
787 | 731 |
futile.logger::flog.info(paste0("Collectively, filter ", nPeaksBefore -length(peaks_toKeep), " out of ", nPeaksBefore, " peaks.")) |
788 | 732 |
futile.logger::flog.info(paste0("Number of remaining peaks: ", length(peaks_toKeep))) |
... | ... |
@@ -792,9 +736,9 @@ filterData <- function (GRN, |
792 | 736 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
793 | 737 |
} |
794 | 738 |
|
795 |
- GRN@data$peaks$consensusPeaks$isFiltered = ! GRN@data$peaks$consensusPeaks$peakID %in% peaks_toKeep |
|
739 |
+ GRN@data$peaks$counts_metadata$isFiltered = ! GRN@data$peaks$counts_metadata$peakID %in% peaks_toKeep |
|
796 | 740 |
#GRN@data$peaks$counts_raw$isFiltered = ! GRN@data$peaks$counts_raw$peakID %in% peaks_toKeep |
797 |
- GRN@data$peaks$counts_norm$isFiltered = ! GRN@data$peaks$counts_norm$peakID %in% peaks_toKeep |
|
741 |
+ GRN@data$peaks$counts_metadata$isFiltered = ! GRN@data$peaks$counts_metadata$peakID %in% peaks_toKeep |
|
798 | 742 |
|
799 | 743 |
|
800 | 744 |
if(!is.null(GRN@data$TFs$TF_peak_overlap)) { |
... | ... |
@@ -816,12 +760,10 @@ filterData <- function (GRN, |
816 | 760 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
817 | 761 |
} |
818 | 762 |
|
819 |
- for (permutationCur in c(0)) { |
|
820 |
- permIndex = as.character(permutationCur) |
|
821 |
- rowMeans = rowMeans(dplyr::select(GRN@data$RNA$counts_norm.l[[permIndex]], -.data$ENSEMBL)) |
|
822 |
- GRN@data$RNA$counts_norm.l[[permIndex]]$isFiltered = rowMeans < minNormalizedMeanRNA |
|
823 |
- } |
|
824 |
- nRowsFlagged = length(which(GRN@data$RNA$counts_norm.l[["0"]]$isFiltered)) |
|
763 |
+ rowMeans = rowMeans(getCounts(GRN, type = "rna", asMatrix = TRUE, includeFiltered = TRUE)) |
|
764 |
+ GRN@data$RNA$counts_metadata$isFiltered = rowMeans < minNormalizedMeanRNA |
|
765 |
+ |
|
766 |
+ nRowsFlagged = length(which(GRN@data$RNA$counts_metadata$isFiltered)) |
|
825 | 767 |
|
826 | 768 |
# Raw counts are left untouched and filtered where needed only |
827 | 769 |
futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows because the row mean was smaller than ", minNormalizedMeanRNA)) |
... | ... |
@@ -844,26 +786,17 @@ filterData <- function (GRN, |
844 | 786 |
} |
845 | 787 |
|
846 | 788 |
if (is.null(chrToKeep)) { |
847 |
- chrToKeep = GRN@data$peaks$consensusPeaks %>% dplyr::pull(.data$chr) %>% unique() |
|
789 |
+ chrToKeep = GRN@data$peaks$counts_metadata %>% dplyr::pull(.data$chr) %>% unique() |
|
848 | 790 |
} else { |
849 | 791 |
futile.logger::flog.info(paste0("Filter and sort peaks and remain only those on the following chromosomes: ", paste0(chrToKeep, collapse = ","))) |
850 | 792 |
} |
851 | 793 |
|
852 | 794 |
|
853 | 795 |
futile.logger::flog.info(paste0("Filter and sort peaks by size and remain only those smaller than : ", maxSize_peaks)) |
854 |
- futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$consensusPeaks))) |
|
855 |
- ids = strsplit(GRN@data$peaks$consensusPeaks %>% dplyr::pull(!!(idColumn)), split = ":", fixed = TRUE) |
|
856 |
- ids_chr = sapply(ids, "[[", 1) |
|
857 |
- ids_pos = sapply(ids, "[[", 2) |
|
858 |
- ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE) |
|
859 |
- start = sapply(ids_pos, "[[", 1) |
|
860 |
- end = sapply(ids_pos, "[[", 2) |
|
796 |
+ futile.logger::flog.info(paste0(" Number of peaks before filtering: ", nrow(GRN@data$peaks$counts_metadata))) |
|
861 | 797 |
|
862 |
- countsPeaks.clean = GRN@data$peaks$consensusPeaks %>% |
|
863 |
- dplyr::mutate(chr = ids_chr, |
|
864 |
- start = as.numeric(start), |
|
865 |
- end = as.numeric(end), |
|
866 |
- size = end-start) %>% |
|
798 |
+ countsPeaks.clean = GRN@data$peaks$counts_metadata %>% |
|
799 |
+ dplyr::mutate(size = end-start) %>% |
|
867 | 800 |
dplyr::filter(.data$chr %in% chrToKeep, .data$size <= maxSize_peaks, .data$size >= minSize_peaks) %>% |
868 | 801 |
# arrange(chr, start) %>% |
869 | 802 |
dplyr::rename(peakID = !!(idColumn)) %>% |
... | ... |
@@ -883,12 +816,12 @@ filterData <- function (GRN, |
883 | 816 |
if (is.null(maxCV)) { |
884 | 817 |
futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV)) |
885 | 818 |
|
886 |
- peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, .data$peak.CV >= minCV) |
|
819 |
+ peaksFiltered = dplyr::filter(GRN@annotation$peaks, .data$peak.CV >= minCV) |
|
887 | 820 |
|
888 | 821 |
} else { |
889 | 822 |
futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV, ", max CV = ", maxCV)) |
890 | 823 |
|
891 |
- peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, .data$peak.CV >= minCV, .data$peak.CV <= maxCV) |
|
824 |
+ peaksFiltered = dplyr::filter(GRN@annotation$peaks, .data$peak.CV >= minCV, .data$peak.CV <= maxCV) |
|
892 | 825 |
} |
893 | 826 |
|
894 | 827 |
futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered))) |
... | ... |
@@ -902,7 +835,7 @@ filterData <- function (GRN, |
902 | 835 |
|
903 | 836 |
startTime = Sys.time() |
904 | 837 |
|
905 |
- futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$consensusPeaks))) |
|
838 |
+ futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$peaks))) |
|
906 | 839 |
|
907 | 840 |
if (is.null(minCV)) { |
908 | 841 |
minCV = 0 |
... | ... |
@@ -931,7 +864,7 @@ filterData <- function (GRN, |
931 | 864 |
} |
932 | 865 |
|
933 | 866 |
|
934 |
- peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, |
|
867 |
+ peaksFiltered = dplyr::filter(GRN@annotation$peaks, |
|
935 | 868 |
.data$peak.CV >= minCV, .data$peak.CV <= maxCV, |
936 | 869 |
.data$peak.mean >= minMean, .data$peak.mean <= maxMean) |
937 | 870 |
|
... | ... |
@@ -1026,7 +959,7 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte |
1026 | 959 |
|
1027 | 960 |
GRN@config$TFBS_fileEnding = fileEnding |
1028 | 961 |
GRN@config$TFBS_filePattern = filesTFBSPattern |
1029 |
- GRN@annotation$TFs = .getFinalListOfTFs(motifFolder, filesTFBSPattern, fileEnding, TFs, nTFMax, getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE)) |
|
962 |
+ GRN@annotation$TFs = .getFinalListOfTFs(motifFolder, filesTFBSPattern, fileEnding, TFs, nTFMax, getCounts(GRN, type = "rna", permuted = FALSE)) |
|
1030 | 963 |
|
1031 | 964 |
|
1032 | 965 |
GRN@annotation$TFs = GRN@annotation$TFs %>% |
... | ... |
@@ -1162,7 +1095,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) { |
1162 | 1095 |
|
1163 | 1096 |
|
1164 | 1097 |
# Check whether we have peaks on chromosomes not part of the sequence length reference. If yes, discard them |
1165 |
- annotation_discared = dplyr::filter(GRN@data$peaks$consensusPeaks, ! .data$chr %in% names(seqlengths)) |
|
1098 |
+ annotation_discared = dplyr::filter(GRN@data$peaks$counts_metadata, ! .data$chr %in% names(seqlengths)) |
|
1166 | 1099 |
|
1167 | 1100 |
if (nrow(annotation_discared) > 0) { |
1168 | 1101 |
|
... | ... |
@@ -1174,11 +1107,11 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) { |
1174 | 1107 |
paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ",")) |
1175 | 1108 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) |
1176 | 1109 |
|
1177 |
- GRN@data$peaks$consensusPeaks = dplyr::filter(GRN@data$peaks$consensusPeaks, .data$chr %in% names(seqlengths)) |
|
1110 |
+ GRN@data$peaks$counts_metadata = dplyr::filter(GRN@data$peaks$counts_metadata, .data$chr %in% names(seqlengths)) |
|
1178 | 1111 |
} |
1179 | 1112 |
|
1180 | 1113 |
# Construct GRanges |
1181 |
- consensus.gr = .constructGRanges(GRN@data$peaks$consensusPeaks, seqlengths = seqlengths, genomeAssembly) |
|
1114 |
+ consensus.gr = .constructGRanges(GRN@data$peaks$counts_metadata, seqlengths = seqlengths, genomeAssembly) |
|
1182 | 1115 |
|
1183 | 1116 |
res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = GRN@config$allTF, |
1184 | 1117 |
iteration = seq_len(length(GRN@config$allTF)), |
... | ... |
@@ -1199,22 +1132,22 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) { |
1199 | 1132 |
} |
1200 | 1133 |
|
1201 | 1134 |
# new |
1202 |
- filteredPeaks = dplyr::filter(GRN@data$peaks$counts_norm, .data$isFiltered) %>% dplyr::pull(.data$peakID) |
|
1135 |
+ filteredPeaks = dplyr::filter(GRN@data$peaks$counts_metadata, .data$isFiltered) %>% dplyr::pull(.data$peakID) |
|
1203 | 1136 |
# Collect binary 0/1 binding matrix from all TF and concatenate |
1204 | 1137 |
GRN@data$TFs$TF_peak_overlap = TFBS_bindingMatrix.df %>% |
1205 |
- dplyr::mutate(peakID = GRN@data$peaks$consensusPeaks$peakID, |
|
1138 |
+ dplyr::mutate(peakID = GRN@data$peaks$counts_metadata$peakID, |
|
1206 | 1139 |
isFiltered = .data$peakID %in% filteredPeaks) %>% |
1207 | 1140 |
dplyr::mutate_if(is.logical, as.numeric) %>% |
1208 | 1141 |
dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), .data$isFiltered) |
1209 | 1142 |
|
1210 | 1143 |
GRN@data$TFs$TF_peak_overlap = .asSparseMatrix(as.matrix(GRN@data$TFs$TF_peak_overlap), |
1211 | 1144 |
convertNA_to_zero = FALSE, |
1212 |
- dimnames = list(GRN@data$peaks$consensusPeaks$peakID, colnames(GRN@data$TFs$TF_peak_overlap))) |
|
1145 |
+ dimnames = list(GRN@data$peaks$counts_metadata$peakID, colnames(GRN@data$TFs$TF_peak_overlap))) |
|
1213 | 1146 |
|
1214 | 1147 |
# The order of rows is here the sorted version as it originates from the sorted consensus peak file |
1215 | 1148 |
# We resort it to match the countsPeaks.norm |
1216 | 1149 |
# TODO: Here could be an error due to differences in sorting. Also, whether or not all or the filtered version shall be used |
1217 |
- stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_norm$peakID)) |
|
1150 |
+ stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_metadata$peakID)) |
|
1218 | 1151 |
|
1219 | 1152 |
} |
1220 | 1153 |
|
... | ... |
@@ -1274,7 +1207,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) { |
1274 | 1207 |
futile.logger::flog.info(paste0(" Calculating intersection for TF ", TFCur, " finished. Number of overlapping TFBS after filtering: ", nrow(final.df))) |
1275 | 1208 |
|
1276 | 1209 |
|
1277 |
- return(GRN@data$peaks$consensusPeaks$peakID %in% final.df$peakID) |
|
1210 |
+ return(GRN@data$peaks$counts_metadata$peakID %in% final.df$peakID) |
|
1278 | 1211 |
|
1279 | 1212 |
} |
1280 | 1213 |
|
... | ... |
@@ -1415,6 +1348,9 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac |
1415 | 1348 |
checkmate::assertCharacter(name, min.chars = 1, len = 1) |
1416 | 1349 |
checkmate::assertFlag(forceRerun) |
1417 | 1350 |
|
1351 |
+ message = paste0("This function is currently under development and testing and not fully functional yet.") |
|
1352 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
1353 |
+ |
|
1418 | 1354 |
forbiddenNames = "expression" |
1419 | 1355 |
.checkForbiddenNames(name, forbiddenNames) |
1420 | 1356 |
|
... | ... |
@@ -1424,7 +1360,8 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac |
1424 | 1360 |
|
1425 | 1361 |
|
1426 | 1362 |
# Input: Raw peak counts per TF and TF-binding matrix |
1427 |
- counts.df = getCounts(GRN, type = "peaks", norm = FALSE, permuted = FALSE) %>% |
|
1363 |
+ # TODO: How to add raw counts here |
|
1364 |
+ counts.df = getCounts(GRN, type = "peaks", permuted = FALSE) %>% |
|
1428 | 1365 |
tibble::as_tibble() |
1429 | 1366 |
|
1430 | 1367 |
# TODO replace by getter |
... | ... |
@@ -1722,7 +1659,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05 |
1722 | 1659 |
outputFolder = .checkOutputFolder(GRN, outputFolder) |
1723 | 1660 |
|
1724 | 1661 |
GRN@data$TFs$classification$TF.translation.orig = GRN@annotation$TFs %>% |
1725 |
- dplyr::mutate(TF.name = .data$HOCOID) |
|
1662 |
+ dplyr::mutate(TF.name = .data$TF.HOCOID) |
|
1726 | 1663 |
|
1727 | 1664 |
if (is.null(GRN@data$TFs$TF_peak_overlap)) { |
1728 | 1665 |
message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again") |
... | ... |
@@ -1763,7 +1700,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05 |
1763 | 1700 |
GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]] = list() |
1764 | 1701 |
|
1765 | 1702 |
if (connectionTypeCur == "expression") { |
1766 |
- counts1 = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur)) |
|
1703 |
+ counts1 = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur)) |
|
1767 | 1704 |
|
1768 | 1705 |
} else { |
1769 | 1706 |
|
... | ... |
@@ -1775,7 +1712,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05 |
1775 | 1712 |
|
1776 | 1713 |
futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts")) |
1777 | 1714 |
|
1778 |
- counts_peaks = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE) |
|
1715 |
+ counts_peaks = getCounts(GRN, type = "peaks", permuted = FALSE) |
|
1779 | 1716 |
|
1780 | 1717 |
TF_peak_cor = .correlateMatrices(matrix1 = counts1, |
1781 | 1718 |
matrix_peaks = counts_peaks, |
... | ... |
@@ -1792,6 +1729,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05 |
1792 | 1729 |
} |
1793 | 1730 |
|
1794 | 1731 |
# Final classification: Calculate thresholds by calculating the quantiles of the background and compare the real values to the background |
1732 |
+ # TODO: Clarify whether the default convertZero_to_NA=TRUE is really needed here |
|
1795 | 1733 |
if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l) | forceRerun) { |
1796 | 1734 |
GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l = |
1797 | 1735 |
.calculate_classificationThresholds(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background), |
... | ... |
@@ -1905,8 +1843,6 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05 |
1905 | 1843 |
|
1906 | 1844 |
######## Connections ######## |
1907 | 1845 |
|
1908 |
-# TODO: WHere in the code TFs that are not available for expression are filtered? |
|
1909 |
- |
|
1910 | 1846 |
#' Add TF-peak connections to a \code{\linkS4class{GRN}} object |
1911 | 1847 |
#' |
1912 | 1848 |
#' After the execution of this function, QC plots can be plotted with the function \code{\link{plotDiagnosticPlots_TFPeaks}} unless this has already been done by default due to \code{plotDiagnosticPlots = TRUE} |
... | ... |
@@ -1998,6 +1934,9 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
1998 | 1934 |
GRN@connections$TF_peaks[[permIndex]]$main = .optimizeSpaceGRN(stats::na.omit(resFDR.l[["main"]])) |
1999 | 1935 |
GRN@connections$TF_peaks[[permIndex]]$connectionStats = resFDR.l[["connectionStats"]] |
2000 | 1936 |
|
1937 |
+ futile.logger::flog.info(paste0("Finished. Stored ", nrow(GRN@connections$TF_peaks[[permIndex]]$main), " connections with an FDR <= ", maxFDRToStore)) |
|
1938 |
+ |
|
1939 |
+ |
|
2001 | 1940 |
# GC plots, empty when no GC correction should be done |
2002 | 1941 |
GRN@stats$plots_GC = resFDR.l[["plots_GC"]] |
2003 | 1942 |
rm(resFDR.l) |
... | ... |
@@ -2023,9 +1962,10 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
2023 | 1962 |
|
2024 | 1963 |
|
2025 | 1964 |
.computeTF_peak.fdr <- function(GRN, perm, connectionTypes, corMethod = "pearson", useGCCorrection = FALSE, |
2026 |
- removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, percBackground_resample = TRUE, plotDetails = FALSE) { |
|
1965 |
+ removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, percBackground_resample = TRUE, plotDetails = FALSE, backgroundSize_min = 1000) { |
|
2027 | 1966 |
|
2028 | 1967 |
start = Sys.time() |
1968 |
+ checkmate::assertIntegerish(backgroundSize_min, lower = 100) |
|
2029 | 1969 |
|
2030 | 1970 |
if (plotDetails) { |
2031 | 1971 |
message = "Plotting details is not supported currently. Set plotDetails = FALSE." |
... | ... |
@@ -2050,7 +1990,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
2050 | 1990 |
|
2051 | 1991 |
if (connectionTypeCur == "expression") { |
2052 | 1992 |
|
2053 |
- counts_connectionTypeCur = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm)) |
|
1993 |
+ counts_connectionTypeCur = getCounts(GRN, type = "rna", permuted = as.logical(perm)) |
|
2054 | 1994 |
|
2055 | 1995 |
} else { |
2056 | 1996 |
|
... | ... |
@@ -2062,7 +2002,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
2062 | 2002 |
|
2063 | 2003 |
futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts")) |
2064 | 2004 |
|
2065 |
- counts_peaks = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE) |
|
2005 |
+ counts_peaks = getCounts(GRN, type = "peaks", permuted = FALSE) |
|
2066 | 2006 |
|
2067 | 2007 |
# Filtering of the matrices happens automatically within the next function |
2068 | 2008 |
peaksCor.m = .correlateMatrices( matrix1= counts_connectionTypeCur, |
... | ... |
@@ -2095,7 +2035,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
2095 | 2035 |
|
2096 | 2036 |
pb <- progress::progress_bar$new(total = length(allTF)) |
2097 | 2037 |
|
2098 |
- peaksFiltered = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered) |
|
2038 |
+ peaksFiltered = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered) |
|
2099 | 2039 |
|
2100 | 2040 |
if (!useGCCorrection) { |
2101 | 2041 |
minPerc = 100 |
... | ... |
@@ -2164,9 +2104,8 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
2164 | 2104 |
|
2165 | 2105 |
targetNoPeaks = minPerc/100 * nPeaksBackground |
2166 | 2106 |
|
2167 |
- # TODO: Make 1000 a parameter stored somewhere |
|
2168 | 2107 |
# Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions |
2169 |
- if (targetNoPeaks < 1000) { |
|
2108 |
+ if (targetNoPeaks < backgroundSize_min) { |
|
2170 | 2109 |
|
2171 | 2110 |
if (!is.null(percBackground_size)) { |
2172 | 2111 |
#futile.logger::flog.warn(paste0("Number of peaks in background is smaller than 1000 for TF ", TFCur, ". Increasing in steps of 5% until a value > 1000 is found.This warning results from a too low value for the parameter percBackground_size")) |
... | ... |
@@ -2401,15 +2340,13 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails |
2401 | 2340 |
|
2402 | 2341 |
} # end for each TF |
2403 | 2342 |
|
2404 |
- futile.logger::flog.info(paste0("Stored ", nrow(tblFilt.df), " connections with an FDR <= ", maxFDRToStore)) |
|
2405 |
- |
|
2406 | 2343 |
|
2407 | 2344 |
.printExecutionTime(start2, prefix = " ") |
2408 | 2345 |
|
2409 | 2346 |
} # end for each connectionType |
2410 | 2347 |
|
2411 | 2348 |
|
2412 |
- .printExecutionTime(start) |
|
2349 |
+ .printExecutionTime(start, prefix = "") |
|
2413 | 2350 |
list(main = data.table::rbindlist(connections_all.l), |
2414 | 2351 |
connectionStats = data.table::rbindlist(connectionStats_all.l), |
2415 | 2352 |
plots_GC = plots_GC.l |
... | ... |
@@ -2646,14 +2583,23 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2646 | 2583 |
|
2647 | 2584 |
# correct ranges if within the promoterRange from the chr. starts and ends |
2648 | 2585 |
query = GenomicRanges::trim(query) |
2586 |
+ |
|
2587 |
+ subject = GRN@annotation$genes %>% |
|
2588 |
+ dplyr::select(-.data$gene.mean, -.data$gene.median, -.data$gene.CV) %>% |
|
2589 |
+ dplyr::filter(!is.na(.data$gene.start), !is.na(.data$gene.end)) |
|
2649 | 2590 |
|
2650 |
- subject = .getKnownGeneAnnotationNew(GRN, gene.types = gene.types, extendRegions = NULL) |
|
2591 |
+ if (!is.null(gene.types)) { |
|
2592 |
+ if (! "all" %in% gene.types) { |
|
2593 |
+ subject = dplyr::filter(subject, .data$gene.type %in% gene.types) |
|
2594 |
+ } |
|
2595 |
+ } |
|
2596 |
+ |
|
2651 | 2597 |
|
2652 | 2598 |
requiredColnames = c("gene.ENSEMBL","gene.type", "gene.name") |
2653 | 2599 |
checkmate::assertSubset(requiredColnames, colnames(subject), empty.ok = FALSE) |
2654 | 2600 |
|
2655 | 2601 |
subject.gr = GenomicRanges::makeGRangesFromDataFrame(subject, keep.extra.columns = TRUE) |
2656 |
- |
|
2602 |
+ |
|
2657 | 2603 |
if (overlapTypeGene == "TSS") { |
2658 | 2604 |
# Take only the 5' end of the gene (start site and NOT the full gene length) |
2659 | 2605 |
end(subject.gr) = start(subject.gr) |
... | ... |
@@ -2725,16 +2671,12 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2725 | 2671 |
|
2726 | 2672 |
genomeAssembly = GRN@config$parameters$genomeAssembly |
2727 | 2673 |
|
2728 |
- # checkmate::assertSubset(gene.types, c("all", names(.getAllGeneTypesAndFrequencies(GRN@config$parameters$genomeAssembly, verbose = FALSE))), empty.ok = FALSE) |
|
2729 |
- |
|
2674 |
+ |
|
2730 | 2675 |
if (is.null(nCores)) { |
2731 | 2676 |
nCores = 1 |
2732 | 2677 |
} |
2733 | 2678 |
|
2734 |
- |
|
2735 |
- #consensusPeaks = .createDF_fromCoordinates(getCounts(GRN, type = "peaks", norm = TRUE, 0)$peakID, "peakID") |
|
2736 |
- |
|
2737 |
- consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!.data$isFiltered) |
|
2679 |
+ consensusPeaks = GRN@data$peaks$counts_metadata %>% dplyr::filter(!.data$isFiltered) |
|
2738 | 2680 |
# Preprocess TAD boundaries |
2739 | 2681 |
if (!is.null(TADs)) { |
2740 | 2682 |
|
... | ... |
@@ -2853,12 +2795,12 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2853 | 2795 |
|
2854 | 2796 |
permIndex = as.character(perm) |
2855 | 2797 |
|
2856 |
- countsPeaks.clean = dplyr::select(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE), -.data$peakID) |
|
2857 |
- countsRNA.clean = dplyr::select(getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm)), -.data$ENSEMBL) |
|
2798 |
+ countsPeaks.clean = getCounts(GRN, type = "peaks", permuted = FALSE, includeIDColumn = FALSE) |
|
2799 |
+ countsRNA.clean = getCounts(GRN, type = "rna", permuted = as.logical(perm), includeIDColumn = FALSE) |
|
2858 | 2800 |
|
2859 | 2801 |
# Cleverly construct the count matrices so we do the correlations in one go |
2860 |
- map_peaks = match(overlaps.sub.filt.df$peak.ID, getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID) |
|
2861 |
- map_rna = match(overlaps.sub.filt.df$gene.ENSEMBL, getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))$ENSEMBL) # may contain NA values because the gene is not actually in the RNA-seq counts |
|
2802 |
+ map_peaks = match(overlaps.sub.filt.df$peak.ID, getCounts(GRN, type = "peaks", permuted = FALSE)$peakID) |
|
2803 |
+ map_rna = match(overlaps.sub.filt.df$gene.ENSEMBL, getCounts(GRN, type = "rna", permuted = as.logical(perm))$ENSEMBL) # may contain NA values because the gene is not actually in the RNA-seq counts |
|
2862 | 2804 |
|
2863 | 2805 |
# There should not b any NA because it is about the peaks |
2864 | 2806 |
stopifnot(all(!is.na(map_peaks))) |
... | ... |
@@ -2901,8 +2843,8 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2901 | 2843 |
|
2902 | 2844 |
# Make data frame and adjust p-values |
2903 | 2845 |
res.df = suppressMessages(tibble::as_tibble(res.m) %>% |
2904 |
- dplyr::mutate(peak.ID = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID[map_peaks], |
|
2905 |
- gene.ENSEMBL = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))$ENSEMBL[map_rna]) %>% |
|
2846 |
+ dplyr::mutate(peak.ID = getCounts(GRN, type = "peaks", permuted = FALSE)$peakID[map_peaks], |
|
2847 |
+ gene.ENSEMBL = getCounts(GRN, type = "rna", permuted = as.logical(perm))$ENSEMBL[map_rna]) %>% |
|
2906 | 2848 |
dplyr::filter(!is.na(.data$gene.ENSEMBL)) %>% # For some peak-gene combinations, no RNA-Seq data was available, these NAs are filtered |
2907 | 2849 |
# Add gene annotation and distance |
2908 | 2850 |
dplyr::left_join(overlaps.sub.filt.df, by = c("gene.ENSEMBL", "peak.ID")) %>% |
... | ... |
@@ -2999,9 +2941,9 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
2999 | 2941 |
# TODO: perm and GRN not known here |
3000 | 2942 |
stop("Not implemeneted yet") |
3001 | 2943 |
# g = ggplot2::ggplot(dataCur.df, ggplot2::aes(data1, data2)) + ggplot2::geom_point() + ggplot2::geom_smooth(method ="lm") + |
3002 |
- # ggplot2::ggtitle(paste0(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID[map1[i]], |
|
2944 |
+ # ggplot2::ggtitle(paste0(getCounts(GRN, type = "peaks", permuted = FALSE)$peakID[map1[i]], |
|
3003 | 2945 |
# ", ", |
3004 |
- # getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))$ENSEMBL[map2 [i]], |
|
2946 |
+ # getCounts(GRN, type = "rna", permuted = as.logical(perm))$ENSEMBL[map2 [i]], |
|
3005 | 2947 |
# ", p = ", round(res$p.value, 3))) + |
3006 | 2948 |
# ggplot2::theme_bw() |
3007 | 2949 |
# plot(g) |
... | ... |
@@ -3028,8 +2970,9 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
3028 | 2970 |
#' First, filtering both TF-peak and peak-gene connections according to different criteria such as FDR and other properties |
3029 | 2971 |
#' Second, joining the three major elements that an eGRN consist of (TFs, peaks, genes) into one data frame, with one row per unique TF-peak-gene connection. |
3030 | 2972 |
#' \strong{After successful execution, the connections (along with additional feature metadata) can be retrieved with the function \code{\link{getGRNConnections}}.} |
3031 |
-#' \strong{Note that the eGRN graph is reset upon successful execution of this function, and re-running the function \code{\link{build_eGRN_graph}} |
|
3032 |
-#' is necessary when any of the network functions of the package shall be executed.} |
|
2973 |
+#' \strong{Note that a previously stored eGRN graph is reset upon successful execution of this function along with printing a descriptive warning, |
|
2974 |
+#' and re-running the function \code{\link{build_eGRN_graph}} is necessary when any of the network functions of the package shall be executed. |
|
2975 |
+#' If the filtered connections changed, all network related enrichment functions also have to be rerun.} |
|
3033 | 2976 |
|
3034 | 2977 |
#' Internally, before joining them, both TF-peak links and peak-gene connections are filtered separately for reasons of memory and computational efficacy: |
3035 | 2978 |
#' First filtering out unwanted links dramatically reduces the memory needed for the full eGRN. Peak-gene p-value adjustment is only done after all filtering steps on the remaining set of |
... | ... |
@@ -3057,6 +3000,8 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = " |
3057 | 3000 |
#' @param filterPeaks Character vector. Default \code{NULL}. Vector of peak IDs (as named in the GRN object) to retain. All peaks not listed will be filtered out. |
3058 | 3001 |
#' @param TF_peak_FDR_selectViaCorBins \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Use a modified procedure for selecting TF-peak links that is based on the user-specified FDR but that retains also links that may have a higher FDR but a more extreme correlation. |
3059 | 3002 |
#' @param silent \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Print progress messages and filter statistics. |
3003 |
+#' @param resetGraphAndStoreInternally \code{TRUE} or \code{FALSE}. Default \code{TRUE}. If set to \code{TRUE}, the stored eGRN graph slot graph) is reset due to the potentially changed connections that |
|
3004 |
+#' would otherwise cause conflicts in the information stored in the object. Also, a GRN object is returned. If set to \code{FALSE}, only the new filtered connections are returned and the object is not altered. |
|
3060 | 3005 |
#' @param filterLoops \code{TRUE} or \code{FALSE}. Default \code{TRUE}. If a TF regulates itself (i.e., the TF and the gene are the same entity), should such loops be filtered from the GRN? |
3061 | 3006 |
#' @template outputFolder |
3062 | 3007 |
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. |
... | ... |
@@ -3090,6 +3035,7 @@ filterGRNAndConnectGenes <- function(GRN, |
3090 | 3035 |
TF_peak_FDR_selectViaCorBins = FALSE, |
3091 | 3036 |
filterLoops = TRUE, |
3092 | 3037 |
outputFolder = NULL, |
3038 |
+ resetGraphAndStoreInternally = TRUE, |
|
3093 | 3039 |
silent = FALSE) { |
3094 | 3040 |
|
3095 | 3041 |
start = Sys.time() |
... | ... |
@@ -3112,6 +3058,7 @@ filterGRNAndConnectGenes <- function(GRN, |
3112 | 3058 |
|
3113 | 3059 |
checkmate::assert(checkmate::checkIntegerish(peak_gene.IHW.nbins, lower = 1), checkmate::checkSubset(peak_gene.IHW.nbins, "auto")) |
3114 | 3060 |
|
3061 |
+ checkmate::assertFlag(resetGraphAndStoreInternally) |
|
3115 | 3062 |
checkmate::assertFlag(silent) |
3116 | 3063 |
checkmate::assertChoice(peak_gene.selection, c("all", "closest")) |
3117 | 3064 |
checkmate::assertFlag(allowMissingTFs) |
... | ... |
@@ -3490,19 +3437,25 @@ filterGRNAndConnectGenes <- function(GRN, |
3490 | 3437 |
|
3491 | 3438 |
} #end for all permutations |
3492 | 3439 |
|
3493 |
- if (!is.null(GRN@graph)) { |
|
3494 |
- futile.logger::flog.info(paste0("Reset the graph slot in the object. Rerun the method build_eGRN_graph.")) |
|
3495 |
- GRN@graph = list() |
|
3440 |
+ |
|
3441 |
+ if (!is.null(GRN@graph) & resetGraphAndStoreInternally) { |
|
3442 |
+ message = "To avoid object inconsistencies and unexpected/non-reproducible results, the graph slot in the object has been reset. For all network-related functions as well as eGRN visualization, rerun the method build_eGRN_graph and all other network-related ans enrichment functions to update to the new set of filtered connections" |
|
3443 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) |
|
3444 |
+ GRN@graph = list() |
|
3496 | 3445 |
} |
3497 | 3446 |
|
3498 | 3447 |
|
3499 | 3448 |
|
3500 | 3449 |
if (silent) futile.logger::flog.threshold(futile.logger::INFO) |
3501 | 3450 |
|
3502 |
- .printExecutionTime(start) |
|
3503 |
- |
|
3504 |
- GRN |
|
3451 |
+ if (!silent) .printExecutionTime(start) |
|
3505 | 3452 |
|
3453 |
+ if (!resetGraphAndStoreInternally) { |
|
3454 |
+ return(GRN@connections$all.filtered) |
|
3455 |
+ } else { |
|
3456 |
+ return(GRN) |
|
3457 |
+ } |
|
3458 |
+ |
|
3506 | 3459 |
} |
3507 | 3460 |
|
3508 | 3461 |
|
... | ... |
@@ -3725,10 +3678,10 @@ add_TF_gene_correlation <- function(GRN, corMethod = "pearson", addRobustRegress |
3725 | 3678 |
futile.logger::flog.info(paste0(" Iterate through ", maxRow, " TF-gene combinations and (if possible) calculate correlations using ", nCores, " cores. This may take a few minutes.")) |
3726 | 3679 |
|
3727 | 3680 |
|
3728 |
- countsRNA.clean = dplyr::select(getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur)), -.data$ENSEMBL) |
|
3681 |
+ countsRNA.clean = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur), includeIDColumn = FALSE) |
|
3729 | 3682 |
|
3730 |
- map_TF = match(TF_genePairs$TF.ENSEMBL, getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL) |
|
3731 |
- map_gene = match(TF_genePairs$gene.ENSEMBL, getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL) |
|
3683 |
+ map_TF = match(TF_genePairs$TF.ENSEMBL, getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL) |
|
3684 |
+ map_gene = match(TF_genePairs$gene.ENSEMBL, getCounts(GRN, type = "rna",permuted = as.logical(permutationCur))$ENSEMBL) |
|
3732 | 3685 |
|
3733 | 3686 |
# Some NAs might be expected, given our annotation contains all known genes |
3734 | 3687 |
stopifnot(!all(is.na(map_TF))) |
... | ... |
@@ -3760,8 +3713,8 @@ add_TF_gene_correlation <- function(GRN, corMethod = "pearson", addRobustRegress |
3760 | 3713 |
|
3761 | 3714 |
# Make data frame and adjust p-values |
3762 | 3715 |
res.df = suppressMessages(tibble::as_tibble(res.m) %>% |
3763 |
- dplyr::mutate(TF.ENSEMBL = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL[map_TF], |
|
3764 |
- gene.ENSEMBL = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))$ENSEMBL[map_gene]) %>% |
|
3716 |
+ dplyr::mutate(TF.ENSEMBL = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL[map_TF], |
|
3717 |
+ gene.ENSEMBL = getCounts(GRN, type = "rna", permuted = as.logical(permutationCur))$ENSEMBL[map_gene]) %>% |
|
3765 | 3718 |
dplyr::filter(!is.na(.data$gene.ENSEMBL), !is.na(.data$TF.ENSEMBL)) %>% # For some peak-gene combinations, no RNA-Seq data was available, these NAs are filtered |
3766 | 3719 |
dplyr::left_join(GRN@annotation$TFs, by = c("TF.ENSEMBL")) %>% |
3767 | 3720 |
dplyr::select(tidyselect::all_of(selectColumns))) %>% |
... | ... |
@@ -3927,7 +3880,8 @@ addSNPOverlap <- function(grn, SNPData, col_chr = "chr", col_pos = "pos", col_pe |
3927 | 3880 |
|
3928 | 3881 |
#' Generate a summary for the number of connections for different filtering criteria for a \code{\linkS4class{GRN}} object. |
3929 | 3882 |
#' |
3930 |
-#' This functions calls \code{\link{filterGRNAndConnectGenes}} repeatedly and stores the total number of connections and other statistics each time to summarize them afterwards. All arguments are identical to the ones in \code{\link{filterGRNAndConnectGenes}}, see the help for this function for details. |
|
3883 |
+#' This functions calls \code{\link{filterGRNAndConnectGenes}} repeatedly and stores the total number of connections and other statistics each time to summarize them afterwards. |
|
3884 |
+#' All arguments are identical to the ones in \code{\link{filterGRNAndConnectGenes}}, see the help for this function for details. |
|
3931 | 3885 |
#' The function \code{\link{plot_stats_connectionSummary}} can be used afterwards for plotting. |
3932 | 3886 |
#' |
3933 | 3887 |
#' @export |
... | ... |
@@ -3942,6 +3896,7 @@ addSNPOverlap <- function(grn, SNPData, col_chr = "chr", col_pos = "pos", col_pe |
3942 | 3896 |
#' @param allowMissingTFs Logical vector of length 1 or 2. Default \code{c(FALSE)}. Allow TFs to be missing for TF-peak connections? If both \code{FALSE} and \code{TRUE} are given, the code loops over both |
3943 | 3897 |
#' @template forceRerun |
3944 | 3898 |
#' @seealso \code{\link{plot_stats_connectionSummary}} |
3899 |
+#' @seealso \code{\link{filterGRNAndConnectGenes}} |
|
3945 | 3900 |
#' @return An updated \code{\linkS4class{GRN}} object, with additional information added from this function. |
3946 | 3901 |
#' @examples |
3947 | 3902 |
#' # See the Workflow vignette on the GRaNIE website for examples |
... | ... |
@@ -3957,8 +3912,7 @@ generateStatsSummary <- function(GRN, |
3957 | 3912 |
gene.types = c("protein_coding"), |
3958 | 3913 |
allowMissingGenes = c(FALSE, TRUE), |
3959 | 3914 |
allowMissingTFs = c(FALSE), |
3960 |
- forceRerun = FALSE |
|
3961 |
-) { |
|
3915 |
+ forceRerun = FALSE) { |
|
3962 | 3916 |
|
3963 | 3917 |
start = Sys.time() |
3964 | 3918 |
|
... | ... |
@@ -4033,7 +3987,7 @@ generateStatsSummary <- function(GRN, |
4033 | 3987 |
futile.logger::flog.debug(paste0(" TF_peak.connectionType = ", TF_peak.connectionTypeCur)) |
4034 | 3988 |
|
4035 | 3989 |
futile.logger::flog.threshold(futile.logger::WARN) |
4036 |
- GRN = filterGRNAndConnectGenes(GRN, |
|
3990 |
+ con.filt.l = filterGRNAndConnectGenes(GRN, |
|
4037 | 3991 |
TF_peak.fdr.threshold = TF_peak.fdr_cur, |
4038 | 3992 |
TF_peak.connectionTypes = TF_peak.connectionTypeCur, |
4039 | 3993 |
peak_gene.p_raw.threshold = NULL, |
... | ... |
@@ -4043,12 +3997,12 @@ generateStatsSummary <- function(GRN, |
4043 | 3997 |
allowMissingTFs = allowMissingTFsCur, |
4044 | 3998 |
peak_gene.r_range = peak_gene.r_range, |
4045 | 3999 |
filterTFs = NULL, filterGenes = NULL, filterPeaks = NULL, |
4046 |
- silent = FALSE) |
|
4000 |
+ resetGraphAndStoreInternally = FALSE, |
|
4001 |
+ silent = TRUE) |
|
4047 | 4002 |
|
4048 | 4003 |
futile.logger::flog.threshold(futile.logger::INFO) |
4049 | 4004 |
|
4050 |
- |
|
4051 |
- results.l = .addStats(GRN@stats$connections, GRN@connections$all.filtered[[permIndex]], |
|
4005 |
+ results.l = .addStats(GRN@stats$connections, con.filt.l[[permIndex]], |
|
4052 | 4006 |
perm = permutationCur, |
4053 | 4007 |
TF_peak.fdr = TF_peak.fdr_cur, TF_peak.connectionType = TF_peak.connectionTypeCur, |
4054 | 4008 |
peak_gene.p_raw = NA, |
... | ... |
@@ -4245,6 +4199,7 @@ generateStatsSummary <- function(GRN, |
4245 | 4199 |
loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://www.embl.de/download/zaugg/GRaNIE/GRN.rds") { |
4246 | 4200 |
|
4247 | 4201 |
checkmate::assertFlag(forceDownload) |
4202 |
+ options(timeout=200) |
|
4248 | 4203 |
|
4249 | 4204 |
if (!is.installed("BiocFileCache")) { |
4250 | 4205 |
|
... | ... |
@@ -4304,20 +4259,18 @@ loadExampleObject <- function(forceDownload = FALSE, fileURL = "https://www.embl |
4304 | 4259 |
#' |
4305 | 4260 |
#' @template GRN |
4306 | 4261 |
#' @param type Character. Either \code{peaks} or \code{rna}. \code{peaks} corresponds to the counts for the open chromatin data, while \code{rna} refers to th RNA-seq counts. If set to \code{rna}, both permuted and non-permuted data can be retrieved, while for \code{peaks}, only the non-permuted one (i.e., 0) can be retrieved. |
4307 |
-#' @param norm Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. Should original (often raw, but this may not necessarily be the case) or normalized counts be returned? |
|
4308 | 4262 |
#' @param asMatrix Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If set to \code{FALSE}, counts are returned as a data frame with or without an ID column (see \code{includeIDColumn}). If set to \code{TRUE}, counts are returned as a matrix with the ID column as row names. |
4309 | 4263 |
#' @param includeIDColumn Logical. \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Only relevant if \code{asMatrix = FALSE}. If set to \code{TRUE}, an explicit ID column is returned (no row names). If set to \code{FALSE}, the IDs are in the row names instead. |
4264 |
+#' @param includeFiltered Logical. \code{TRUE} or \code{FALSE}. Default \code{FALSE}. If set to \code{FALSE}, genes or peaks marked as filtered (after running the function \code{filterData}) will not be returned. If set to \code{TRUE}, all elements are returned regardless of the currently active filter status. |
|
4310 | 4265 |
#' @template permuted |
4311 | 4266 |
#' @export |
4312 | 4267 |
#' @import tibble |
4313 | 4268 |
#' @examples |
4314 | 4269 |
#' # See the Workflow vignette on the GRaNIE website for examples |
4315 | 4270 |
#' GRN = loadExampleObject() |
4316 |
-#' counts.df = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE) |
|
4271 |
+#' counts.df = getCounts(GRN, type = "peaks", permuted = FALSE) |
|
4317 | 4272 |
#' @return Data frame of counts, with the type as indicated by the function parameters. This function does **NOT** return a \code{\linkS4class{GRN}} object. |
4318 |
- |
|
4319 |
-# TODO document and implement two new arguments here |
|
4320 |
-getCounts <- function(GRN, type, norm = FALSE, permuted = FALSE, asMatrix = FALSE, includeIDColumn = TRUE) { |
|
4273 |
+getCounts <- function(GRN, type, permuted = FALSE, asMatrix = FALSE, includeIDColumn = TRUE, includeFiltered = FALSE) { |
|
4321 | 4274 |
|
4322 | 4275 |
checkmate::assertClass(GRN, "GRN") |
4323 | 4276 |
GRN = .addFunctionLogToObject(GRN) |
... | ... |
@@ -4325,92 +4278,86 @@ getCounts <- function(GRN, type, norm = FALSE, permuted = FALSE, asMatrix = FALS |
4325 | 4278 |
GRN = .makeObjectCompatible(GRN) |
4326 | 4279 |
|
4327 | 4280 |
checkmate::assertChoice(type, c("peaks", "rna")) |
4328 |
- checkmate::assertFlag(norm)# |
|
4329 | 4281 |
checkmate::assertFlag(permuted) |
4330 | 4282 |
|
4331 | 4283 |
permIndex = dplyr::if_else(permuted, "1", "0") |
4332 | 4284 |
|
4333 |
- if (!permuted) { |
|
4334 |
- if (type == "peaks") { |
|
4335 |
- |
|
4336 |
- if (norm ) { |
|
4337 |
- result = GRN@data$peaks$counts_norm |
|
4338 |
- |
|
4339 |
- } else { |
|
4340 |
- # Handle case when this is null due to already norm. counts as input |
|
4341 |
- |
|
4342 |
- # TODO: Unify the annotation column |
|
4343 |
- classPeaks = class(GRN@data$peaks$counts_orig) |
|
4344 |
- if ("DESeqDataSet" %in% classPeaks) { |
|
4345 |
- result = DESeq2::counts(GRN@data$peaks$counts_orig, norm = FALSE) %>% as.data.frame() %>% tibble::rownames_to_column("peakID") |
|
4346 |
- } else if ("matrix" %in% classPeaks) { |
|
4347 |
- result = GRN@data$peaks$counts_orig %>% dplyr::select(-.data$isFiltered) %>% as.data.frame() |
|
4348 |
- } else if ("dgCMatrix" %in% classPeaks) { |
|
4349 |
- result = .asMatrixFromSparse(GRN@data$peaks$counts_orig, convertZero_to_NA = FALSE) %>% dplyr::select(-.data$isFiltered) %>% as.data.frame() |
|
4350 |
- } else { |
|
4351 |
- message = paste0("Unsupported class for GRN@data$peaks$counts_orig. Contact the authors.") |
|
4352 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
4353 |
- } |
|
4354 |
- |
|
4355 |
- } |
|
4356 |
- |
|
4357 |
- |
|
4358 |
- } else if (type == "rna") { |
|
4359 |
- |
|
4360 |
- if (norm) { |
|
4361 |
- result = GRN@data$RNA$counts_norm.l[[permIndex]] |
|
4362 |
- |
|
4363 |
- } else { |
|
4364 |
- # TODO: Unify the annotation column |
|
4365 |
- classRNA = class(GRN@data$RNA$counts_orig) |
|
4366 |
- if ("DESeqDataSet" %in% classRNA) { |
|
4367 |
- result = DESeq2::counts(GRN@data$RNA$counts_orig, norm = FALSE) %>% |
|
4368 |
- as.data.frame() %>% tibble::rownames_to_column("ENSEMBL") |
|
4369 |
- } else if ("matrix" %in% classRNA) { |
|
4370 |
- result = GRN@data$RNA$counts_orig %>% |
|
4371 |
- dplyr::select(-.data$isFiltered) %>% as.data.frame() |
|
4372 |
- } else if ("dgCMatrix" %in% classRNA) { |
|
4373 |
- result = .asMatrixFromSparse(GRN@data$RNA$counts_orig, convertZero_to_NA = FALSE) %>% |
|
4374 |
- dplyr::select(-.data$isFiltered) %>% as.data.frame() %>% tibble::rownames_to_column("ENSEMBL") |
|
4375 |
- } else { |
|
4376 |
- message = paste0("Unsupported class for GRN@data$RNA$counts_orig. Contact the authors.") |
|
4377 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
4378 |
- } |
|
4379 |
- |
|
4380 |
- } |
|
4381 |
- |
|
4382 |
- } |
|
4383 |
- |
|
4384 |
- } else { # permutation 1 |
|
4285 |
+ if (type == "peaks") { |
|
4385 | 4286 |
|
4386 |
- if (type == "peaks") { |
|
4287 |
+ if (permuted) { |
|
4387 | 4288 |
message = "Could not find permuted peak counts in GRN object. Peaks are not stored as permuted, set permuted = FALSE for type = \"peaks\"" |
4388 | 4289 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
4290 |
+ } |
|
4291 |
+ |
|
4292 |
+ classPeaks = class(GRN@data$peaks$counts) |
|
4293 |
+ if ("matrix" %in% classPeaks) { |
|
4294 |
+ result = GRN@data$peaks$counts |
|
4295 |
+ } else if ("dgCMatrix" %in% classPeaks) { |
|
4296 |
+ result = .asMatrixFromSparse(GRN@data$peaks$counts, convertZero_to_NA = FALSE) |
|
4297 |
+ } else { |
|
4298 |
+ message = paste0("Unsupported class for GRN@data$peaks$counts. Contact the authors.") |
|
4299 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
4300 |
+ } |
|
4301 |
+ |
|
4302 |
+ |
|
4303 |
+ } else if (type == "rna") { |
|
4304 |
+ |
|
4305 |
+ classRNA = class(GRN@data$RNA$counts) |
|
4306 |
+ if ("matrix" %in% classRNA) { |
|
4307 |
+ result = GRN@data$RNA$counts |
|
4308 |
+ } else if ("dgCMatrix" %in% classRNA) { |
|
4309 |
+ result = .asMatrixFromSparse(GRN@data$RNA$counts, convertZero_to_NA = FALSE) |
|
4389 | 4310 |
} else { |
4311 |
+ message = paste0("Unsupported class for GRN@data$RNA$counts. Contact the authors.") |
|
4312 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
4313 |
+ } |
|
4314 |
+ |
|
4315 |
+ } |
|
4316 |
+ |
|
4317 |
+ |
|
4318 |
+ if (permuted) { |
|
4319 |
+ |
|
4320 |
+ if (type == "rna") { |
|
4390 | 4321 |
|
4391 |
- if (norm) { |
|
4392 |
- shuffledColumnOrder = GRN@data$RNA$counts_norm.l[[permIndex]] |
|
4393 |
- |
|
4394 |
- # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses |
|
4395 |
- result = GRN@data$RNA$counts_norm.l[["0"]][shuffledColumnOrder] %>% |
|
4396 |
- dplyr::select(.data$ENSEMBL, tidyselect::everything()) |
|
4397 |
- |
|
4398 |
- } else { |
|
4399 |
- message= "Could not find counts in object." |
|
4400 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
4401 |
- } |
|
4322 |
+ # Columns are shuffled so that non-matching samples are compared throughout the pipeline for all correlation-based analyses |
|
4323 |
+ colnames(result) = colnames(result)[GRN@data$RNA$counts_permuted_index] |
|
4402 | 4324 |
} |
4325 |
+ } |
|
4326 |
+ |
|
4327 |
+ if (!includeFiltered) { |
|
4328 |
+ |
|
4329 |
+ if (type == "rna") { |
|
4330 |
+ nonFiltered = which(! GRN@data$RNA$counts_metadata$isFiltered) |
|
4331 |
+ } else { |
|
4332 |
+ nonFiltered = which(! GRN@data$peaks$counts_metadata$isFiltered) |
|
4333 |
+ } |
|
4334 |
+ |
|
4335 |
+ result = result[nonFiltered,] |
|
4403 | 4336 |
} |
4337 |
+ |
|
4338 |
+ |
|
4404 | 4339 |
|
4405 |
- if ("isFiltered" %in% colnames(result)) { |
|
4406 |
- result = dplyr::filter(result, !.data$isFiltered) %>% dplyr::select(-.data$isFiltered) |
|
4340 |
+ if (!asMatrix) { |
|
4341 |
+ |
|
4342 |
+ result.df = result %>% |
|
4343 |
+ as.data.frame() |
|
4344 |
+ |
|
4345 |
+ if (includeIDColumn) { |
|
4346 |
+ |
|
4347 |
+ ID_column = dplyr::if_else(type == "rna", "ENSEMBL", "peakID") |
|
4348 |
+ |
|
4349 |
+ result.df = result.df %>% |
|
4350 |
+ tibble::rownames_to_column(ID_column) |
|
4351 |
+ } |
|
4352 |
+ |
|
4353 |
+ return(result.df) |
|
4354 |
+ |
|
4407 | 4355 |
} |
4408 | 4356 |
|
4409 | 4357 |
result |
4410 | 4358 |
} |
4411 | 4359 |
|
4412 | 4360 |
|
4413 |
- |
|
4414 | 4361 |
#' Extract connections or links from a \code{\linkS4class{GRN}} object as da data frame. |
4415 | 4362 |
#' |
4416 | 4363 |
#' Returns stored connections/links (either TF-peak, peak-genes or the filtered set of connections as produced by \code{\link{filterGRNAndConnectGenes}}). |
... | ... |
@@ -4479,7 +4426,7 @@ getGRNConnections <- function(GRN, type = "all.filtered", permuted = FALSE, |
4479 | 4426 |
if (include_variancePartitionResults) { |
4480 | 4427 |
|
4481 | 4428 |
if (ncol(GRN@annotation$genes %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0 | |
4482 |
- ncol(GRN@annotation$consensusPeaks %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0) { |
|
4429 |
+ ncol(GRN@annotation$peaks %>% dplyr::select(tidyselect::starts_with("variancePartition"))) == 0) { |
|
4483 | 4430 |
message = "Please run the function add_featureVariation first. " |
4484 | 4431 |
.checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
4485 | 4432 |
} |
... | ... |
@@ -4491,7 +4438,7 @@ getGRNConnections <- function(GRN, type = "all.filtered", permuted = FALSE, |
4491 | 4438 |
dplyr::left_join(GRN@annotation$genes %>% |
4492 | 4439 |
dplyr::select(.data$gene.ENSEMBL, tidyselect::starts_with("gene.variancePartition")), |
4493 | 4440 |
by = "gene.ENSEMBL") %>% |
4494 |
- dplyr::left_join(GRN@annotation$consensusPeaks %>% |
|
4441 |
+ dplyr::left_join(GRN@annotation$peaks %>% |
|
4495 | 4442 |
dplyr::select(.data$peak.ID, tidyselect::starts_with("peak.variancePartition")), |
4496 | 4443 |
by = "peak.ID") |
4497 | 4444 |
|
... | ... |
@@ -4508,10 +4455,10 @@ getGRNConnections <- function(GRN, type = "all.filtered", permuted = FALSE, |
4508 | 4455 |
|
4509 | 4456 |
if (include_peakMetadata) { |
4510 | 4457 |
|
4511 |
- colsMissing = setdiff(colnames(GRN@annotation$consensusPeaks), colnames(merged.df)) |
|
4458 |
+ colsMissing = setdiff(colnames(GRN@annotation$peaks), colnames(merged.df)) |
|
4512 | 4459 |
if (length(colsMissing) > 0) { |
4513 | 4460 |
merged.df = merged.df %>% |
4514 |
- dplyr::left_join(GRN@annotation$consensusPeaks, by = "peak.ID") |
|
4461 |
+ dplyr::left_join(GRN@annotation$peaks, by = "peak.ID") |
|
4515 | 4462 |
} |
4516 | 4463 |
} |
4517 | 4464 |
|
... | ... |
@@ -4809,27 +4756,27 @@ getBasic_metadata_visualization <- function(GRN, forceRerun = FALSE) { |
4809 | 4756 |
|
4810 | 4757 |
if (is.null(GRN@visualization$metadata) | forceRerun) { |
4811 | 4758 |
|
4812 |
- expMeans.m = getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE) %>% dplyr::select(-.data$ENSEMBL) %>% as.matrix() |
|
4759 |
+ expMeans.m = getCounts(GRN, type = "rna", permuted = FALSE, asMatrix = TRUE) |
|
4813 | 4760 |
|
4814 | 4761 |
baseMean = rowMeans(expMeans.m) |
4815 |
- expression.df = tibble::tibble(ENSEMBL_ID = getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE)$ENSEMBL, baseMean = baseMean) %>% |
|
4762 |
+ expression.df = tibble::tibble(ENSEMBL_ID = getCounts(GRN, type = "rna", permuted = FALSE, includeIDColumn = TRUE)$ENSEMBL, baseMean = baseMean) %>% |
|
4816 | 4763 |
dplyr::mutate(ENSEMBL_ID = gsub("\\..+", "", .data$ENSEMBL_ID, perl = TRUE), |
4817 | 4764 |
baseMean_log = log2(baseMean + 0.01)) |
4818 | 4765 |
|
4819 | 4766 |
expression_TF.df = dplyr::filter(expression.df, .data$ENSEMBL_ID %in% GRN@annotation$TFs$TF.ENSEMBL) %>% |
4820 | 4767 |
dplyr::left_join(GRN@annotation$TFs, by = c("ENSEMBL_ID" = "TF.ENSEMBL")) |
4821 | 4768 |
|
4822 |
- meanPeaks.df = tibble::tibble(peakID = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID, |
|
4823 |
- mean = rowMeans(dplyr::select(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE), -.data$peakID))) %>% |
|
4769 |
+ meanPeaks.df = tibble::tibble(peakID = getCounts(GRN, type = "peaks", permuted = FALSE)$peakID, |
|
4770 |
+ mean = rowMeans(getCounts(GRN, type = "peaks", permuted = FALSE, asMatrix = TRUE))) %>% |
|
4824 | 4771 |
dplyr::mutate(mean_log = log2(mean + 0.01)) |
4825 | 4772 |
|
4826 |
- metadata = list("RNA_expression_genes" = expression.df, |
|
4773 |
+ GRN@visualization$metadata = list("RNA_expression_genes" = expression.df, |
|
4827 | 4774 |
"RNA_expression_TF" = expression_TF.df, |
4828 | 4775 |
"Peaks_accessibility" = meanPeaks.df) |
4829 | 4776 |
|
4830 | 4777 |
} |
4831 | 4778 |
|
4832 |
- metadata |
|
4779 |
+ GRN |
|
4833 | 4780 |
|
4834 | 4781 |
} |
4835 | 4782 |
|
... | ... |
@@ -5113,22 +5060,17 @@ add_featureVariation <- function (GRN, |
5113 | 5060 |
futile.logger::flog.info(paste0("Using ", length(peaksToInclude), " peaks and ", length(genesToInclude), " TF/genes from the filtered connections.")) |
5114 | 5061 |
|
5115 | 5062 |
} else if (features == "all") { |
5116 |
- genesToInclude = GRN@data$RNA$counts_norm.l[["0"]]$ENSEMBL |
|
5117 |
- peaksToInclude = GRN@data$peaks$counts_norm$peakID |
|
5063 |
+ genesToInclude = GRN@data$RNA$counts_metadata$ENSEMBL |
|
5064 |
+ peaksToInclude = GRN@data$peaks$counts_metadata$peakID |
|
5118 | 5065 |
|
5119 | 5066 |
futile.logger::flog.info(paste0("Using all ", length(peaksToInclude), " peaks and ", length(genesToInclude), " TF/genes. This may take a long time. Consider setting features = \"all_filtered\".")) |
5120 | 5067 |
} |
5121 | 5068 |
|
5122 | 5069 |
data.l = list() |
5123 |
- data.l[["RNA"]] <- GRN@data$RNA$counts_norm.l[["0"]] %>% |
|
5124 |
- dplyr::filter(!.data$isFiltered, .data$ENSEMBL %in% genesToInclude) %>% |
|
5125 |
- dplyr::select(-.data$isFiltered) %>% |
|
5126 |
- tibble::column_to_rownames(var = "ENSEMBL") |
|
5127 |
- |
|
5128 |
- data.l[["peaks"]] <- GRN@data$peaks$counts_norm %>% |
|
5129 |
- dplyr::filter(!.data$isFiltered, .data$peakID %in% peaksToInclude) %>% |
|
5130 |
- dplyr::select(-.data$isFiltered) %>% |
|
5131 |
- tibble::column_to_rownames(var = "peakID") |
|
5070 |
+ #TODO |
|
5071 |
+ data.l[["RNA"]] <- getCounts(GRN, type = "rna", permuted = FALSE, includeFiltered = FALSE, includeIDColumn = FALSE) |
|
5072 |
+ |
|
5073 |
+ data.l[["peaks"]] <- getCounts(GRN, type = "peaks", permuted = FALSE, includeFiltered = FALSE, includeIDColumn = FALSE) |
|
5132 | 5074 |
|
5133 | 5075 |
|
5134 | 5076 |
for (dataType in c("RNA", "peaks")) { |
... | ... |
@@ -5169,7 +5111,7 @@ add_featureVariation <- function (GRN, |
5169 | 5111 |
|
5170 | 5112 |
colnames(res)[2:ncol(res)] = paste0("peak.", colnames(res)[2:ncol(res)]) |
5171 | 5113 |
|
5172 |
- GRN@annotation$consensusPeaks = GRN@annotation$consensusPeaks %>% |
|
5114 |
+ GRN@annotation$peaks = GRN@annotation$peaks %>% |
|
5173 | 5115 |
dplyr::select(-tidyselect::starts_with("variancePart")) %>% |
5174 | 5116 |
dplyr::left_join(res, by = c("peak.ID" = "ID")) |
5175 | 5117 |
} |
... | ... |
@@ -5198,34 +5140,88 @@ add_featureVariation <- function (GRN, |
5198 | 5140 |
} |
5199 | 5141 |
} |
5200 | 5142 |
|
5143 |
+# Converts from an older GRN object format to the most current one due to internal optimizations |
|
5201 | 5144 |
.makeObjectCompatible <- function(GRN) { |
5202 | 5145 |
|
5203 |
- if (is.null(GRN@annotation$TFs)) { |
|
5146 |
+ if (is.null(GRN@annotation$TFs) & !is.null(GRN@data$TFs$translationTable)) { |
|
5204 | 5147 |
GRN@annotation$TFs = GRN@data$TFs$translationTable |
5205 | 5148 |
GRN@data$TFs$translationTable = NULL |
5206 | 5149 |
} |
5207 | 5150 |
|
5208 |
- if (!"TF.ENSEMBL" %in% colnames(GRN@annotation$TFs)) { |
|
5209 |
- GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.ENSEMBL = .data$ENSEMBL) |
|
5210 |
- } |
|
5211 |
- if (!"TF.HOCOID" %in% colnames(GRN@annotation$TFs)) { |
|
5212 |
- GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.HOCOID = .data$HOCOID) |
|
5213 |
- } |
|
5151 |
+ if (!is.null(GRN@annotation$TFs)) { |
|
5214 | 5152 |
|
5215 |
- if (!"ENSEMBL" %in% colnames(GRN@annotation$TFs)) { |
|
5216 |
- GRN@annotation$TFs = dplyr::select(GRN@annotation$TFs, -.data$ENSEMBL) |
|
5153 |
+ if (!"TF.ENSEMBL" %in% colnames(GRN@annotation$TFs)) { |
|
5154 |
+ GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.ENSEMBL = .data$ENSEMBL) |
|
5155 |
+ } |
|
5156 |
+ if (!"TF.HOCOID" %in% colnames(GRN@annotation$TFs)) { |
|
5157 |
+ GRN@annotation$TFs = dplyr::rename(GRN@annotation$TFs, TF.HOCOID = .data$HOCOID) |
|
5158 |
+ } |
|
5159 |
+ |
|
5160 |
+ if ("ENSEMBL" %in% colnames(GRN@annotation$TFs)) { |
|
5161 |
+ GRN@annotation$TFs = dplyr::select(GRN@annotation$TFs, -.data$ENSEMBL) |
|
5162 |
+ } |
|
5217 | 5163 |
} |
5218 |
- |
|
5219 |
- |
|
5164 |
+ |
|
5220 | 5165 |
|
5221 | 5166 |
# Temporary fix: Replace lincRNA with lncRNA due to a recent change in biomaRt until we update the object directly |
5222 | 5167 |
if ("lncRNA" %in% levels(GRN@annotation$genes$gene.type)) { |
5223 | 5168 |
GRN@annotation$genes = dplyr::mutate(GRN@annotation$genes, gene.type = dplyr::recode_factor(.data$gene.type, lncRNA = "lincRNA")) |
5224 | 5169 |
} |
5225 |
- |
|
5226 | 5170 |
|
5171 |
+ if (is.null(GRN@annotation$peaks) & !is.null(GRN@annotation$consensusPeaks)) { |
|
5172 |
+ GRN@annotation[["peaks"]] = GRN@annotation[["consensusPeaks"]] |
|
5173 |
+ GRN@annotation[["consensusPeaks"]] = NULL |
|
5174 |
+ GRN@annotation[["peaks_obj"]] = GRN@annotation[["consensusPeaks_obj"]] |
|
5175 |
+ GRN@annotation[["consensusPeaks_obj"]] = NULL |
|
5176 |
+ |
|
5177 |
+ } |
|
5178 |
+ |
|
5179 |
+ # Renamed count slots and their structure |
|
5180 |
+ # 1. peaks |
|
5181 |
+ if (!is.null(GRN@data$peaks[["counts_orig"]])) { |
|
5182 |
+ GRN@data$peaks[["counts_orig"]] = NULL |
|
5183 |
+ } |
|
5184 |
+ if (is.null(GRN@data$peaks[["counts"]])) { |
|
5185 |
+ GRN@data$peaks[["counts"]] = .storeAsMatrixOrSparseMatrix(GRN, df = GRN@data$peaks$counts_norm %>% dplyr::select(-.data$isFiltered), |
|
5186 |
+ ID_column = "peakID", slotName = "GRN@data$peaks$counts") |
|
5187 |
+ } |
|
5188 |
+ if (!is.null(GRN@data$peaks[["counts_norm"]])) { |
|
5189 |
+ GRN@data$peaks[["counts_norm"]] = NULL |
|
5190 |
+ } |
|
5191 |
+ |
|
5192 |
+ if (is.null(GRN@data$peaks[["counts_metadata"]])) { |
|
5193 |
+ GRN@data$peaks[["counts_metadata"]] = .createConsensusPeaksDF(GRN@data$peaks[["counts"]] %>% as.data.frame() %>% tibble::rownames_to_column("peakID")) |
|
5194 |
+ stopifnot(c("chr", "start", "end", "peakID", "isFiltered") %in% colnames(GRN@data$peaks$counts_metadata)) |
|
5195 |
+ } |
|
5196 |
+ if (!is.null(GRN@data$peaks[["consensusPeaks"]])) { |
|
5197 |
+ GRN@data$peaks[["consensusPeaks"]] = NULL |
|
5198 |
+ } |
|
5199 |
+ |
|
5200 |
+ # 2. RNA |
|
5201 |
+ if (!is.null(GRN@data$RNA[["counts_orig"]])) { |
|
5202 |
+ GRN@data$RNA[["counts_orig"]] = NULL |
|
5203 |
+ } |
|
5204 |
+ if (is.null(GRN@data$RNA[["counts"]]) & !is.null(GRN@data$RNA$counts_norm.l[["0"]])) { |
|
5205 |
+ GRN@data$RNA[["counts"]] = .storeAsMatrixOrSparseMatrix(GRN, df = GRN@data$RNA$counts_norm.l[["0"]] %>% dplyr::select(-.data$isFiltered), |
|
5206 |
+ ID_column = "ENSEMBL", slotName = "GRN@data$RNA$counts") |
|
5207 |
+ } |
|
5208 |
+ if (!is.null(GRN@data$RNA[["counts_norm.l"]])) { |
|
5209 |
+ GRN@data$RNA[["counts_norm.l"]] = NULL |
|
5210 |
+ } |
|
5211 |
+ |
|
5212 |
+ if (is.null(GRN@data$RNA[["counts_metadata"]]) & !is.null(GRN@data$RNA$counts)) { |
|
5213 |
+ GRN@data$RNA[["counts_metadata"]] = tibble::tibble(ID = GRN@data$RNA$counts %>% as.data.frame() %>% tibble::rownames_to_column("ENSEMBL"), isFiltered = FALSE) |
|
5214 |
+ } |
|
5215 |
+ |
|
5216 |
+ if (is.null(GRN@data$RNA[["counts_permuted_index"]]) & !is.null(GRN@data$RNA$counts)) { |
|
5217 |
+ GRN@data$RNA[["counts_permuted_index"]] = .shuffleColumns(as.data.frame(GRN@data$RNA$counts), |
|
5218 |
+ .getMaxPermutation(GRN), returnUnshuffled = FALSE, returnAsList = FALSE) |
|
5219 |
+ } |
|
5220 |
+ |
|
5227 | 5221 |
|
5228 | 5222 |
GRN |
5229 | 5223 |
} |
5230 | 5224 |
|
5225 |
+ |
|
5226 |
+ |
|
5231 | 5227 |
|
5232 | 5228 |
\ No newline at end of file |
... | ... |
@@ -511,7 +511,7 @@ |
511 | 511 |
|
512 | 512 |
output.global.TFs.merged = output.global.TFs %>% |
513 | 513 |
dplyr::filter(!!as.name(colnameClassificationCur) != "not-expressed") %>% |
514 |
- dplyr::full_join(TF.specific, by = c( "TF" = "HOCOID")) %>% |
|
514 |
+ dplyr::full_join(TF.specific, by = c( "TF" = "TF.HOCOID")) %>% |
|
515 | 515 |
dplyr::mutate(baseMeanNorm = (.data$baseMean - min(.data$baseMean, na.rm = TRUE)) / (max(.data$baseMean, na.rm = TRUE) - min(.data$baseMean, na.rm = TRUE)) + par.l$minPointSize) %>% |
516 | 516 |
dplyr::filter(!is.na(!!as.name(colnameClassificationCur))) |
517 | 517 |
|
... | ... |
@@ -640,22 +640,22 @@ |
640 | 640 |
futile.logger::flog.info(paste0("Plotting AR heatmap", dplyr::if_else(is.null(file), "", paste0(" to file ", file)))) |
641 | 641 |
|
642 | 642 |
|
643 |
- missingGenes = which(!HOCOMOCO_mapping.df.exp$HOCOID %in% colnames(sort.cor.m)) |
|
643 |
+ missingGenes = which(!HOCOMOCO_mapping.df.exp$TF.HOCOID %in% colnames(sort.cor.m)) |
|
644 | 644 |
if (length(missingGenes) > 0) { |
645 |
- HOCOMOCO_mapping.df.exp = dplyr::filter(HOCOMOCO_mapping.df.exp, .data$HOCOID %in% colnames(sort.cor.m)) |
|
645 |
+ HOCOMOCO_mapping.df.exp = dplyr::filter(HOCOMOCO_mapping.df.exp, .data$TF.HOCOID %in% colnames(sort.cor.m)) |
|
646 | 646 |
} |
647 | 647 |
|
648 | 648 |
if (!is.null(file)) { |
649 | 649 |
grDevices::pdf(file, ...) |
650 | 650 |
} |
651 | 651 |
|
652 |
- cor.r.filt.m <- sort.cor.m[,as.character(HOCOMOCO_mapping.df.exp$HOCOID)] |
|
652 |
+ cor.r.filt.m <- sort.cor.m[,as.character(HOCOMOCO_mapping.df.exp$TF.HOCOID)] |
|
653 | 653 |
|
654 |
- stopifnot(identical(colnames( cor.r.filt.m), as.character(HOCOMOCO_mapping.df.exp$HOCOID))) |
|
654 |
+ stopifnot(identical(colnames( cor.r.filt.m), as.character(HOCOMOCO_mapping.df.exp$TF.HOCOID))) |
|
655 | 655 |
|
656 | 656 |
BREAKS = seq(-1,1,0.05) |
657 | 657 |
diffDensityMat = matrix(NA, nrow = ncol( cor.r.filt.m), ncol = length(BREAKS) - 1) |
658 |
- rownames(diffDensityMat) = HOCOMOCO_mapping.df.exp$HOCOID |
|
658 |
+ rownames(diffDensityMat) = HOCOMOCO_mapping.df.exp$TF.HOCOID |
|
659 | 659 |
|
660 | 660 |
TF_Peak_all.m <- TF.peakMatrix.df |
661 | 661 |
TF_Peak.m <- TF_Peak_all.m |
... | ... |
@@ -675,7 +675,7 @@ |
675 | 675 |
|
676 | 676 |
## check to what extent the number of TF motifs affects the density values |
677 | 677 |
n_min = dplyr::if_else(colSums(TF_Peak.m) < nrow(TF_Peak.m),colSums(TF_Peak.m), nrow(TF_Peak.m) - colSums(TF_Peak.m)) |
678 |
- names(n_min) = HOCOMOCO_mapping.df.exp$HOCOID#[match(names(n_min), as.character(tf2ensg$ENSEMBL))] |
|
678 |
+ names(n_min) = HOCOMOCO_mapping.df.exp$TF.HOCOID#[match(names(n_min), as.character(tf2ensg$ENSEMBL))] |
|
679 | 679 |
n_min <- sapply(split(n_min,names(n_min)),sum) |
680 | 680 |
|
681 | 681 |
# Make sure n_min and diffDenityMat are compatible because some NA rows may have been filtered out for diffDensityMat |
... | ... |
@@ -336,7 +336,7 @@ |
336 | 336 |
hashesStrError = paste0(paste0(rep("#", nchar(lastPartError) - 1), collapse = ""), "\n") |
337 | 337 |
messageError = paste0(objectPart, checkResult, "\n\n", hashesStrError, lastPartError, hashesStrError) |
338 | 338 |
|
339 |
- lastPartWarning = ". \nThis warning may or may not be ignored. Carefully check its significance and whether it may affect the results." |
|
339 |
+ lastPartWarning = ". \nThis warning may or may not be ignored. Carefully check its significance and whether it may affect the results.\n" |
|
340 | 340 |
#hashesStrWarning = paste0(paste0(rep("#", nchar(lastPartWarning) - 1), collapse = ""), "\n") |
341 | 341 |
messageWarning = paste0(objectPart, checkResult, lastPartWarning) # , hashesStrWarning) |
342 | 342 |
|
... | ... |
@@ -293,13 +293,13 @@ performAllNetworkAnalyses <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
293 | 293 |
|
294 | 294 |
} else if (type == "all_RNA") { |
295 | 295 |
|
296 |
- backgroundGenes = GRN@data$RNA$counts_norm.l[["0"]] %>% |
|
296 |
+ backgroundGenes = GRN@data$RNA$counts_metadata %>% |
|
297 | 297 |
dplyr::pull(.data$ENSEMBL) |
298 | 298 |
|
299 | 299 |
|
300 | 300 |
} else if (type == "all_RNA_filtered") { |
301 | 301 |
|
302 |
- backgroundGenes = GRN@data$RNA$counts_norm.l[["0"]] %>% |
|
302 |
+ backgroundGenes = GRN@data$RNA$counts_metadata %>% |
|
303 | 303 |
dplyr::filter(!.data$isFiltered) %>% |
304 | 304 |
dplyr::pull(.data$ENSEMBL) |
305 | 305 |
|
... | ... |
@@ -381,13 +381,13 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
381 | 381 |
checkmate::assertChoice(pAdjustMethod, c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")) |
382 | 382 |
checkmate::assertFlag(forceRerun) |
383 | 383 |
|
384 |
- |
|
384 |
+ futile.logger::flog.info(paste0("Calculating general enrichment. This may take a while")) |
|
385 | 385 |
|
386 | 386 |
|
387 | 387 |
mapping = .getGenomeObject(GRN@config$parameters$genomeAssembly, type = "packageName") |
388 | 388 |
backgroundGenes = .getBackgroundGenes(GRN, type = background, gene.types = background_geneTypes) |
389 | 389 |
|
390 |
- futile.logger::flog.info(paste0("Calculating general enrichment statistics. This may take a while")) |
|
390 |
+ |
|
391 | 391 |
|
392 | 392 |
if (is.null(GRN@stats[["Enrichment"]][["general"]])) { |
393 | 393 |
GRN@stats[["Enrichment"]][["general"]] = list() |
... | ... |
@@ -480,7 +480,7 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
480 | 480 |
|
481 | 481 |
} |
482 | 482 |
|
483 |
-.checkEnrichmentCongruence_general_community <-function(GRN, type = "community") { |
|
483 |
+.checkEnrichmentCongruence_general <-function(GRN, type = "community") { |
|
484 | 484 |
|
485 | 485 |
allOntologiesGeneral = sort(names(GRN@stats$Enrichment$general)) |
486 | 486 |
|
... | ... |
@@ -491,14 +491,21 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
491 | 491 |
} |
492 | 492 |
|
493 | 493 |
if (!identical(allOntologiesGeneral, allOntologiesGroup1)) { |
494 |
- message = paste0("General enrichment and ", type, " enrichment do not have the same ontologies precalculated (", |
|
495 |
- paste0(allOntologiesGeneral, collapse = " & "), " vs. ", paste0(allOntologiesGroup1, collapse = "&"), ")") |
|
496 |
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE) |
|
494 |
+ message = paste0("General enrichment and ", type, " enrichment do not have the same ontologies precalculated (\"", |
|
495 |
+ paste0(allOntologiesGeneral, collapse = " & "), "\" vs. \"", paste0(allOntologiesGroup1, collapse = "&"), "\").", |
|
496 |
+ "Run the function calculateGeneralEnrichment to eliminate this warning") |
|
497 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) |
|
497 | 498 |
} |
498 |
- allOntologiesGeneral |
|
499 |
+ |
|
500 |
+ res.l = list(allOntologiesGroup1, allOntologiesGeneral) |
|
501 |
+ names(res.l) = c(type, "all") |
|
502 |
+ |
|
503 |
+ res.l |
|
499 | 504 |
|
500 | 505 |
} |
501 | 506 |
|
507 |
+ |
|
508 |
+ |
|
502 | 509 |
#' @importFrom biomaRt useEnsembl getBM |
503 | 510 |
.runEnrichment <- function(GRN, foreground, background, backgroundStr, ontology, |
504 | 511 |
description = "Enrichment Analysis", |
... | ... |
@@ -584,6 +591,19 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
584 | 591 |
|
585 | 592 |
|
586 | 593 |
if (ontology %in% c("GO_BP","GO_MF","GO_CC")){ |
594 |
+ |
|
595 |
+ # go_enrichment = |
|
596 |
+ # clusterProfiler::enrichGO( |
|
597 |
+ # gene = foreground_entrez, |
|
598 |
+ # OrgDb = 'org.Hs.eg.db', |
|
599 |
+ # ont = sub("GO_", "", ontology), |
|
600 |
+ # universe = background_entrez, |
|
601 |
+ # keyType = "ENTREZID", |
|
602 |
+ # pvalueCutoff = 1, |
|
603 |
+ # qvalueCutoff = 1, |
|
604 |
+ # minGSSize = minGSSize, |
|
605 |
+ # maxGSSize = maxGSSize, |
|
606 |
+ # pAdjustMethod = pAdjustMethod) |
|
587 | 607 |
|
588 | 608 |
go_enrichment = suppressMessages(new("topGOdata", |
589 | 609 |
ontology = gsub("GO_", "", ontology), |
... | ... |
@@ -594,12 +614,23 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
594 | 614 |
mapping = mapping, |
595 | 615 |
ID = "ensembl")) |
596 | 616 |
|
617 |
+ # retrieve genes2GO list from the "expanded" annotation in GOdata |
|
618 |
+ allGO = topGO::genesInTerm(go_enrichment) |
|
619 |
+ allGO_inForeground = lapply(allGO, function(x) paste0(x[x %in% foreground], collapse = ",")) %>% |
|
620 |
+ as.data.frame() %>% t() |
|
621 |
+ |
|
622 |
+ allGO_inForeground.df = tibble::tibble(ID = rownames(allGO_inForeground), gene.ENSEMBL_foreground = allGO_inForeground[, 1]) %>% |
|
623 |
+ dplyr::mutate(ID = sub(".", ":", .data$ID, fixed = TRUE)) |
|
624 |
+ |
|
625 |
+ |
|
626 |
+ |
|
597 | 627 |
result = suppressMessages(topGO::runTest(go_enrichment, algorithm = algorithm, statistic = statistic)) |
598 | 628 |
# Dont trim GO terms here, happens later when plotting |
599 | 629 |
result.tbl = unique(topGO::GenTable(go_enrichment, pval = result, orderBy = "pval", numChar = 1000, |
600 | 630 |
topNodes = length(topGO::score(result))) ) %>% |
601 | 631 |
dplyr::rename(ID = .data$GO.ID, Found = .data$Significant) %>% # make it more clear what Significant refers to here |
602 |
- dplyr::mutate(GeneRatio = .data$Found / nForeground) |
|
632 |
+ dplyr::mutate(GeneRatio = .data$Found / nForeground) %>% |
|
633 |
+ dplyr::left_join(allGO_inForeground.df, by = "ID") |
|
603 | 634 |
|
604 | 635 |
|
605 | 636 |
result.list[["results"]] = result.tbl |
... | ... |
@@ -727,8 +758,10 @@ calculateGeneralEnrichment <- function(GRN, ontology = c("GO_BP", "GO_MF"), |
727 | 758 |
if (!is.null(enrichmentObj)) { |
728 | 759 |
|
729 | 760 |
result.tbl = enrichmentObj@result %>% |
730 |
- dplyr::rename(Term = .data$Description, pval = .data$pvalue, Found = .data$Count) %>% |
|
731 |
- dplyr::mutate(GeneRatio = sapply(parse (text = enrichmentObj@result$GeneRatio), eval)) |
|
761 |
+ dplyr::rename(Term = .data$Description, pval = .data$pvalue, Found = .data$Count) |
|
762 |
+ |
|
763 |
+ # GeneRatio and BgRatio are reported as fractions like 5/83, leave it like this for now |
|
764 |
+ # %>% dplyr::mutate(GeneRatio = sapply(parse (text = enrichmentObj@result$GeneRatio), eval)) |
|
732 | 765 |
|
733 | 766 |
} else { |
734 | 767 |
|
... | ... |
@@ -1153,6 +1186,8 @@ calculateTFEnrichment <- function(GRN, rankType = "degree", n = 3, TF.names = NU |
1153 | 1186 |
checkmate::assertChoice(pAdjustMethod, c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")) |
1154 | 1187 |
checkmate::assertFlag(forceRerun) |
1155 | 1188 |
|
1189 |
+ futile.logger::flog.info(paste0("Calculating TF enrichment. This may take a while")) |
|
1190 |
+ |
|
1156 | 1191 |
if (rankType == "custom"){ |
1157 | 1192 |
if(is.null(TF.names)){ |
1158 | 1193 |
futile.logger::flog.error("To calculate the GO enrichment for a custom set of TFs, you must provide the TF names in the 'TF.names' parameter.") |
1159 | 1194 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,75 @@ |
1 |
+[2022-08-24 10:29:01] [INFO] [OmnipathR] Initialized cache: `/home/christian/.cache/OmnipathR`. |
|
2 |
+[2022-08-24 10:29:01] [INFO] [OmnipathR] Welcome to OmnipathR! |
|
3 |
+[2022-08-24 10:29:05] [INFO] [OmnipathR] Cache record does not exist: `https://omnipathdb.org/annotations?resources=PROGENy&license=academic` |
|
4 |
+[2022-08-24 10:29:05] [INFO] [OmnipathR] Retrieving URL: `https://omnipathdb.org/annotations?resources=PROGENy&license=academic` |
|
5 |
+[2022-08-24 10:29:05] [TRACE] [OmnipathR] Attempt 1/3: `https://omnipathdb.org/annotations?resources=PROGENy&license=academic` |
|
6 |
+[2022-08-24 10:29:09] [INFO] [OmnipathR] Cache item `3492392585ba05414edfae46801492076437d7e7` version 1: status changed from `unknown` to `started`. |
|
7 |
+[2022-08-24 10:29:09] [TRACE] [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/3492392585ba05414edfae46801492076437d7e7-1.rds`. |
|
8 |
+[2022-08-24 10:29:12] [TRACE] [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/3492392585ba05414edfae46801492076437d7e7-1.rds`. |
|
9 |
+[2022-08-24 10:29:12] [INFO] [OmnipathR] Download ready [key=3492392585ba05414edfae46801492076437d7e7, version=1] |
|
10 |
+[2022-08-24 10:29:12] [INFO] [OmnipathR] Cache item `3492392585ba05414edfae46801492076437d7e7` version 1: status changed from `started` to `ready`. |
|
11 |
+[2022-08-24 10:29:12] [SUCCESS] [OmnipathR] Downloaded 700257 annotation records. |
|
12 |
+[2022-08-24 10:29:14] [INFO] [OmnipathR] Loading database `Ensembl organism names`. |
|
13 |
+[2022-08-24 10:29:14] [INFO] [OmnipathR] Looking up in cache `https://www.ensembl.org/info/about/species.html`: key=7332486db7400730697234bad76ca0c8e4d00799, no version available. |
|
14 |
+[2022-08-24 10:29:14] [INFO] [OmnipathR] Created new version for cache record 7332486db7400730697234bad76ca0c8e4d00799: version 1. |
|
15 |
+[2022-08-24 10:29:14] [TRACE] [OmnipathR] Cache file path: /home/christian/.cache/OmnipathR/7332486db7400730697234bad76ca0c8e4d00799-1.html |
|
16 |
+[2022-08-24 10:29:14] [INFO] [OmnipathR] Retrieving URL: `https://www.ensembl.org/info/about/species.html` |
|
17 |
+[2022-08-24 10:29:14] [TRACE] [OmnipathR] Attempt 1/3: `https://www.ensembl.org/info/about/species.html` |
|
18 |
+[2022-08-24 10:29:14] [TRACE] [OmnipathR] HTTP 200 |
|
19 |
+[2022-08-24 10:29:14] [INFO] [OmnipathR] Download ready [key=7332486db7400730697234bad76ca0c8e4d00799, version=1] |
|
20 |
+[2022-08-24 10:29:14] [INFO] [OmnipathR] Cache item `7332486db7400730697234bad76ca0c8e4d00799` version 1: status changed from `unknown` to `ready`. |
|
21 |
+[2022-08-24 10:29:15] [INFO] [OmnipathR] Loaded database `Ensembl organism names`. |
|
22 |
+[2022-08-24 10:29:15] [TRACE] [OmnipathR] Looking up in cache: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`. |
|
23 |
+[2022-08-24 10:29:15] [INFO] [OmnipathR] Cache record does not exist: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data` |
|
24 |
+[2022-08-24 10:29:15] [TRACE] [OmnipathR] Could not find in cache, initiating download: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data`. |
|
25 |
+[2022-08-24 10:29:15] [INFO] [OmnipathR] Cache item `2ccc1998f0ab26ea421885e1334896d25db4554f` version 1: status changed from `unknown` to `started`. |
|
26 |
+[2022-08-24 10:29:15] [TRACE] [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/2ccc1998f0ab26ea421885e1334896d25db4554f-1.rds`. |
|
27 |
+[2022-08-24 10:29:15] [INFO] [OmnipathR] Retrieving URL: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data` |
|
28 |
+[2022-08-24 10:29:15] [TRACE] [OmnipathR] Attempt 1/3: `https://ftp.ncbi.nih.gov/pub/HomoloGene/current/homologene.data` |
|
29 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/2ccc1998f0ab26ea421885e1334896d25db4554f-1.rds`. |
|
30 |
+[2022-08-24 10:29:28] [INFO] [OmnipathR] Download ready [key=2ccc1998f0ab26ea421885e1334896d25db4554f, version=1] |
|
31 |
+[2022-08-24 10:29:28] [INFO] [OmnipathR] Cache item `2ccc1998f0ab26ea421885e1334896d25db4554f` version 1: status changed from `started` to `ready`. |
|
32 |
+[2022-08-24 10:29:28] [SUCCESS] [OmnipathR] NCBI HomoloGene (ftp.ncbi.nih.gov): downloaded 275237 records |
|
33 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] ID translation table: from `genesymbol` to `uniprot`, using `uniprot`. |
|
34 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Creating ID mapping table from `genes(PREFERRED)` to `id`, for organism 9606 (only reviewed: TRUE) |
|
35 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Loading all UniProt records for organism 9606 (only reviewed: TRUE); fields: genes(PREFERRED),id |
|
36 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Looking up in cache: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`. |
|
37 |
+[2022-08-24 10:29:28] [INFO] [OmnipathR] Cache record does not exist: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no` |
|
38 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Could not find in cache, initiating download: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no`. |
|
39 |
+[2022-08-24 10:29:28] [INFO] [OmnipathR] Cache item `5745c7e161d60f14153a3762e978cf882b30bf01` version 1: status changed from `unknown` to `started`. |
|
40 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/5745c7e161d60f14153a3762e978cf882b30bf01-1.rds`. |
|
41 |
+[2022-08-24 10:29:28] [INFO] [OmnipathR] Retrieving URL: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no` |
|
42 |
+[2022-08-24 10:29:28] [TRACE] [OmnipathR] Attempt 1/3: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:9606%20AND%20reviewed:yes&compress=no` |
|
43 |
+[2022-08-24 10:29:35] [TRACE] [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/5745c7e161d60f14153a3762e978cf882b30bf01-1.rds`. |
|
44 |
+[2022-08-24 10:29:35] [INFO] [OmnipathR] Download ready [key=5745c7e161d60f14153a3762e978cf882b30bf01, version=1] |
|
45 |
+[2022-08-24 10:29:35] [INFO] [OmnipathR] Cache item `5745c7e161d60f14153a3762e978cf882b30bf01` version 1: status changed from `started` to `ready`. |
|
46 |
+[2022-08-24 10:29:35] [SUCCESS] [OmnipathR] UniProt (legacy.uniprot.org): downloaded 20398 records |
|
47 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] ID translation table: from `genesymbol` to `uniprot`, using `uniprot`. |
|
48 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] Creating ID mapping table from `genes(PREFERRED)` to `id`, for organism 10116 (only reviewed: TRUE) |
|
49 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] Loading all UniProt records for organism 10116 (only reviewed: TRUE); fields: genes(PREFERRED),id |
|
50 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] Looking up in cache: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`. |
|
51 |
+[2022-08-24 10:29:36] [INFO] [OmnipathR] Cache record does not exist: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no` |
|
52 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] Could not find in cache, initiating download: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no`. |
|
53 |
+[2022-08-24 10:29:36] [INFO] [OmnipathR] Cache item `bd4863670563624f4aff5469966f0ef06833d425` version 1: status changed from `unknown` to `started`. |
|
54 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/bd4863670563624f4aff5469966f0ef06833d425-1.rds`. |
|
55 |
+[2022-08-24 10:29:36] [INFO] [OmnipathR] Retrieving URL: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no` |
|
56 |
+[2022-08-24 10:29:36] [TRACE] [OmnipathR] Attempt 1/3: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=genes(PREFERRED),id&fil=organism:10116%20AND%20reviewed:yes&compress=no` |
|
57 |
+[2022-08-24 10:29:47] [TRACE] [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/bd4863670563624f4aff5469966f0ef06833d425-1.rds`. |
|
58 |
+[2022-08-24 10:29:47] [INFO] [OmnipathR] Download ready [key=bd4863670563624f4aff5469966f0ef06833d425, version=1] |
|
59 |
+[2022-08-24 10:29:47] [INFO] [OmnipathR] Cache item `bd4863670563624f4aff5469966f0ef06833d425` version 1: status changed from `started` to `ready`. |
|
60 |
+[2022-08-24 10:29:47] [SUCCESS] [OmnipathR] UniProt (legacy.uniprot.org): downloaded 8160 records |
|
61 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] ID translation table: from `uniprot` to `genesymbol`, using `uniprot`. |
|
62 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] Creating ID mapping table from `id` to `genes(PREFERRED)`, for organism 10116 (only reviewed: TRUE) |
|
63 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] Loading all UniProt records for organism 10116 (only reviewed: TRUE); fields: id,genes(PREFERRED) |
|
64 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] Looking up in cache: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`. |
|
65 |
+[2022-08-24 10:30:10] [INFO] [OmnipathR] Cache record does not exist: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no` |
|
66 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] Could not find in cache, initiating download: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no`. |
|
67 |
+[2022-08-24 10:30:10] [INFO] [OmnipathR] Cache item `13e562cc918b5226343ec314c33bee5c6e514e1f` version 1: status changed from `unknown` to `started`. |
|
68 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] Exporting object to RDS: `/home/christian/.cache/OmnipathR/13e562cc918b5226343ec314c33bee5c6e514e1f-1.rds`. |
|
69 |
+[2022-08-24 10:30:10] [INFO] [OmnipathR] Retrieving URL: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no` |
|
70 |
+[2022-08-24 10:30:10] [TRACE] [OmnipathR] Attempt 1/3: `https://legacy.uniprot.org/uniprot/?query=*&format=tab&force=true&columns=id,genes(PREFERRED)&fil=organism:10116%20AND%20reviewed:yes&compress=no` |
|
71 |
+[2022-08-24 10:30:12] [TRACE] [OmnipathR] Exported RDS to `/home/christian/.cache/OmnipathR/13e562cc918b5226343ec314c33bee5c6e514e1f-1.rds`. |
|
72 |
+[2022-08-24 10:30:12] [INFO] [OmnipathR] Download ready [key=13e562cc918b5226343ec314c33bee5c6e514e1f, version=1] |
|
73 |
+[2022-08-24 10:30:12] [INFO] [OmnipathR] Cache item `13e562cc918b5226343ec314c33bee5c6e514e1f` version 1: status changed from `started` to `ready`. |
|
74 |
+[2022-08-24 10:30:12] [SUCCESS] [OmnipathR] UniProt (legacy.uniprot.org): downloaded 8160 records |
|
75 |
+[2022-08-24 13:30:12] [TRACE] [OmnipathR] Removing database `Ensembl organism names` (not used in the past 10800 seconds) |
... | ... |
@@ -4,6 +4,7 @@ utils::globalVariables("where") |
4 | 4 |
|
5 | 5 |
############### PCA ############### |
6 | 6 |
|
7 |
+# TODO: Currently, the "standard PCA workflow with vst transform based on raw counts is not possible anymore, the normalized counts are taken as they are |
|
7 | 8 |
|
8 | 9 |
#' Produce a PCA plot of the data from a \code{\linkS4class{GRN}} object |
9 | 10 |
#' |
... | ... |
@@ -11,7 +12,8 @@ utils::globalVariables("where") |
11 | 12 |
#' @template outputFolder |
12 | 13 |
#' @param data Character. Either \code{"peaks"} or \code{"rna"} or \code{"all"}. Default \code{c("rna", "peaks")}. Type of data to plot a PCA for. \code{"peaks"} corresponds to the the open chromatin data, while \code{"rna"} refers to the RNA-seq counts. If set to \code{"all"}, PCA will be done for both data modalities. In any case, PCA will be based on the original provided data before any additional normalization has been run (i.e., usually the raw data). |
13 | 14 |
#' @param topn Integer vector. Default \code{c(500,1000,5000)}. Number of top variable features to do PCA for. Can be a vector of different numbers (see default). |
14 |
-#' @param type Character. One of or a combination of \code{"raw"}, \code{"normalized"}, \code{"all"}. Default \code{c("raw", "normalized")}. Should the PCA plots be done based on the raw or normalized data, respectively? |
|
15 |
+#' @param type Character. Must be \code{"normalized"}. On which data type (raw or normalized) should the PCA plots be done? We removed support for \code{raw} and currently only support \code{normalized}. |
|
16 |
+#' @param removeFiltered Logical. \code{TRUE} or \code{FALSE}. Default \code{TRUE}. Should features marked as filtered as determined by \code{filterData} be removed? |
|
15 | 17 |
#' @template forceRerun |
16 | 18 |
#' @template basenameOutput |
17 | 19 |
#' @template plotAsPDF |
... | ... |
@@ -22,10 +24,10 @@ utils::globalVariables("where") |
22 | 24 |
#' @examples |
23 | 25 |
#' # See the Workflow vignette on the GRaNIE website for examples |
24 | 26 |
#' GRN = loadExampleObject() |
25 |
-#' GRN = plotPCA_all(GRN, topn = 500, data = "rna", type = "raw", plotAsPDF = FALSE, pages = 1) |
|
27 |
+#' GRN = plotPCA_all(GRN, topn = 500, data = "rna", type = "normalized", plotAsPDF = FALSE, pages = 1) |
|
26 | 28 |
#' @export |
27 | 29 |
plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
28 |
- data = c("rna", "peaks"), topn = c(500,1000,5000), type = c("raw", "normalized"), |
|
30 |
+ data = c("rna", "peaks"), topn = c(500,1000,5000), type = "normalized", removeFiltered = TRUE, |
|
29 | 31 |
plotAsPDF = TRUE, pdf_width = 12, pdf_height = 12, pages = NULL, |
30 | 32 |
forceRerun = FALSE) { |
31 | 33 |
|
... | ... |
@@ -36,7 +38,9 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
36 | 38 |
GRN = .makeObjectCompatible(GRN) |
37 | 39 |
|
38 | 40 |
checkmate::assertSubset(data , c("rna", "peaks", "all"), empty.ok = FALSE) |
39 |
- checkmate::assertSubset(type , c("raw", "normalized", "all"), empty.ok = FALSE) |
|
41 |
+ checkmate::assertChoice(type , c("normalized")) |
|
42 |
+ #checkmate::assertSubset(type , c("raw", "normalized", "all"), empty.ok = FALSE) |
|
43 |
+ checkmate::assertFlag(removeFiltered) |
|
40 | 44 |
checkmate::assertFlag(plotAsPDF) |
41 | 45 |
checkmate::assertNumeric(pdf_width, lower = 5, upper = 99) |
42 | 46 |
checkmate::assertNumeric(pdf_height, lower = 5, upper = 99) |
... | ... |
@@ -56,106 +60,53 @@ plotPCA_all <- function(GRN, outputFolder = NULL, basenameOutput = NULL, |
56 | 60 |
# Initialize the page counter |
57 | 61 |
pageCounter = 1 |
58 | 62 |
|
63 |
+ # For normalized data, we do not do any other transformation currently and plot the normalized data "as is" |
|
64 |
+ # Changed on 08.02.22: We now never do a(nother) log transform, as the input for the density plot is already transformed counts. |
|
65 |
+ transformationCur = "none" |
|
66 |
+ logTransformDensity = FALSE |
|
67 |
+ |
|
59 | 68 |
if ("rna" %in% data | "all" %in% data) { |
60 | 69 |
|
61 |
- # Check if we have raw integer counts. Ust vst transform if, otherwise "regular log2 |
|
62 |
- transformation = dplyr::if_else(checkmate::testClass(GRN@data$RNA$counts_orig, "DESeqDataSet"), "vst", "log2") |
|
63 |
- |
|
64 |
- normVector = c("raw", "normalized") |
|
65 |
- # If data is pre-normalized, dont do any transformation, as "raw" and "normalized" are supposed to be identical |
|
66 |
- if (GRN@config$parameters$normalization_rna == "none") { |
|
67 |
- normVector = c("normalized") |
|
68 |
- } |
|
69 |
- |
|
70 |
- for (norm in normVector) { |
|
71 |
- |
|
72 |
- if (!(norm %in% type | "all" %in% type)) { |
|
73 |
- next |
|
74 |
- } |
|
75 |
- |
|
76 |
- # Only do a transformation if this is raw counts, already normalized counts dont need another transformation |
|
77 |
- transformationCur = dplyr::if_else(norm == "raw", transformation, "none") |
|
78 |
- |
|
79 |
- # Changed on 08.02.22: We now never do a(nother) log transform, as the input for the density plot is already transformed counts. |
|
80 |
- # logTransformDensity = dplyr::if_else(norm == "raw", TRUE, FALSE) |
|
81 |
- logTransformDensity = FALSE |
|
82 |
- |
|
83 |
- # Get filtered rows from normalized version always, this is correct, as isFiltered is missing in raw counts |
|
84 |
- nonFilteredRows = which(! GRN@data$RNA$counts_norm.l[["0"]]$isFiltered) |
|
85 |
- |
|
86 |
- if (norm == "raw") { |
|
87 |
- |
|
88 |
- matrixCur = GRN@data$RNA$counts_orig[nonFilteredRows, uniqueSamples] |
|
89 |
- |
|
90 |
- } else { |
|
91 |
- |
|
92 |
- matrixCur = GRN@data$RNA$counts_norm.l[["0"]][nonFilteredRows, uniqueSamples] %>% as.matrix() |
|
93 |
- rownames(matrixCur) = GRN@data$RNA$counts_norm.l[["0"]]$ENSEMBL[nonFilteredRows] |
|
94 |
- } |
|
95 |
- |
|
96 |
- fileCur = paste0(outputFolder, |
|
97 |
- dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_pca"), basenameOutput), |
|
98 |
- "_RNA.", norm, ".pdf") |
|
99 |
- if (!file.exists(fileCur) | forceRerun) { |
|
70 |
+ matrixCur = getCounts(GRN, type = "rna", asMatrix = TRUE, includeFiltered = !removeFiltered) |
|
71 |
+ |
|
72 |
+ fileCur = paste0(outputFolder, |
|
73 |
+ dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_pca"), basenameOutput), |
|
74 |
+ "_RNA.", type, ".pdf") |
|
75 |
+ if (!file.exists(fileCur) | forceRerun) { |
|
100 | 76 |
|
101 |
- futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", norm, |
|
77 |
+ futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", type, |
|
102 | 78 |
" RNA data for all shared samples to file ", fileCur , |
103 | 79 |
"... This may take a few minutes")) |
104 | 80 |
|
105 | 81 |
.plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, plotAsPDF = plotAsPDF) |
106 | 82 |
|
107 |
- } else { |
|
83 |
+ } else { |
|
108 | 84 |
futile.logger::flog.info(paste0("File ", fileCur, " already exists, not overwriting due to forceRerun = FALSE")) |
109 | 85 |
|
110 |
- } |
|
111 | 86 |
} |
112 | 87 |
|
88 |
+ |
|
113 | 89 |
} # end if plot RNA PCA |
114 | 90 |
|
115 | 91 |
if ("peaks" %in% data | "all" %in% data) { |
116 | 92 |
|
117 |
- # Check if we have raw integer counts |
|
118 |
- transformation = dplyr::if_else(checkmate::testClass(GRN@data$peaks$counts_orig, "DESeqDataSet"), "vst", "log2") |
|
93 |
+ matrixCur = getCounts(GRN, type = "peaks", asMatrix = TRUE, includeFiltered = !removeFiltered) |
|
119 | 94 |
|
120 |
- normVector = c("raw", "normalized") |
|
121 |
- # If data is pre-normalized, dont do any transformation, as "raw" and "normalized" are supposed to be identical |
|
122 |
- if (GRN@config$parameters$normalization_peaks == "none") { |
|
123 |
- normVector = c("normalized") |
|
124 |
- } |
|
95 |
+ fileCur = paste0(outputFolder, dplyr::if_else(is.null(basenameOutput), |
|
96 |
+ .getOutputFileName("plot_pca"), basenameOutput), "_peaks.", type, ".pdf") |
|
125 | 97 |
|
126 |
- for (norm in normVector) { |
|
127 |
- |
|
128 |
- transformationCur = dplyr::if_else(norm == "raw", transformation, "none") |
|
129 |
- logTransformDensity = dplyr::if_else(norm == "raw", TRUE, FALSE) |
|
130 |
- |
|
131 |
- nonFilteredRows = which(! GRN@data$peaks$counts_norm$isFiltered) |
|
132 |
- |
|
133 |
- if (norm == "raw") { |
|
134 |
- |
|
135 |
- matrixCur = GRN@data$peaks$counts_orig[nonFilteredRows, uniqueSamples] |
|
98 |
+ if (!file.exists(fileCur) | forceRerun) { |
|
136 | 99 |
|
137 |
- } else { |
|
138 |
- |
|
139 |
- matrixCur = GRN@data$peaks$counts_norm[nonFilteredRows, uniqueSamples] %>% as.matrix() |
|
140 |
- rownames(matrixCur) = GRN@data$peaks$counts_norm$peakID[nonFilteredRows] |
|
141 |
- } |
|
142 |
- |
|
143 |
- fileCur = paste0(outputFolder, dplyr::if_else(is.null(basenameOutput), .getOutputFileName("plot_pca"), basenameOutput), "_peaks.", norm, ".pdf") |
|
144 |
- |
|
145 |
- if (!file.exists(fileCur) | forceRerun) { |
|
146 |
- |
|
147 |
- futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", norm, |
|
100 |
+ futile.logger::flog.info(paste0("Plotting PCA and metadata correlation of ", type, |
|
148 | 101 |
" peaks data for all shared samples to file ", fileCur , |
149 | 102 |
"... This may take a few minutes")) |
150 | 103 |
|
151 | 104 |
.plot_PCA_wrapper(GRN, matrixCur, transformation = transformationCur, logTransformDensity = logTransformDensity, file = fileCur, topn = topn, pdf_width = pdf_width, pdf_height = pdf_height, pages = pages, plotAsPDF = plotAsPDF) |
152 | 105 |
|
153 |
- } else { |
|
106 |
+ } else { |
|
154 | 107 |
futile.logger::flog.info(paste0("File ", fileCur, " already exists, not overwriting due to forceRerun = FALSE")) |
155 | 108 |
|
156 |
- } |
|
157 |
- |
|
158 |
- } # end for raw and norm |
|
109 |
+ } |
|
159 | 110 |
|
160 | 111 |
} |
161 | 112 |
|
... | ... |
@@ -934,7 +885,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
934 | 885 |
|
935 | 886 |
|
936 | 887 |
# For compatibility reasons, re-create the genes and peaks annotation if not present in the object |
937 |
- if (! "peak.annotation" %in% colnames(GRN@annotation$consensusPeaks)) { |
|
888 |
+ if (! "peak.annotation" %in% colnames(GRN@annotation$peaks)) { |
|
938 | 889 |
GRN = .populatePeakAnnotation(GRN) |
939 | 890 |
} |
940 | 891 |
if (! "gene.CV" %in% colnames(GRN@annotation$genes)) { |
... | ... |
@@ -1038,7 +989,7 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1038 |