@@ -16,10 +16,10 @@
 initializeGRN <- function(objectMetadata = list(),
                           outputFolder,
                           genomeAssembly) {
-
+
     checkmate::assert(checkmate::checkNull(objectMetadata), checkmate::checkList(objectMetadata))
     checkmate::assertSubset(genomeAssembly, c("hg19","hg38", "mm10"))
-
+
     # Create the folder first if not yet existing
     if (!dir.exists(outputFolder)) {
         dir.create(outputFolder)
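For orientation, a minimal usage sketch of the entry point touched above. The folder name and metadata values are illustrative assumptions; only the argument names and the allowed genome assemblies (hg19, hg38, mm10) come from the signature and checkmate assertions shown in this hunk:

    # Hypothetical example values; only the interface is taken from the code above
    GRN = initializeGRN(objectMetadata = list(name = "example_run"),
                        outputFolder = "output",
                        genomeAssembly = "hg38")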
@@ -44,7 +44,7 @@ initializeGRN <- function(objectMetadata = list(),
     GRN = .addFunctionLogToObject(GRN)

     GRN@config$isFiltered = FALSE
-
+
     par.l = list()

     packageName = utils::packageName()
@@ -80,7 +80,7 @@ initializeGRN <- function(objectMetadata = list(),
     GRN@config$directories$outputRoot = outputFolder
     GRN@config$directories$output_plots = dir_output_plots
     GRN@config$files$output_log = paste0(outputFolder, "GRN.log")
-
+
     .testExistanceAndCreateDirectoriesRecursively(c(outputFolder, dir_output_plots))

     checkmate::assertDirectory(outputFolder, access = "w")
@@ -125,7 +125,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
                     counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
                     allowOverlappingPeaks= FALSE,
                     forceRerun = FALSE) {
-
+
     GRN = .addFunctionLogToObject(GRN)

     checkmate::assertClass(GRN, "GRN")
@@ -142,7 +142,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
                     forceRerun) {

    # Store raw peaks counts as DESeq object
-
+
    #GRN@data$peaks$counts_raw = counts_peaks %>% dplyr::mutate(isFiltered = FALSE)

    # Normalize ID column names
@@ -152,31 +152,31 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
    if (idColumn_RNA != "ENSEMBL") {
        counts_rna = dplyr::rename(counts_rna, ENSEMBL = !!(idColumn_RNA))
    }
-
+
    # Check ID columns for missing values and remove
    rna_missing_ID = which(is.na(counts_rna$ENSEMBL))
    if (length(rna_missing_ID) > 0) {
-        message = paste0(" Found ", length(rna_missing_ID), " missing IDs in the ID column of the RNA counts. These rows will be removed.")
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
-        counts_rna = dplyr::slice(counts_rna, -rna_missing_ID)
+        message = paste0(" Found ", length(rna_missing_ID), " missing IDs in the ID column of the RNA counts. These rows will be removed.")
+        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
+        counts_rna = dplyr::slice(counts_rna, -rna_missing_ID)
    }

    peaks_missing_ID = which(is.na(counts_peaks$peakID))
    if (length(peaks_missing_ID) > 0) {
-        message = paste0(" Found ", length(peaks_missing_ID), " missing IDs in the ID column of the peaks counts. These rows will be removed.")
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
-        counts_peaks = dplyr::slice(counts_peaks, -peaks_missing_ID)
+        message = paste0(" Found ", length(peaks_missing_ID), " missing IDs in the ID column of the peaks counts. These rows will be removed.")
+        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
+        counts_peaks = dplyr::slice(counts_peaks, -peaks_missing_ID)
    }

-
+
    # Remove potential scientific notation from peak IDs
    peaks_eNotation = which(grepl("e+", counts_peaks$peakID))
    if (length(peaks_eNotation) > 0) {
-        message = paste0("Found at least one peak (", paste0(counts_peaks$peakID[peaks_eNotation], collapse = ",") , ") for which the position contains the scientific notation, attempting to fix.")
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
-        counts_peaks$peakID[peaks_eNotation] = .removeScientificNotation_positions(counts_peaks$peakID[peaks_eNotation])
-
-
+        message = paste0("Found at least one peak (", paste0(counts_peaks$peakID[peaks_eNotation], collapse = ",") , ") for which the position contains the scientific notation, attempting to fix.")
+        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
+        counts_peaks$peakID[peaks_eNotation] = .removeScientificNotation_positions(counts_peaks$peakID[peaks_eNotation])
+
+
    }

    # Clean Ensembl IDs
@@ -189,10 +189,10 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
        counts_rna = counts_rna %>%
            dplyr::group_by(ENSEMBL) %>%
            dplyr::summarise_if(is.numeric, sum)
-            # dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5
+            # dplyr::summarise_if(is.numeric, sum, .groups = 'drop') # the .drop caused an error with dplyr 1.0.5
    }

-
+
    # Normalize counts
    countsPeaks.norm.df = .normalizeCounts(counts_peaks, method = normalization_peaks, idColumn = "peakID")
    countsRNA.norm.df = .normalizeCounts(counts_rna, method = normalization_rna, idColumn = "ENSEMBL")
@@ -219,20 +219,20 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
    # Create permutations for RNA
    futile.logger::flog.info(paste0( "Produce ", .getMaxPermutation(GRN), " permutations of RNA-counts"))
    GRN@data$RNA$counts_norm.l = .shuffleColumns(countsRNA.norm.df, .getMaxPermutation(GRN), returnUnshuffled = TRUE, returnAsList = TRUE)
-
+
    if (!is.null(sampleMetadata)) {
-
+
        futile.logger::flog.info("Parsing provided metadata...")
        GRN@data$metadata = sampleMetadata %>% dplyr::distinct() %>% tibble::set_tidy_names(syntactic = TRUE, quiet = TRUE)

        # Force the first column to be the ID column
        if ("sampleID" %in% colnames(GRN@data$metadata)) {
-            futile.logger::flog.warn("Renaming the first column to sampleID. However, this column already exists, it will be renamed accordingly.")
-            colnames(GRN@data$metadata)[which(colnames(GRN@data$metadata) == "sampleID")] = "sampleID_original"
-
+            futile.logger::flog.warn("Renaming the first column to sampleID. However, this column already exists, it will be renamed accordingly.")
+            colnames(GRN@data$metadata)[which(colnames(GRN@data$metadata) == "sampleID")] = "sampleID_original"
+
        }
        colnames(GRN@data$metadata)[1] = "sampleID"
-
+
        # Assume the ID is in column 1, has to be unique
        if (nrow(GRN@data$metadata) > length(unique(GRN@data$metadata$sampleID))) {
            message = paste0("The first column in the sample metadata table must contain only unique values, as it is used as sample ID. Make sure the values are unique.")
@@ -252,8 +252,8 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
        dplyr::mutate(has_RNA = sampleID %in% samples_rna,
                      has_peaks = sampleID %in% samples_peaks,
                      has_both = has_RNA & has_peaks
-        )
-
+        )
+
    GRN@config$sharedSamples = dplyr::filter(GRN@data$metadata, has_both) %>% dplyr::pull(sampleID) %>% as.character()

    counts_peaks = as.data.frame(counts_peaks)
@@ -264,11 +264,11 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",


    # Store the raw peaks data efficiently as DESeq object only if it contains only integers, otherwise store as normal matrix
-
+
    if (isIntegerMatrix(counts_peaks[, GRN@config$sharedSamples])) {
        GRN@data$peaks$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_peaks[, GRN@config$sharedSamples],
-                                                                    colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
-                                                                    design = ~1)
+                                                                    colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
+                                                                    design = ~1)
    } else {

        GRN@data$peaks$counts_orig = as.matrix(counts_peaks[, GRN@config$sharedSamples])
@@ -276,14 +276,14 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",

    if (isIntegerMatrix(counts_rna[, GRN@config$sharedSamples])) {
        GRN@data$RNA$counts_orig = DESeq2::DESeqDataSetFromMatrix(counts_rna[, GRN@config$sharedSamples],
-                                                                  colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
-                                                                  design = ~1)
+                                                                  colData = dplyr::filter(GRN@data$metadata, has_both) %>% tibble::column_to_rownames("sampleID"),
+                                                                  design = ~1)
    } else {
-        GRN@data$RNA$counts_orig = as.matrix(counts_rna[, GRN@config$sharedSamples])
+        GRN@data$RNA$counts_orig = as.matrix(counts_rna[, GRN@config$sharedSamples])
    }

-
-
+
+
    # Consensus peaks
    GRN@data$peaks$consensusPeaks = .createConsensusPeaksDF(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE))
    GRN@data$peaks$consensusPeaks = GRN@data$peaks$consensusPeaks[match(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)$peakID, GRN@data$peaks$consensusPeaks$peakID), ]
@@ -294,29 +294,29 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
    # Assume 0-based exclusive format, see https://arnaudceol.wordpress.com/2014/09/18/chromosome-coordinate-systems-0-based-1-based/ and http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for details
    consensus.gr = .constructGRanges(GRN@data$peaks$consensusPeaks, seqlengths = .getChrLengths(GRN@config$parameters$genomeAssembly), GRN@config$parameters$genomeAssembly, zeroBased = TRUE)

-
+

    overlappingPeaks = which(GenomicRanges::countOverlaps(consensus.gr ,consensus.gr) >1)

    if (length(overlappingPeaks) > 0){
+
+        ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID
+
+        messageAll = paste0(" ", length(overlappingPeaks),
+                            " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","),
+                            ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. "
+        )
+
+
+        if (allowOverlappingPeaks) {

-        ids = (consensus.gr[overlappingPeaks] %>% as.data.frame())$peakID
-
-        messageAll = paste0(" ", length(overlappingPeaks),
-                            " overlapping peaks have been identified. The first ten are: ", paste0(ids[seq_len(min(10, length(ids)))], collapse = ","),
-                            ". This may not be what you want, since overlapping peaks may have a heigher weight in the network. "
-        )
-
-
-        if (allowOverlappingPeaks) {
-
-            message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.")
-            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
-        } else {
-            message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function")
-            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
-        }
-
+            message = paste0(messageAll, "As allowOverlappingPeaks has been set to TRUE, this is only a warning and not an error.")
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
+        } else {
+            message = paste0(messageAll, "As allowOverlappingPeaks = FALSE (the default), this is an error and not a warning. You may want to regenerate the peak file, eliminate peak overlaps, and rerun this function")
+            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
+        }
+
    }

    }
@@ -326,7 +326,7 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",

    # Add gene annotation once
    GRN = .populateGeneAnnotation(GRN)
-
+
    GRN

}
@@ -358,14 +358,14 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
}

.removeScientificNotation_positions <- function(peakIDs.vec) {
-    ids = strsplit(peakIDs.vec, split = ":", fixed = TRUE)
-    ids_chr = sapply(ids, "[[", 1)
-    ids_pos = sapply(ids, "[[", 2)
-    ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE)
-    start = sapply(ids_pos, "[[", 1)
-    end = sapply(ids_pos, "[[", 2)
-
-    paste0(ids_chr, ":", format(as.integer(start), scientific = FALSE), "-", format(as.integer(end), scientific= FALSE))
+    ids = strsplit(peakIDs.vec, split = ":", fixed = TRUE)
+    ids_chr = sapply(ids, "[[", 1)
+    ids_pos = sapply(ids, "[[", 2)
+    ids_pos = strsplit(ids_pos, split = "-", fixed = TRUE)
+    start = sapply(ids_pos, "[[", 1)
+    end = sapply(ids_pos, "[[", 2)
+
+    paste0(ids_chr, ":", format(as.integer(start), scientific = FALSE), "-", format(as.integer(end), scientific= FALSE))
}


@@ -412,90 +412,90 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
}

.populatePeakAnnotation <- function (GRN) {
+
+    countsPeaks.clean = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
+
+    futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)"))
+
+    countsPeaks.m = as.matrix(dplyr::select(countsPeaks.clean, -peakID))
+
+    rowMeans_peaks = rowMeans(countsPeaks.m)
+    rowMedians_peaks = matrixStats::rowMedians(countsPeaks.m)
+    CV_peaks = matrixStats::rowSds(countsPeaks.m) / rowMeans_peaks
+
+    metadata_peaks = tibble::tibble(peak.ID = countsPeaks.clean$peakID,
+                                    peak.mean = rowMeans_peaks,
+                                    peak.median = rowMedians_peaks,
+                                    peak.CV = CV_peaks)
+
+    GRN@annotation$consensusPeaks = metadata_peaks
+
+    if (!is.installed("ChIPseeker")) {
+        message = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ",
+                         " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
+        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
+    } else {

-    countsPeaks.clean = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)
-
-    futile.logger::flog.info(paste0(" Calculate statistics for each peak (mean and CV)"))
-
-    countsPeaks.m = as.matrix(dplyr::select(countsPeaks.clean, -peakID))
-
-    rowMeans_peaks = rowMeans(countsPeaks.m)
-    rowMedians_peaks = matrixStats::rowMedians(countsPeaks.m)
-    CV_peaks = matrixStats::rowSds(countsPeaks.m) / rowMeans_peaks
-
-    metadata_peaks = tibble::tibble(peak.ID = countsPeaks.clean$peakID,
-                                    peak.mean = rowMeans_peaks,
-                                    peak.median = rowMedians_peaks,
-                                    peak.CV = CV_peaks)
-
-    GRN@annotation$consensusPeaks = metadata_peaks
-
-    if (!is.installed("ChIPseeker")) {
-        message = paste0("The package ChIPseeker is currently not installed, which is needed for additional peak annotation that can be useful for further downstream analyses. ",
-                         " You may want to install it and re-run this function. However, this is optional and except for some missing additional annotation columns, there is no limitation.")
-        .checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
-    } else {
-
-        futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))
-        genomeAssembly = GRN@config$parameters$genomeAssembly
-        consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
-        consensusPeaks.gr = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
-
-        # Add ChIPSeeker anotation
-        peaks.annotated = suppressMessages(ChIPseeker::annotatePeak(
-            consensusPeaks.gr,
-            tssRegion = c(-5000, 5000), # extended from 3kb to 5
-            TxDb = .getGenomeObject(genomeAssembly, type = "txbd"),
-            level = "gene",
-            assignGenomicAnnotation = TRUE, # the default
-            genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
-                                          "Downstream", "Intergenic"), # the default
-            annoDb = .getGenomeObject(genomeAssembly, type = "packageName"), # optional, if provided, extra columns including SYMBOL, GENENAME, ENSEMBL/ENTREZID will be added
-            sameStrand = FALSE, # the default
-            ignoreOverlap = FALSE, # the default
-            ignoreUpstream = FALSE, # the default
-            ignoreDownstream = FALSE, # the default
-            overlap = "TSS", # the default
-            verbose = TRUE # the default
-        ))
-
-        GRN@annotation$consensusPeaks_obj = peaks.annotated
-
-        peaks.annotated.df = as.data.frame(peaks.annotated)
-        peaks.annotated.df$annotation[grepl("Exon", peaks.annotated.df$annotation)] = "Exon"
-        peaks.annotated.df$annotation[grepl("Intron", peaks.annotated.df$annotation)] = "Intron"
-
-        GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks,
-                                                         peaks.annotated.df %>%
-                                                             dplyr::select(peakID, annotation, tidyselect::starts_with("gene"), -geneId, distanceToTSS, ENSEMBL, SYMBOL, GENENAME) %>%
-                                                             dplyr::mutate(annotation = as.factor(annotation),
-                                                                           ENSEMBL = as.factor(ENSEMBL),
-                                                                           GENENAME = as.factor(GENENAME),
-                                                                           SYMBOL = as.factor(SYMBOL)),
-                                                         by = c("peak.ID" = "peakID")) %>%
-            dplyr::rename(peak.gene.chr = geneChr,
-                          peak.gene.start = geneStart,
-                          peak.gene.end = geneEnd,
-                          peak.gene.length = geneLength,
-                          peak.gene.strand = geneStrand,
-                          peak.gene.name = GENENAME,
-                          peak.gene.distanceToTSS = distanceToTSS,
-                          peak.gene.ENSEMBL = ENSEMBL,
-                          peak.gene.symbol = SYMBOL,
-                          peak.annotation = annotation
-            )
-
-
-    }
-
-
-
-
-    # Also add GC content as annotation columns
-    GRN = .calcGCContentPeaks(GRN)
+        futile.logger::flog.info(paste0(" Retrieve peak annotation using ChipSeeker. This may take a while"))
+        genomeAssembly = GRN@config$parameters$genomeAssembly
+        consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
+        consensusPeaks.gr = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
+
+        # Add ChIPSeeker anotation
+        peaks.annotated = suppressMessages(ChIPseeker::annotatePeak(
+            consensusPeaks.gr,
+            tssRegion = c(-5000, 5000), # extended from 3kb to 5
+            TxDb = .getGenomeObject(genomeAssembly, type = "txbd"),
+            level = "gene",
+            assignGenomicAnnotation = TRUE, # the default
+            genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
+                                          "Downstream", "Intergenic"), # the default
+            annoDb = .getGenomeObject(genomeAssembly, type = "packageName"), # optional, if provided, extra columns including SYMBOL, GENENAME, ENSEMBL/ENTREZID will be added
+            sameStrand = FALSE, # the default
+            ignoreOverlap = FALSE, # the default
+            ignoreUpstream = FALSE, # the default
+            ignoreDownstream = FALSE, # the default
+            overlap = "TSS", # the default
+            verbose = TRUE # the default
+        ))
+
+        GRN@annotation$consensusPeaks_obj = peaks.annotated
+
+        peaks.annotated.df = as.data.frame(peaks.annotated)
+        peaks.annotated.df$annotation[grepl("Exon", peaks.annotated.df$annotation)] = "Exon"
+        peaks.annotated.df$annotation[grepl("Intron", peaks.annotated.df$annotation)] = "Intron"
+
+        GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks,
+                                                         peaks.annotated.df %>%
+                                                             dplyr::select(peakID, annotation, tidyselect::starts_with("gene"), -geneId, distanceToTSS, ENSEMBL, SYMBOL, GENENAME) %>%
+                                                             dplyr::mutate(annotation = as.factor(annotation),
+                                                                           ENSEMBL = as.factor(ENSEMBL),
+                                                                           GENENAME = as.factor(GENENAME),
+                                                                           SYMBOL = as.factor(SYMBOL)),
+                                                         by = c("peak.ID" = "peakID")) %>%
+            dplyr::rename(peak.gene.chr = geneChr,
+                          peak.gene.start = geneStart,
+                          peak.gene.end = geneEnd,
+                          peak.gene.length = geneLength,
+                          peak.gene.strand = geneStrand,
+                          peak.gene.name = GENENAME,
+                          peak.gene.distanceToTSS = distanceToTSS,
+                          peak.gene.ENSEMBL = ENSEMBL,
+                          peak.gene.symbol = SYMBOL,
+                          peak.annotation = annotation
+            )

-    GRN
+    }
+
+
+
+
+    # Also add GC content as annotation columns
+    GRN = .calcGCContentPeaks(GRN)
+
+    GRN
+
}

.populateGeneAnnotation <- function (GRN) {
@@ -536,52 +536,52 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq_sizeFactor",
 #' @importFrom IRanges width
 #' @importFrom rlang .data `:=`
 .calcGCContentPeaks <- function(GRN) {
-
-    futile.logger::flog.info(paste0("Calculate GC-content for peaks... "))
-    start = Sys.time()
-    genomeAssembly = GRN@config$parameters$genomeAssembly
-    #TODO: GC content for all peaks
-    genome = .getGenomeObject(genomeAssembly, type = "BSgenome")
-
-    # Get peaks as GRanges object
-    query = .constructGRanges(GRN@data$peaks$consensusPeaks,
-                              seqlengths = .getChrLengths(genomeAssembly),
-                              genomeAssembly)
-
-    # Get DNAStringSet object
-    seqs_peaks = Biostrings::getSeq(genome, query)
-
-    GC_content.df = Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")
-
-    GC_content.df = GC_content.df %>%
-        tibble::as_tibble() %>%
-        dplyr::mutate(length = IRanges::width(query),
-                      peak.ID = query$peakID,
-                      GC_class = cut(`G|C`, breaks = seq(0,1,0.1), include.lowest = TRUE, ordered_result = TRUE))
-
-    GC_classes.df = GC_content.df %>%
-        dplyr::group_by(GC_class) %>%
-        dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(length), peak_width_sd = sd(length)) %>%
-        dplyr::ungroup() %>%
-        tidyr::complete(GC_class, fill = list(n = 0)) %>%
-        dplyr::mutate(n_rel = .data$n / length(query))
-
-    # TODO: Put where
-    #ggplot(GC_content.df, aes(GC.class)) + geom_histogram(stat = "count") + theme_bw()
-
-    #ggplot(GC_classes.df , aes(GC.class, n_rel)) + geom_bar(stat = "identity") + theme_bw()
-
-    GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, GC_content.df, by = "peak.ID") %>%
-        dplyr::rename( peak.GC.perc = `G|C`,
-                       peak.width = length,
-                       peak.GC.class = GC_class)
-
-    GRN@stats$peaks = list()
-    GRN@stats$peaks$GC = GC_classes.df
-
-    .printExecutionTime(start)
-
-    GRN
+
+    futile.logger::flog.info(paste0("Calculate GC-content for peaks... "))
+    start = Sys.time()
+    genomeAssembly = GRN@config$parameters$genomeAssembly
+    #TODO: GC content for all peaks
+    genome = .getGenomeObject(genomeAssembly, type = "BSgenome")
+
+    # Get peaks as GRanges object
+    query = .constructGRanges(GRN@data$peaks$consensusPeaks,
+                              seqlengths = .getChrLengths(genomeAssembly),
+                              genomeAssembly)
+
+    # Get DNAStringSet object
+    seqs_peaks = Biostrings::getSeq(genome, query)
+
+    GC_content.df = Biostrings::letterFrequency(seqs_peaks, "GC") / Biostrings::letterFrequency(seqs_peaks, "ACGT")
+
+    GC_content.df = GC_content.df %>%
+        tibble::as_tibble() %>%
+        dplyr::mutate(length = IRanges::width(query),
+                      peak.ID = query$peakID,
+                      GC_class = cut(`G|C`, breaks = seq(0,1,0.1), include.lowest = TRUE, ordered_result = TRUE))
+
+    GC_classes.df = GC_content.df %>%
+        dplyr::group_by(GC_class) %>%
+        dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(length), peak_width_sd = sd(length)) %>%
+        dplyr::ungroup() %>%
+        tidyr::complete(GC_class, fill = list(n = 0)) %>%
+        dplyr::mutate(n_rel = .data$n / length(query))
+
+    # TODO: Put where
+    #ggplot(GC_content.df, aes(GC.class)) + geom_histogram(stat = "count") + theme_bw()
+
+    #ggplot(GC_classes.df , aes(GC.class, n_rel)) + geom_bar(stat = "identity") + theme_bw()
+
+    GRN@annotation$consensusPeaks = dplyr::left_join(GRN@annotation$consensusPeaks, GC_content.df, by = "peak.ID") %>%
+        dplyr::rename( peak.GC.perc = `G|C`,
+                       peak.width = length,
+                       peak.GC.class = GC_class)
+
+    GRN@stats$peaks = list()
+    GRN@stats$peaks$GC = GC_classes.df
+
+    .printExecutionTime(start)
+
+    GRN
}

#' Filter data from a \code{\linkS4class{GRN}} object
@@ -637,8 +637,8 @@ filterData <- function (GRN,
    if(!is.null(GRN@data$TFs$TF_peak_overlap)) {
        GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = 0
    }
-
-
+
+
    # Filter peaks
    futile.logger::flog.info("FILTER PEAKS")
    peakIDs.CV = .filterPeaksByMeanCV(GRN,
@@ -660,7 +660,7 @@ filterData <- function (GRN,
    #GRN@data$peaks$counts_raw$isFiltered = ! GRN@data$peaks$counts_raw$peakID %in% peaks_toKeep
    GRN@data$peaks$counts_norm$isFiltered = ! GRN@data$peaks$counts_norm$peakID %in% peaks_toKeep

-
+
    if(!is.null(GRN@data$TFs$TF_peak_overlap)) {
        GRN@data$TFs$TF_peak_overlap[, "isFiltered"] = as.integer (! rownames(GRN@data$TFs$TF_peak_overlap) %in% peaks_toKeep)
    }
@@ -671,9 +671,9 @@ filterData <- function (GRN,
    # Filter peaks
    futile.logger::flog.info("FILTER RNA-seq")
    genes.CV = .filterGenesByMeanCV(GRN,
-                                    minMean = minNormalizedMeanRNA, maxMean = maxNormalizedMeanRNA,
-                                    minCV = minCV_genes, maxCV = maxCV_genes)
-
+                                    minMean = minNormalizedMeanRNA, maxMean = maxNormalizedMeanRNA,
+                                    minCV = minCV_genes, maxCV = maxCV_genes)
+
    futile.logger::flog.info(paste0(" Number of rows in total: ", nrow(GRN@data$RNA$counts_norm.l[["0"]])))
    for (permutationCur in c(0)) {
        permIndex = as.character(permutationCur)
@@ -681,7 +681,7 @@ filterData <- function (GRN,
        GRN@data$RNA$counts_norm.l[[permIndex]]$isFiltered = rowMeans < minNormalizedMeanRNA
    }
    nRowsFlagged = length(which(GRN@data$RNA$counts_norm.l[["0"]]$isFiltered))
-
+
    # Raw counts are left untouched and filtered where needed only
    futile.logger::flog.info(paste0(" Flagged ", nRowsFlagged, " rows because the row mean was smaller than ", minNormalizedMeanRNA))

@@ -699,7 +699,7 @@ filterData <- function (GRN,
    startTime = Sys.time()

    if (is.null(minSize_peaks)) {
-        minSize_peaks = 1
+        minSize_peaks = 1
    }

    futile.logger::flog.info(paste0("Filter and sort peaks and remain only those on the following chromosomes: ", paste0(chrToKeep, collapse = ",")))
@@ -730,111 +730,111 @@ filterData <- function (GRN,
}

.filterPeaksByCV <- function (GRN, minCV = 0, maxCV = 1e7) {
+
+    startTime = Sys.time()
+
+    if (is.null(maxCV)) {
+        futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV))

-    startTime = Sys.time()
-
-    if (is.null(maxCV)) {
-        futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV))
-
-        peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV)
-
-    } else {
-        futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV, ", max CV = ", maxCV))
-
-        peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV, peak.CV <= maxCV)
-    }
-
-    futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
+        peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV)

-    .printExecutionTime(startTime)
+    } else {
+        futile.logger::flog.info(paste0("Filter peaks by CV: Min CV = ", minCV, ", max CV = ", maxCV))

-    peaksFiltered$peak.ID
+        peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks, peak.CV >= minCV, peak.CV <= maxCV)
+    }
+
+    futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
+
+    .printExecutionTime(startTime)
+
+    peaksFiltered$peak.ID
}

.filterPeaksByMeanCV <- function (GRN, minMean = 0, maxMean = NULL, minCV = 0, maxCV = NULL) {
+
+    startTime = Sys.time()
+
+    futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$consensusPeaks)))
+
+    if (is.null(minCV)) {
+        minCV = 0
+    }
+
+    if (is.null(maxCV)) {
+        futile.logger::flog.info(paste0(" Filter peaks by CV: Min = ", minCV))
+        maxCV = 9e+99

-    startTime = Sys.time()
-
-    futile.logger::flog.info(paste0(" Number of peaks before filtering : ", nrow(GRN@annotation$consensusPeaks)))
-
-    if (is.null(minCV)) {
-        minCV = 0
-    }
-
-    if (is.null(maxCV)) {
-        futile.logger::flog.info(paste0(" Filter peaks by CV: Min = ", minCV))
-        maxCV = 9e+99
-
-    } else {
-        futile.logger::flog.info(paste0(" Filter peaks by CV: Min = ", minCV, ", Max = ", maxCV))
-    }
-
-
-    if (is.null(minMean)) {
-        minMean = 0
-    }
-
-    if (is.null(maxMean)) {
-        futile.logger::flog.info(paste0(" Filter peaks by mean: Min = ", minMean))
-        maxMean = 9e+99
-    } else {
-        futile.logger::flog.info(paste0(" Filter peaks by mean: Min = ", minMean, ", Max = ", maxMean))
-    }
-
-
-    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks,
-                                  peak.CV >= minCV, peak.CV <= maxCV,
-                                  peak.mean >= minMean, peak.mean <= maxMean)
-
-    futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
-
-    .printExecutionTime(startTime)
-
-    peaksFiltered$peak.ID
+    } else {
+        futile.logger::flog.info(paste0(" Filter peaks by CV: Min = ", minCV, ", Max = ", maxCV))
+    }
+
+
+    if (is.null(minMean)) {
+        minMean = 0
+    }
+
+    if (is.null(maxMean)) {
+        futile.logger::flog.info(paste0(" Filter peaks by mean: Min = ", minMean))
+        maxMean = 9e+99
+    } else {
+        futile.logger::flog.info(paste0(" Filter peaks by mean: Min = ", minMean, ", Max = ", maxMean))
+    }
+
+
+    peaksFiltered = dplyr::filter(GRN@annotation$consensusPeaks,
+                                  peak.CV >= minCV, peak.CV <= maxCV,
+                                  peak.mean >= minMean, peak.mean <= maxMean)
+
+    futile.logger::flog.info(paste0(" Number of peaks after filtering : ", nrow(peaksFiltered)))
+
+    .printExecutionTime(startTime)
+
+    peaksFiltered$peak.ID
}

.filterGenesByMeanCV <- function (GRN, minMean = 0, maxMean = NULL, minCV = 0, maxCV = NULL) {
+
+    startTime = Sys.time()
+
+    futile.logger::flog.info(paste0(" Number of genes before filtering : ", nrow(GRN@annotation$genes)))
+
+    if (is.null(minCV)) {
+        minCV = 0
+    }
+
+    if (is.null(maxCV)) {
+        futile.logger::flog.info(paste0(" Filter genes by CV: Min = ", minCV))
+        maxCV = 9e+99

-    startTime = Sys.time()
-
-    futile.logger::flog.info(paste0(" Number of genes before filtering : ", nrow(GRN@annotation$genes)))
-
-    if (is.null(minCV)) {
-        minCV = 0
-    }
-
-    if (is.null(maxCV)) {
-        futile.logger::flog.info(paste0(" Filter genes by CV: Min = ", minCV))
-        maxCV = 9e+99
-
-    } else {
-        futile.logger::flog.info(paste0(" Filter genes by CV: Min = ", minCV, ", Max = ", maxCV))
-    }
-
-
-    if (is.null(minMean)) {
-        minMean = 0
-    }
-
-
-    if (is.null(maxMean)) {
-        futile.logger::flog.info(paste0(" Filter genes by mean: Min = ", minMean))
-        maxMean = 9e+99
-    } else {
-        futile.logger::flog.info(paste0(" Filter genes by mean: Min = ", minMean, ", Max = ", maxMean))
-    }
-
-
-    genesFiltered = dplyr::filter(GRN@annotation$genes,
-                                  gene.CV >= minCV, gene.CV <= maxCV,
-                                  gene.mean >= minMean, gene.mean <= maxMean)
-
-
-    futile.logger::flog.info(paste0(" Number of genes after filtering : ", nrow(genesFiltered)))
-
-    .printExecutionTime(startTime)
-
-    genesFiltered$gene.ENSEMBL
+    } else {
+        futile.logger::flog.info(paste0(" Filter genes by CV: Min = ", minCV, ", Max = ", maxCV))
+    }
+
+
+    if (is.null(minMean)) {
+        minMean = 0
+    }
+
+
+    if (is.null(maxMean)) {
+        futile.logger::flog.info(paste0(" Filter genes by mean: Min = ", minMean))
+        maxMean = 9e+99
+    } else {
+        futile.logger::flog.info(paste0(" Filter genes by mean: Min = ", minMean, ", Max = ", maxMean))
+    }
+
+
+    genesFiltered = dplyr::filter(GRN@annotation$genes,
+                                  gene.CV >= minCV, gene.CV <= maxCV,
+                                  gene.mean >= minMean, gene.mean <= maxMean)
+
+
+    futile.logger::flog.info(paste0(" Number of genes after filtering : ", nrow(genesFiltered)))
+
+    .printExecutionTime(startTime)
+
+    genesFiltered$gene.ENSEMBL
}


@@ -855,9 +855,9 @@ filterData <- function (GRN,
 #' # See the Workflow vignette on the GRaNIE website for examples
 #' @export
 addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPattern = "_TFBS", fileEnding = ".bed", forceRerun = FALSE) {
-
+
    GRN = .addFunctionLogToObject(GRN)
-
+
    checkmate::assertClass(GRN, "GRN")
    checkmate::assertDirectoryExists(motifFolder)
    checkmate::assertCharacter(TFs, min.len = 1)
@@ -867,12 +867,12 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
    checkmate::assertFlag(forceRerun)

    if (is.null(GRN@data$TFs$translationTable) | is.null(GRN@data$TFs$translationTable) | is.null(GRN@config$allTF) | is.null(GRN@config$directories$motifFolder) | forceRerun) {
-
+
        GRN@config$TFBS_fileEnding = fileEnding
        GRN@config$TFBS_filePattern = filesTFBSPattern
        GRN@data$TFs$translationTable = .getFinalListOfTFs(motifFolder, filesTFBSPattern, fileEnding, TFs, nTFMax, getCounts(GRN, type = "rna", norm = TRUE, permuted = FALSE))

-
+
        GRN@data$TFs$translationTable = GRN@data$TFs$translationTable %>%
            dplyr::select(c("HOCOID", "ENSEMBL")) %>%
            dplyr::mutate(TF.name = HOCOID, TF.ENSEMBL = ENSEMBL)
@@ -884,8 +884,8 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
        # GRN@config$TF_list = list()
        # GRN@config$TF_list[["all_TFBS"]] =GRN@config$allTF
        GRN@config$directories$motifFolder = motifFolder
-
-
+
+
    }

    GRN
@@ -899,9 +899,9 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
    files = .createFileList(folder_input_TFBS, "*.bed*", recursive = FALSE, ignoreCase = FALSE, verbose = FALSE)
    TFsWithTFBSPredictions = gsub(pattern = filesTFBSPattern, "", tools::file_path_sans_ext(basename(files), compression = TRUE))
    TFsWithTFBSPredictions = gsub(pattern = fileEnding, "", TFsWithTFBSPredictions)
-
+
    futile.logger::flog.info(paste0("Found ", length(TFsWithTFBSPredictions), " matching TFs: ", paste0(TFsWithTFBSPredictions, collapse = ", ")))
-
+

    # Filter TFs
    if (length(TFs) == 1 && TFs == "all") {
@@ -946,7 +946,7 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
            allTF = allTF[seq_len(nTFMax)]
            futile.logger::flog.info(paste0("Updated list of TFs: ", paste0(allTF, collapse = ", ")))
        }
-
+
    }

    nTF = length(allTF)
@@ -974,18 +974,18 @@ addTFBS <- function(GRN, motifFolder, TFs = "all", nTFMax = NULL, filesTFBSPatte
 #' GRN = overlapPeaksAndTFBS(GRN, nCores = 2, forceRerun = FALSE)
 #' @export
 overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
-
+
    GRN = .addFunctionLogToObject(GRN)
-
+
    checkmate::assertClass(GRN, "GRN")
    checkmate::assertIntegerish(nCores, lower = 1)
    checkmate::assertFlag(forceRerun)

    if (is.null(GRN@data$TFs$TF_peak_overlap) | forceRerun) {

-
+
        futile.logger::flog.info(paste0("Overlap peaks and TFBS using ", nCores, " cores. This may take a few minutes..."))
-
+
        genomeAssembly = GRN@config$parameters$genomeAssembly
        seqlengths = .getChrLengths(genomeAssembly)

@@ -995,7 +995,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
            message = "Could not retrieve value from GRN@config$TFBS_filePattern. Please rerun the function addTFBS, as this was added in a recent version of the package."
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }
-
+

        # Check whether we have peaks on chromosomes not part of the sequence length reference. If yes, discard them
        annotation_discared = dplyr::filter(GRN@data$peaks$consensusPeaks, ! .data$chr %in% names(seqlengths))
@@ -1006,8 +1006,8 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
            tbl_discarded = tbl_discarded[which(tbl_discarded > 0)]

            futile.logger::flog.warn(paste0("Found ", sum(tbl_discarded), " regions from chromosomes without a reference length. ",
-                                            "Typically, these are random fragments from known or unknown chromosomes. The following regions will be discarded: \n",
-                                            paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ",")))
+                                            "Typically, these are random fragments from known or unknown chromosomes. The following regions will be discarded: \n",
+                                            paste0(names(tbl_discarded), " (", tbl_discarded, ")", collapse = ",")))

            GRN@data$peaks$consensusPeaks = dplyr::filter(GRN@data$peaks$consensusPeaks, .data$chr %in% names(seqlengths))
        }
@@ -1027,9 +1027,9 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
        if (!all(colnames(TFBS_bindingMatrix.df) %in% GRN@config$allTF)) {

            message = paste0("Internal mismatch detected between the TF names and the TF names derived from the translation file (see log, column HOCOID).",
-                             "This may happen if the genome assembly version has been changed, but intermediate files have not been properly recreated. ",
-                             "Set the parameter forceRerun to TRUE and rerun the script.")
-
+                             "This may happen if the genome assembly version has been changed, but intermediate files have not been properly recreated. ",
+                             "Set the parameter forceRerun to TRUE and rerun the script.")
+
            .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
        }

@@ -1037,11 +1037,11 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
        filteredPeaks = dplyr::filter(GRN@data$peaks$counts_norm, isFiltered) %>% dplyr::pull(peakID)
        # Collect binary 0/1 binding matrix from all TF and concatenate
        GRN@data$TFs$TF_peak_overlap = TFBS_bindingMatrix.df %>%
-            dplyr::mutate(peakID = GRN@data$peaks$consensusPeaks$peakID,
-                          isFiltered = peakID %in% filteredPeaks) %>%
-            dplyr::mutate_if(is.logical, as.numeric) %>%
-            dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), isFiltered)
-
+            dplyr::mutate(peakID = GRN@data$peaks$consensusPeaks$peakID,
+                          isFiltered = peakID %in% filteredPeaks) %>%
+            dplyr::mutate_if(is.logical, as.numeric) %>%
+            dplyr::select(tidyselect::all_of(sort(GRN@config$allTF)), isFiltered)
+
        GRN@data$TFs$TF_peak_overlap = .asSparseMatrix(as.matrix(GRN@data$TFs$TF_peak_overlap),
                                                       convertNA_to_zero = FALSE,
                                                       dimnames = list(GRN@data$peaks$consensusPeaks$peakID, colnames(GRN@data$TFs$TF_peak_overlap)))
@@ -1050,7 +1050,7 @@ overlapPeaksAndTFBS <- function(GRN, nCores = 2, forceRerun = FALSE) {
        # We resort it to match the countsPeaks.norm
        # TODO: Here could be an error due to differences in sorting. Also, whether or not all or the filtered version shall be used
        stopifnot(identical(rownames(GRN@data$TFs$TF_peak_overlap), GRN@data$peaks$counts_norm$peakID))
-
+
    }

    GRN
@@ -1233,7 +1233,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
    checkmate::assertChoice(normalization, c("cyclicLoess", "sizeFactors", "quantile", "none"))
    checkmate::assertCharacter(name, min.chars = 1, len = 1)
    checkmate::assertFlag(forceRerun)
-
+
    forbiddenNames = "expression"
    .checkForbiddenNames(name, forbiddenNames)

@@ -1247,7 +1247,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
            tibble::as_tibble()

        countsPeaks = .normalizeNewDataWrapper(GRN@data$peaks$counts_orig, normalization = normalization)
-
+
        stopifnot(identical(nrow(countsPeaks), nrow(GRN@data$TFs$TF_peak_overlap)))

        #Select a maximum set of TFs to run this for
@@ -1258,10 +1258,10 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac
        # Calculating TF activity is done for all TF that are available
        TF.activity.m = matrix(NA, nrow = length(allTF), ncol = length(GRN@config$sharedSamples),
                               dimnames = list(allTF, GRN@config$sharedSamples))
-
+

        pb <- progress::progress_bar$new(total = length(allTF))
-
+
        for (TFCur in allTF) {

            pb$tick()
@@ -1297,7 +1297,7 @@ addData_TFActivity <- function(GRN, normalization = "cyclicLoess", name = "TF_ac

        futile.logger::flog.info(paste0("Data already exists in object (slot ", name, "), nothing to do. Set forceRerun = TRUE to regenerate and overwrite."))
    }
-
+

    .printExecutionTime(start)

@@ -1416,7 +1416,7 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF

    forbiddenNames = c("TF_activity", "expression")
    .checkForbiddenNames(name, forbiddenNames)
-
+
    idColumns = idColumn
    data = dplyr::rename(data, TF.name = !!(nameColumn))
    idColumns = c(idColumns, "TF.name")
@@ -1481,7 +1481,7 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF


    } else {
-
+
        futile.logger::flog.info(paste0("Data already exists in object, nothing to do. Set forceRerun = TRUE to regenerate and overwrite."))

    }
@@ -1507,7 +1507,7 @@ importTFData <- function(GRN, data, name, idColumn = "ENSEMBL", nameColumn = "TF
 #' @examples
 #' # See the Workflow vignette on the GRaNIE website for examples
 #' GRN = loadExampleObject()
-#' GRN = AR_classification_wrapper(GRN, forceRerun = FALSE)
+#' GRN = AR_classification_wrapper(GRN, outputFolder = ".", forceRerun = FALSE)
 #' @export
 AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05,
                                       plot_minNoTFBS_heatmap = 100, deleteIntermediateData = TRUE,
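The hunk above updates the roxygen example for AR_classification_wrapper() so that outputFolder is passed explicitly. A minimal sketch of the documented call, taken from the @examples block (loadExampleObject() provides the example GRN object shipped with the package):

    GRN = loadExampleObject()
    GRN = AR_classification_wrapper(GRN, outputFolder = ".", forceRerun = FALSE)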
@@ -1516,7 +1516,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
                                      forceRerun = FALSE) {

    GRN = .addFunctionLogToObject(GRN)
-
+
    checkmate::assertClass(GRN, "GRN")
    checkmate::assertNumber(significanceThreshold_Wilcoxon, lower = 0, upper = 1)
    checkmate::assertNumber(plot_minNoTFBS_heatmap, lower = 1)
@@ -1544,7 +1544,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
    for (connectionTypeCur in connectionTypes) {

        futile.logger::flog.info(paste0(" Connection type ", connectionTypeCur, "\n"))
-
+
        for (permutationCur in allPermutations) {

            futile.logger::flog.info(paste0(" ", .getPermStr(permutationCur), "\n"))
@@ -1562,6 +1562,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
                is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background) |
                is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground) |
                is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background) |
+               is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor) |
                forceRerun
            ) {

@@ -1569,17 +1570,17 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05

                if (connectionTypeCur == "expression") {
                    counts1 = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(permutationCur))
-
+
                } else {

                    # TF activity data
                    counts1 = GRN@data$TFs[[connectionTypeCur]] %>%
                        dplyr::select(-TF.name)
-
+
                }

                futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
-
+
                counts_peaks = getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE)

                TF_peak_cor = .correlateMatrices(matrix1 = counts1,
@@ -1592,13 +1593,15 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
                GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background = res.l[["median_background"]]
                GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground = res.l[["foreground"]]
                GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background = res.l[["background"]]
+
+                GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor = TF_peak_cor
            }

            # Final classification: Calculate thresholds by calculating the quantiles of the background and compare the real values to the background
            if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l) | forceRerun) {
                GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l =
                    .calculate_classificationThresholds(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background),
-                                                        GRN@config$parameters)
+                                                        GRN@config$parameters)
            }

            if (is.null(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification) | forceRerun) {
@@ -1627,9 +1630,9 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
                fileCur = paste0(outputFolder, .getOutputFileName("plot_class_density"), "_", connectionTypeCur, suffixFile)
                if (!file.exists(fileCur) | forceRerun) {
                    .plot_density(.asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_foreground),
-                                  .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background),
-                                  corMethod,
-                                  fileCur, width = 5, height = 5)
+                                  .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor_background),
+                                  corMethod,
+                                  fileCur, width = 5, height = 5)
                } else {
                    futile.logger::flog.info(paste0(" File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
                }
@@ -1649,17 +1652,19 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05

                fileCur = paste0(outputFolder, .getOutputFileName("plot_class_densityClass"), "_", connectionTypeCur, suffixFile)
                if (!file.exists(fileCur) | forceRerun) {
+
+                    TF_peak_cor = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_peak_cor
                    peak_TF_overlapCur.df = .filterSortAndShuffle_peakTF_overlapTable(GRN, permutationCur, TF_peak_cor)
                    .plot_heatmapAR(TF.peakMatrix.df = peak_TF_overlapCur.df,
-                                    HOCOMOCO_mapping.df.exp = GRN@data$TFs$translationTable %>% dplyr::mutate(TF = TF.name),
-                                    sort.cor.m = TF_peak_cor,
-                                    par.l = GRN@config$parameters,
-                                    corMethod = corMethod,
-                                    median.cor.tfs = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground),
-                                    median.cor.tfs.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background),
-                                    act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l,
-                                    finalClassification = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification,
-                                    file = fileCur, width = 8, height = 15)
+                                    HOCOMOCO_mapping.df.exp = GRN@data$TFs$translationTable %>% dplyr::mutate(TF = TF.name),
+                                    sort.cor.m = TF_peak_cor,
+                                    par.l = GRN@config$parameters,
+                                    corMethod = corMethod,
+                                    median.cor.tfs = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_foreground),
+                                    median.cor.tfs.non = .asMatrixFromSparse(GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF_cor_median_background),
+                                    act.rep.thres.l = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$act.rep.thres.l,
+                                    finalClassification = GRN@data$TFs$classification[[permIndex]] [[connectionTypeCur]]$TF.classification,
+                                    file = fileCur, width = 8, height = 15)
                } else {
                    futile.logger::flog.info(paste0(" File ", fileCur, " already exists, not overwriting since forceRerun = FALSE"))
                }
@@ -1725,7 +1730,7 @@ AR_classification_wrapper<- function (GRN, significanceThreshold_Wilcoxon = 0.05
|
1725 |
1730 |
#' @examples
|
1726 |
1731 |
#' # See the Workflow vignette on the GRaNIE website for examples
|
1727 |
1732 |
#' GRN = loadExampleObject()
|
1728 |
|
-#' GRN = addConnections_TF_peak(GRN, forceRerun = FALSE)
|
|
1733 |
+#' GRN = addConnections_TF_peak(GRN, outputFolder = ".", forceRerun = FALSE)
|
1729 |
1734 |
#' @export
|
1730 |
1735 |
addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails = FALSE, outputFolder = NULL,
|
1731 |
1736 |
corMethod = "pearson",
|
... |
... |
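For orientation, the function documented in the hunk above can be tried on the package's example object. A minimal usage sketch based on the roxygen example above; the extra arguments shown are simply the defaults visible in the signature, and the values are illustrative only:

# Usage sketch for addConnections_TF_peak(), following the roxygen example above
# (assumes the GRaNIE package and its example object are installed)
library(GRaNIE)
GRN = loadExampleObject()
GRN = addConnections_TF_peak(GRN,
                             plotDiagnosticPlots = FALSE,  # skip diagnostic plots for a quick run
                             corMethod = "pearson",        # default correlation measure
                             maxFDRToStore = 0.3,          # TF-peak links above this FDR are not stored
                             useGCCorrection = FALSE,      # no GC-matched background
                             forceRerun = FALSE)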
@@ -1734,9 +1739,9 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1734 |
1739 |
maxFDRToStore = 0.3,
|
1735 |
1740 |
useGCCorrection = FALSE, percBackground_size = 75, percBackground_resample = TRUE,
|
1736 |
1741 |
forceRerun = FALSE) {
|
1737 |
|
-
|
|
1742 |
+
|
1738 |
1743 |
GRN = .addFunctionLogToObject(GRN)
|
1739 |
|
-
|
|
1744 |
+
|
1740 |
1745 |
checkmate::assertClass(GRN, "GRN")
|
1741 |
1746 |
checkmate::assertFlag(plotDiagnosticPlots)
|
1742 |
1747 |
checkmate::assertFlag(plotDetails)
|
... |
... |
@@ -1744,18 +1749,18 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1744 |
1749 |
|
1745 |
1750 |
GRN = .checkAndUpdateConnectionTypes(GRN) # For compatibility with older versions
|
1746 |
1751 |
checkmate::assertSubset(connectionTypes, GRN@config$TF_peak_connectionTypes)
|
1747 |
|
-
|
|
1752 |
+
|
1748 |
1753 |
checkmate::assertLogical(removeNegativeCorrelation, any.missing = FALSE, len = length(connectionTypes))
|
1749 |
1754 |
|
1750 |
1755 |
|
1751 |
1756 |
#checkmate::assert(checkmate::checkSubset(add_TFActivity, c("none", "calculate", names(slot(GRN, "data")[["TFs"]]))))
|
1752 |
|
-
|
|
1757 |
+
|
1753 |
1758 |
checkmate::assertNumber(maxFDRToStore, lower = 0, upper = 1)
|
1754 |
1759 |
checkmate::assertFlag(useGCCorrection)
|
1755 |
1760 |
checkmate::assertNumber(percBackground_size, lower = 0, upper = 100)
|
1756 |
1761 |
checkmate::assertFlag(percBackground_resample)
|
1757 |
1762 |
checkmate::assertFlag(forceRerun)
|
1758 |
|
-
|
|
1763 |
+
|
1759 |
1764 |
|
1760 |
1765 |
if (is.null(GRN@connections$TF_peaks) | forceRerun) {
|
1761 |
1766 |
|
... |
... |
@@ -1763,12 +1768,12 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1763 |
1768 |
|
1764 |
1769 |
GRN@config$parameters$corMethod_TF_Peak = corMethod
|
1765 |
1770 |
GRN@config$parameters$useGCCorrection = useGCCorrection
|
1766 |
|
-
|
|
1771 |
+
|
1767 |
1772 |
if (is.null(GRN@data$TFs$TF_peak_overlap)) {
|
1768 |
|
- message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
|
1769 |
|
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
|
|
1773 |
+ message = paste0("Could not find peak - TF matrix. Run the function overlapPeaksAndTFBS first / again")
|
|
1774 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
|
1770 |
1775 |
}
|
1771 |
|
-
|
|
1776 |
+
|
1772 |
1777 |
for (permutationCur in 0:.getMaxPermutation(GRN)) {
|
1773 |
1778 |
|
1774 |
1779 |
futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
|
... |
... |
@@ -1792,11 +1797,11 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1792 |
1797 |
|
1793 |
1798 |
} #end for each permutation
|
1794 |
1799 |
|
1795 |
|
-
|
|
1800 |
+
|
1796 |
1801 |
if (plotDiagnosticPlots) {
|
1797 |
|
-
|
1798 |
|
- plotDiagnosticPlots_TFPeaks(GRN, outputFolder = outputFolder, plotDetails = FALSE, forceRerun = forceRerun)
|
1799 |
|
-
|
|
1802 |
+
|
|
1803 |
+ plotDiagnosticPlots_TFPeaks(GRN, outputFolder = outputFolder, plotDetails = FALSE, forceRerun = forceRerun)
|
|
1804 |
+
|
1800 |
1805 |
}
|
1801 |
1806 |
|
1802 |
1807 |
}
|
... |
... |
@@ -1810,10 +1815,10 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1810 |
1815 |
removeNegativeCorrelation, maxFDRToStore = 0.3 , percBackground_size = 75, percBackground_resample = TRUE, plotDetails = FALSE) {
|
1811 |
1816 |
|
1812 |
1817 |
start = Sys.time()
|
1813 |
|
-
|
|
1818 |
+
|
1814 |
1819 |
if (plotDetails) {
|
1815 |
|
- message = "Plotting details is not supported currently. Set plotDetails = FALSE."
|
1816 |
|
- .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
|
|
1820 |
+ message = "Plotting details is not supported currently. Set plotDetails = FALSE."
|
|
1821 |
+ .checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
|
1817 |
1822 |
}
|
1818 |
1823 |
|
1819 |
1824 |
peak_TF_overlap.df= .filterSortAndShuffle_peakTF_overlapTable(GRN, perm)
|
... |
... |
@@ -1831,17 +1836,17 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1831 |
1836 |
|
1832 |
1837 |
futile.logger::flog.info(paste0("Calculate TF-peak links for connection type ", connectionTypeCur))
|
1833 |
1838 |
start2 = Sys.time()
|
1834 |
|
-
|
|
1839 |
+
|
1835 |
1840 |
if (connectionTypeCur == "expression") {
|
1836 |
1841 |
|
1837 |
1842 |
counts_connectionTypeCur = getCounts(GRN, type = "rna", norm = TRUE, permuted = as.logical(perm))
|
1838 |
|
-
|
|
1843 |
+
|
1839 |
1844 |
} else {
|
1840 |
1845 |
|
1841 |
1846 |
# Keep only Ensembl ID here
|
1842 |
1847 |
counts_connectionTypeCur = GRN@data$TFs[[connectionTypeCur]] %>%
|
1843 |
1848 |
dplyr::select(-TF.name)
|
1844 |
|
-
|
|
1849 |
+
|
1845 |
1850 |
}
|
1846 |
1851 |
|
1847 |
1852 |
futile.logger::flog.info(paste0(" Correlate ", connectionTypeCur, " and peak counts"))
|
... |
... |
@@ -1872,7 +1877,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1872 |
1877 |
sort.cor.m.sort<-peaksCor.m[,colnames(peak_TF_overlap.df)]
|
1873 |
1878 |
|
1874 |
1879 |
stopifnot(identical(colnames(sort.cor.m.sort), colnames(peak_TF_overlap.df)))
|
1875 |
|
-
|
|
1880 |
+
|
1876 |
1881 |
futile.logger::flog.info(paste0(" Compute FDR for each TF. This may take a while..."))
|
1877 |
1882 |
|
1878 |
1883 |
pb <- progress::progress_bar$new(total = length(allTF))
|
... |
... |
@@ -1880,9 +1885,9 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1880 |
1885 |
peaksFiltered = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
|
1881 |
1886 |
|
1882 |
1887 |
if (!useGCCorrection) {
|
1883 |
|
- minPerc = 100
|
1884 |
|
- # Since we do not control for this, we set it to NA
|
1885 |
|
- background_match_success = NA
|
|
1888 |
+ minPerc = 100
|
|
1889 |
+ # Since we do not control for this, we set it to NA
|
|
1890 |
+ background_match_success = NA
|
1886 |
1891 |
}
|
1887 |
1892 |
|
1888 |
1893 |
for (TFCur in allTF) {
|
... |
... |
@@ -1895,7 +1900,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1895 |
1900 |
|
1896 |
1901 |
tp <- sort.cor.m.sort[overlapYes, TFCur]
|
1897 |
1902 |
n_tp = length(tp)
|
1898 |
|
-
|
|
1903 |
+
|
1899 |
1904 |
peaksForeground = peaksFiltered %>% dplyr::slice(overlapYes)
|
1900 |
1905 |
peaksBackground = peaksBackgroundGC = peaksFiltered %>% dplyr::slice(overlapNo)
|
1901 |
1906 |
|
... |
... |
@@ -1906,138 +1911,138 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
1906 |
1911 |
#start = Sys.time()
|
1907 |
1912 |
# GC-adjust background and select background regions according to foreground
|
1908 |
1913 |
if (useGCCorrection) {
|
|
1914 |
+
|
|
1915 |
+ fp_orig <- sort.cor.m.sort[overlapNo, TFCur]
|
|
1916 |
+ n_fp_orig = length(fp_orig)
|
|
1917 |
+
|
|
1918 |
+ # Get GC info from those peaks from the foreground
|
|
1919 |
+ GC_classes_foreground.df = peaksForeground %>%
|
|
1920 |
+ dplyr::group_by(GC_class) %>%
|
|
1921 |
+ dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
|
|
1922 |
+ dplyr::ungroup() %>%
|
|
1923 |
+ tidyr::complete(GC_class, fill = list(n = 0)) %>%
|
|
1924 |
+ dplyr::mutate(n_rel = .data$n / nPeaksForeground, type = "foreground") %>%
|
|
1925 |
+ dplyr::arrange(dplyr::desc(n_rel))
|
|
1926 |
+
|
|
1927 |
+ GC_classes_background.df = peaksBackground %>%
|
|
1928 |
+ dplyr::group_by(GC_class) %>%
|
|
1929 |
+ dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
|
|
1930 |
+ dplyr::ungroup() %>%
|
|
1931 |
+ tidyr::complete(GC_class, fill = list(n = 0)) %>%
|
|
1932 |
+ dplyr::mutate(n_rel = .data$n / nPeaksBackground, type = "background_orig")
|
|
1933 |
+
|
|
1934 |
+
|
|
1935 |
+
|
|
1936 |
+ background_match_success = TRUE
|
|
1937 |
+
|
|
1938 |
+ if (!is.null(percBackground_size)) {
|
|
1939 |
+ minPerc = percBackground_size
|
|
1940 |
+ } else {
|
1909 |
1941 |
|
1910 |
|
- fp_orig <- sort.cor.m.sort[overlapNo, TFCur]
|
1911 |
|
- n_fp_orig = length(fp_orig)
|
1912 |
|
-
|
1913 |
|
- # Get GC info from those peaks from the foreground
|
1914 |
|
- GC_classes_foreground.df = peaksForeground %>%
|
1915 |
|
- dplyr::group_by(GC_class) %>%
|
1916 |
|
- dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
|
1917 |
|
- dplyr::ungroup() %>%
|
1918 |
|
- tidyr::complete(GC_class, fill = list(n = 0)) %>%
|
1919 |
|
- dplyr::mutate(n_rel = .data$n / nPeaksForeground, type = "foreground") %>%
|
1920 |
|
- dplyr::arrange(dplyr::desc(n_rel))
|
1921 |
|
-
|
1922 |
|
- GC_classes_background.df = peaksBackground %>%
|
1923 |
|
- dplyr::group_by(GC_class) %>%
|
1924 |
|
- dplyr::summarise(n= dplyr::n(), peak_width_mean = mean(peak_width), peak_width_sd = sd(peak_width)) %>%
|
1925 |
|
- dplyr::ungroup() %>%
|
1926 |
|
- tidyr::complete(GC_class, fill = list(n = 0)) %>%
|
1927 |
|
- dplyr::mutate(n_rel = .data$n / nPeaksBackground, type = "background_orig")
|
1928 |
|
-
|
|
1942 |
+ # Find a matching background of maximal size
|
|
1943 |
+ minPerc= min(GC_class.all$maxSizeBackground) / nPeaksBackground
|
1929 |
1944 |
|
|
1945 |
+ threshold_percentage = 0.05
|
|
1946 |
+ minPerc = .findMaxBackgroundSize(GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage = threshold_percentage)
|
1930 |
1947 |
|
1931 |
|
- background_match_success = TRUE
|
|
1948 |
+ if (minPerc == 0) {
|
|
1949 |
+ #futile.logger::flog.warn(paste0(" Mimicking the foreground failed for TF ", TFCur, ". The background will only be approximated as good as possible using 5% of the peaks."))
|
|
1950 |
+ minPerc = 5
|
|
1951 |
+ background_match_success = FALSE
|
|
1952 |
+ }
|
|
1953 |
+ }
|
|
1954 |
+
|
|
1955 |
+ targetNoPeaks = minPerc/100 * nPeaksBackground
|
|
1956 |
+
|
|
1957 |
+ # TODO: Make 1000 a parameter stored somewhere
|
|
1958 |
+ # Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions
|
|
1959 |
+ if (targetNoPeaks < 1000) {
|
1932 |
1960 |
|
1933 |
1961 |
if (!is.null(percBackground_size)) {
|
1934 |
|
- minPerc = percBackground_size
|
|
1962 |
+ #futile.logger::flog.warn(paste0("Number of peaks in background is smaller than 1000 for TF ", TFCur, ". Increasing in steps of 5% until a value > 1000 is found.This warning results from a too low value for the parameter percBackground_size"))
|
1935 |
1963 |
} else {
|
1936 |
|
-
|
1937 |
|
- # Find a matching background of maximal size
|
1938 |
|
- minPerc= min(GC_class.all$maxSizeBackground) / nPeaksBackground
|
1939 |
|
-
|
1940 |
|
- threshold_percentage = 0.05
|
1941 |
|
- minPerc = .findMaxBackgroundSize(GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage = threshold_percentage)
|
1942 |
|
-
|
1943 |
|
- if (minPerc == 0) {
|
1944 |
|
- #futile.logger::flog.warn(paste0(" Mimicking the foreground failed for TF ", TFCur, ". The background will only be approximated as good as possible using 5% of the peaks."))
|
1945 |
|
- minPerc = 5
|
1946 |
|
- background_match_success = FALSE
|
1947 |
|
- }
|
|
1964 |
+ background_match_success = FALSE
|
1948 |
1965 |
}
|
1949 |
1966 |
|
1950 |
|
- targetNoPeaks = minPerc/100 * nPeaksBackground
|
1951 |
|
-
|
1952 |
|
- # TODO: Make 1000 a parameter stored somewhere
|
1953 |
|
- # Ensure a minimum no of points in background, even if this sacrifices the mimicking of the distributions
|
1954 |
|
- if (targetNoPeaks < 1000) {
|
1955 |
|
-
|
1956 |
|
- if (!is.null(percBackground_size)) {
|
1957 |
|
- #futile.logger::flog.warn(paste0("Number of peaks in background is smaller than 1000 for TF ", TFCur, ". Increasing in steps of 5% until a value > 1000 is found.This warning results from a too low value for the parameter percBackground_size"))
|
1958 |
|
- } else {
|
1959 |
|
- background_match_success = FALSE
|
1960 |
|
- }
|
1961 |
|
-
|
1962 |
|
- targetNoPeaksNew = targetNoPeaks
|
1963 |
|
- while(targetNoPeaksNew < 1000) {
|
1964 |
|
- minPerc = minPerc + 5
|
1965 |
|
- targetNoPeaksNew = minPerc/100 * nPeaksBackground
|
1966 |
|
- }
|
1967 |
|
-
|
1968 |
|
-
|
|
1967 |
+ targetNoPeaksNew = targetNoPeaks
|
|
1968 |
+ while(targetNoPeaksNew < 1000) {
|
|
1969 |
+ minPerc = minPerc + 5
|
|
1970 |
+ targetNoPeaksNew = minPerc/100 * nPeaksBackground
|
1969 |
1971 |
}
|
1970 |
1972 |
|
1971 |
|
- # Add sel. minPerc to table and calculate required frequencies
|
1972 |
|
-
|
1973 |
|
- GC_classes_all.df = dplyr::full_join(GC_classes_foreground.df, GC_classes_background.df, suffix = c(".fg",".bg"), by = "GC_class") %>%
|
1974 |
|
- dplyr::mutate(maxSizeBackground = n.bg / n_rel.fg,
|
1975 |
|
- n.bg.needed = floor(n_rel.fg * targetNoPeaks),
|
1976 |
|
- n.bg.needed.perc = n.bg / n.bg.needed)
|
1977 |
1973 |
|
|
1974 |
+ }
|
|
1975 |
+
|
|
1976 |
+ # Add sel. minPerc to table and calculate required frequencies
|
|
1977 |
+
|
|
1978 |
+ GC_classes_all.df = dplyr::full_join(GC_classes_foreground.df, GC_classes_background.df, suffix = c(".fg",".bg"), by = "GC_class") %>%
|
|
1979 |
+ dplyr::mutate(maxSizeBackground = n.bg / n_rel.fg,
|
|
1980 |
+ n.bg.needed = floor(n_rel.fg * targetNoPeaks),
|
|
1981 |
+ n.bg.needed.perc = n.bg / n.bg.needed)
|
|
1982 |
+
|
|
1983 |
+
|
|
1984 |
+ #futile.logger::flog.info(paste0( " GC-adjustment: Randomly select a total of ", round(targetNoPeaks,0),
|
|
1985 |
+ # " peaks (", minPerc, " %) from the background (out of ", nrow(peaksBackground),
|
|
1986 |
+ # " overall) in a GC-binwise fashion to mimick the foreground distribution"))
|
|
1987 |
+
|
|
1988 |
+ # Now we know the percentage, lets select an appropriate background
|
|
1989 |
+ # Sample peaks from background for each GC-bin specifically
|
|
1990 |
+ peakIDsSel = c()
|
|
1991 |
+ for (i in seq_len(nrow(GC_classes_foreground.df))) {
|
1978 |
1992 |
|
1979 |
|
- #futile.logger::flog.info(paste0( " GC-adjustment: Randomly select a total of ", round(targetNoPeaks,0),
|
1980 |
|
- # " peaks (", minPerc, " %) from the background (out of ", nrow(peaksBackground),
|
1981 |
|
- # " overall) in a GC-binwise fashion to mimick the foreground distribution"))
|
|
1993 |
+ peaksBackgroundGCCur = peaksBackground %>% dplyr::filter(GC_class == GC_classes_foreground.df$GC_class[i])
|
1982 |
1994 |
|
1983 |
|
- # Now we know the percentage, lets select an appropriate background
|
1984 |
|
- # Sample peaks from background for each GC-bin specifically
|
1985 |
|
- peakIDsSel = c()
|
1986 |
|
- for (i in seq_len(nrow(GC_classes_foreground.df))) {
|
1987 |
|
-
|
1988 |
|
- peaksBackgroundGCCur = peaksBackground %>% dplyr::filter(GC_class == GC_classes_foreground.df$GC_class[i])
|
1989 |
|
-
|
1990 |
|
- if (nrow( peaksBackgroundGCCur) == 0) {
|
1991 |
|
- next
|
1992 |
|
- }
|
1993 |
|
-
|
1994 |
|
- #Select the minimum, which for low % classes is smaller than the required number to mimic the foreground 100%
|
1995 |
|
- if (percBackground_resample) {
|
1996 |
|
- nPeaksCur = GC_classes_all.df$n.bg.needed[i]
|
1997 |
|
- } else {
|
1998 |
|
- nPeaksCur = min(GC_classes_all.df$n.bg.needed[i], nrow(peaksBackgroundGCCur))
|
1999 |
|
- }
|
2000 |
|
-
|
2001 |
|
- if (GC_classes_all.df$n.bg.needed[i] > nrow(peaksBackgroundGCCur)) {
|
2002 |
|
- peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = percBackground_resample) %>% dplyr::pull(peakID))
|
2003 |
|
- } else {
|
2004 |
|
- peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = FALSE) %>% dplyr::pull(peakID))
|
2005 |
|
- }
|
2006 |
|
- # Take a sample from the background, and record the row numbers
|
2007 |
|
-
|
|
1995 |
+ if (nrow( peaksBackgroundGCCur) == 0) {
|
|
1996 |
+ next
|
2008 |
1997 |
}
|
2009 |
1998 |
|
2010 |
|
- # We cannot simply select now the peakIDs, as some peaks may be present multiple times
|
2011 |
|
- #peaksBackgroundGC = peaksBackground %>% dplyr::filter(peakID %in% peakIDsSel)
|
2012 |
|
- peaksBackgroundGC = peaksBackground[match(peakIDsSel, peaksBackground$peakID),]
|
2013 |
|
-
|
2014 |
|
-
|
2015 |
|
- if (is.null(plots_GC.l[[TFCur]])) {
|
2016 |
|
- plots_GC.l[[TFCur]] = .generateTF_GC_diagnosticPlots(TFCur, GC_classes_foreground.df, GC_classes_background.df, GC_classes_all.df, peaksForeground, peaksBackground, peaksBackgroundGC)
|
|
1999 |
+ #Select the minimum, which for low % classes is smaller than the required number to mimic the foreground 100%
|
|
2000 |
+ if (percBackground_resample) {
|
|
2001 |
+ nPeaksCur = GC_classes_all.df$n.bg.needed[i]
|
|
2002 |
+ } else {
|
|
2003 |
+ nPeaksCur = min(GC_classes_all.df$n.bg.needed[i], nrow(peaksBackgroundGCCur))
|
2017 |
2004 |
}
|
2018 |
2005 |
|
2019 |
|
-
|
2020 |
|
- # Select the rows by their peak IDs
|
2021 |
|
- fp = sort.cor.m.sort[peakIDsSel, TFCur]
|
|
2006 |
+ if (GC_classes_all.df$n.bg.needed[i] > nrow(peaksBackgroundGCCur)) {
|
|
2007 |
+ peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = percBackground_resample) %>% dplyr::pull(peakID))
|
|
2008 |
+ } else {
|
|
2009 |
+ peakIDsSel = c(peakIDsSel, peaksBackgroundGCCur %>% dplyr::sample_n(nPeaksCur, replace = FALSE) %>% dplyr::pull(peakID))
|
|
2010 |
+ }
|
|
2011 |
+ # Take a sample from the background, and record the row numbers
|
2022 |
2012 |
|
2023 |
|
- fp_orig = sort.cor.m.sort[overlapNo, TFCur]
|
2024 |
|
- n_fp_orig = length(fp_orig)
|
2025 |
|
-
|
|
2013 |
+ }
|
|
2014 |
+
|
|
2015 |
+ # We cannot simply select now the peakIDs, as some peaks may be present multiple times
|
|
2016 |
+ #peaksBackgroundGC = peaksBackground %>% dplyr::filter(peakID %in% peakIDsSel)
|
|
2017 |
+ peaksBackgroundGC = peaksBackground[match(peakIDsSel, peaksBackground$peakID),]
|
|
2018 |
+
|
|
2019 |
+
|
|
2020 |
+ if (is.null(plots_GC.l[[TFCur]])) {
|
|
2021 |
+ plots_GC.l[[TFCur]] = .generateTF_GC_diagnosticPlots(TFCur, GC_classes_foreground.df, GC_classes_background.df, GC_classes_all.df, peaksForeground, peaksBackground, peaksBackgroundGC)
|
|
2022 |
+ }
|
|
2023 |
+
|
|
2024 |
+
|
|
2025 |
+ # Select the rows by their peak IDs
|
|
2026 |
+ fp = sort.cor.m.sort[peakIDsSel, TFCur]
|
|
2027 |
+
|
|
2028 |
+ fp_orig = sort.cor.m.sort[overlapNo, TFCur]
|
|
2029 |
+ n_fp_orig = length(fp_orig)
|
|
2030 |
+
|
2026 |
2031 |
} else {
|
2027 |
|
- # TODO: Redudnant so far in this case
|
2028 |
|
- fp = fp_orig = sort.cor.m.sort[overlapNo, TFCur]
|
2029 |
|
- n_fp_orig = length(fp_orig)
|
|
2032 |
+ # TODO: Redudnant so far in this case
|
|
2033 |
+ fp = fp_orig = sort.cor.m.sort[overlapNo, TFCur]
|
|
2034 |
+ n_fp_orig = length(fp_orig)
|
2030 |
2035 |
}
|
2031 |
2036 |
|
2032 |
2037 |
n_fp = length(fp)
|
2033 |
|
-
|
|
2038 |
+
|
2034 |
2039 |
cor.peak.tf = tibble::tibble(peak.ID = rownames(sort.cor.m.sort)[overlapYes],
|
2035 |
|
- TF_peak.r = sort.cor.m.sort[overlapYes, TFCur],
|
2036 |
|
- TF.name = as.factor(TFCur),
|
2037 |
|
- TF_peak.connectionType = as.factor(connectionTypeCur))
|
|
2040 |
+ TF_peak.r = sort.cor.m.sort[overlapYes, TFCur],
|
|
2041 |
+ TF.name = as.factor(TFCur),
|
|
2042 |
+ TF_peak.connectionType = as.factor(connectionTypeCur))
|
2038 |
2043 |
|
2039 |
2044 |
#val.sign = (median(tp) - median(fp))
|
2040 |
|
- # val.sign_orig = (median(tp) - median(fp_orig))
|
|
2045 |
+ # val.sign_orig = (median(tp) - median(fp_orig))
|
2041 |
2046 |
|
2042 |
2047 |
#.printExecutionTime(start, "Interval 2: ")
|
2043 |
2048 |
|
... |
... |
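The GC correction in the hunk above draws the background GC-class-wise so that its GC composition mimics the foreground before the two correlation sets are compared. A stripped-down sketch of that sampling idea on an invented peak table (variable and column names follow the code above, the numbers are made up):

# Toy sketch of GC-matched background sampling (invented data)
library(dplyr)
library(tibble)

set.seed(1)
peaks = tibble(peakID       = paste0("peak", 1:1000),
               GC_class     = sample(c("low", "mid", "high"), 1000, replace = TRUE, prob = c(0.2, 0.5, 0.3)),
               inForeground = sample(c(TRUE, FALSE), 1000, replace = TRUE, prob = c(0.2, 0.8)))

peaksForeground = filter(peaks, inForeground)
peaksBackground = filter(peaks, !inForeground)

# Relative GC-class frequencies of the foreground: what the background should mimic
GC_classes_foreground.df = peaksForeground %>%
  count(GC_class) %>%
  mutate(n_rel = n / sum(n))

# Sample the background class by class so its composition matches the foreground
targetNoPeaks = 300
peakIDsSel = c()
for (i in seq_len(nrow(GC_classes_foreground.df))) {
  bgCur      = filter(peaksBackground, GC_class == GC_classes_foreground.df$GC_class[i])
  nPeaksCur  = min(floor(GC_classes_foreground.df$n_rel[i] * targetNoPeaks), nrow(bgCur))
  peakIDsSel = c(peakIDsSel, sample(bgCur$peakID, nPeaksCur))
}
peaksBackgroundGC = peaksBackground[match(peakIDsSel, peaksBackground$peakID), ]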
@@ -2068,26 +2073,26 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
2068 |
2073 |
cor.peak.tf$TF_peak.r_bin <- as.character(cut(cor.peak.tf$TF_peak.r, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
|
2069 |
2074 |
#.printExecutionTime(start, "Interval 3a: ")
|
2070 |
2075 |
#start = Sys.time()
|
2071 |
|
-
|
|
2076 |
+
|
2072 |
2077 |
|
2073 |
2078 |
i = 0
|
2074 |
2079 |
for (thres in stepsCur) {
|
2075 |
|
- i = i + 1
|
|
2080 |
+ i = i + 1
|
|
2081 |
+
|
|
2082 |
+ # na.rm = TRUE for all sums here to make sure NAs will not cause a problem
|
|
2083 |
+ if (directionCur == "pos") {
|
2076 |
2084 |
|
2077 |
|
- # na.rm = TRUE for all sums here to make sure NAs will not cause a problem
|
2078 |
|
- if (directionCur == "pos") {
|
2079 |
|
-
|
2080 |
|
- n_tp2.vec[i] = sum(tp>=thres, na.rm = TRUE)
|
2081 |
|
- n_fp2.vec[i] = sum(fp>=thres, na.rm = TRUE)
|
2082 |
|
- n_fp2_orig.vec[i] = sum(fp_orig>=thres, na.rm = TRUE)
|
2083 |
|
-
|
2084 |
|
- } else {
|
2085 |
|
-
|
2086 |
|
- n_tp2.vec[i] = sum(tp<thres, na.rm = TRUE)
|
2087 |
|
- n_fp2.vec[i] = sum(fp<thres, na.rm = TRUE)
|
2088 |
|
- n_fp2_orig.vec[i] = sum(fp_orig<thres, na.rm = TRUE)
|
2089 |
|
- }
|
|
2085 |
+ n_tp2.vec[i] = sum(tp>=thres, na.rm = TRUE)
|
|
2086 |
+ n_fp2.vec[i] = sum(fp>=thres, na.rm = TRUE)
|
|
2087 |
+ n_fp2_orig.vec[i] = sum(fp_orig>=thres, na.rm = TRUE)
|
|
2088 |
+
|
|
2089 |
+ } else {
|
2090 |
2090 |
|
|
2091 |
+ n_tp2.vec[i] = sum(tp<thres, na.rm = TRUE)
|
|
2092 |
+ n_fp2.vec[i] = sum(fp<thres, na.rm = TRUE)
|
|
2093 |
+ n_fp2_orig.vec[i] = sum(fp_orig<thres, na.rm = TRUE)
|
|
2094 |
+ }
|
|
2095 |
+
|
2091 |
2096 |
}
|
2092 |
2097 |
|
2093 |
2098 |
#.printExecutionTime(start, "Interval 3b_new: ")
|
... |
... |
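The threshold loop in the hunk above only counts, for each correlation cutoff, how many foreground (tp) and background (fp) correlations pass it, once for the positive and once for the negative direction. The same counts on invented correlation vectors, written with vapply instead of the explicit index bookkeeping:

# Sketch: per-threshold counts of foreground/background correlations (invented data)
set.seed(1)
tp = rnorm(500,  mean = 0.2, sd = 0.3)   # correlations of peaks carrying the TF motif (foreground)
fp = rnorm(5000, mean = 0.0, sd = 0.3)   # correlations of peaks without the motif (background)

stepsCur = seq(-1, 1, by = 0.05)         # correlation bin boundaries (illustrative)

# "pos" direction: how many correlations are >= each threshold
n_tp2.vec = vapply(stepsCur, function(thres) sum(tp >= thres, na.rm = TRUE), numeric(1))
n_fp2.vec = vapply(stepsCur, function(thres) sum(fp >= thres, na.rm = TRUE), numeric(1))

# "neg" direction: how many correlations are < each threshold
n_tp2_neg.vec = vapply(stepsCur, function(thres) sum(tp < thres, na.rm = TRUE), numeric(1))
n_fp2_neg.vec = vapply(stepsCur, function(thres) sum(fp < thres, na.rm = TRUE), numeric(1))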
@@ -2097,30 +2102,30 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
2097 |
2102 |
# The maximum number is then identical to the maximum for the true positives
|
2098 |
2103 |
n_fp2_norm.vec = (n_fp2.vec/(n_fp/n_tp))
|
2099 |
2104 |
n_fp2_orig_norm.vec = (n_fp2_orig.vec/(n_fp_orig/n_tp))
|
2100 |
|
-
|
|
2105 |
+
|
2101 |
2106 |
#TODO: Decide for a variant. +1 for raw or unnormalized values?
|
2102 |
|
-
|
|
2107 |
+
|
2103 |
2108 |
|
2104 |
2109 |
fdr.curve = tibble::tibble(
|
2105 |
|
- TF_peak.r_bin2 = stepsCur,
|
2106 |
|
- tpvalue = n_tp2.vec,
|
2107 |
|
-
|
2108 |
|
- fpvalue = n_fp2.vec,
|
2109 |
|
- fpvalue_orig = n_fp2_orig.vec,
|
2110 |
|
-
|
2111 |
|
- fpvalue_norm = n_fp2_norm.vec,
|
2112 |
|
- fpvalue_norm_orig = n_fp2_orig_norm.vec,
|
2113 |
|
-
|
2114 |
|
- TF_peak.fdr_orig = (n_fp2_orig_norm.vec) / (n_fp2_orig_norm.vec + n_tp2.vec),
|
2115 |
|
- TF_peak.fdr = (n_fp2_norm.vec) / (n_fp2_norm.vec + n_tp2.vec),
|
2116 |
|
- # TF_peak.fdr_orig = (n_fp2_orig_norm.vec + 1) / (n_fp2_orig_norm.vec + n_tp2.vec + 1),
|
2117 |
|
- # TF_peak.fdr = (n_fp2_norm.vec +1) / (n_fp2_norm.vec + n_tp2.vec + 1),
|
2118 |
|
- #
|
2119 |
|
- TF_peak.fdr_direction = directionCur,
|
2120 |
|
- TF_peak.r_bin =
|
2121 |
|
- as.character(cut(TF_peak.r_bin2, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
|
|
2110 |
+ TF_peak.r_bin2 = stepsCur,
|
|
2111 |
+ tpvalue = n_tp2.vec,
|
|
2112 |
+
|
|
2113 |
+ fpvalue = n_fp2.vec,
|
|
2114 |
+ fpvalue_orig = n_fp2_orig.vec,
|
|
2115 |
+
|
|
2116 |
+ fpvalue_norm = n_fp2_norm.vec,
|
|
2117 |
+ fpvalue_norm_orig = n_fp2_orig_norm.vec,
|
|
2118 |
+
|
|
2119 |
+ TF_peak.fdr_orig = (n_fp2_orig_norm.vec) / (n_fp2_orig_norm.vec + n_tp2.vec),
|
|
2120 |
+ TF_peak.fdr = (n_fp2_norm.vec) / (n_fp2_norm.vec + n_tp2.vec),
|
|
2121 |
+ # TF_peak.fdr_orig = (n_fp2_orig_norm.vec + 1) / (n_fp2_orig_norm.vec + n_tp2.vec + 1),
|
|
2122 |
+ # TF_peak.fdr = (n_fp2_norm.vec +1) / (n_fp2_norm.vec + n_tp2.vec + 1),
|
|
2123 |
+ #
|
|
2124 |
+ TF_peak.fdr_direction = directionCur,
|
|
2125 |
+ TF_peak.r_bin =
|
|
2126 |
+ as.character(cut(TF_peak.r_bin2, breaks = stepsCur, right = rightOpen, include.lowest = TRUE))
|
2122 |
2127 |
)
|
2123 |
|
-
|
|
2128 |
+
|
2124 |
2129 |
# Derive connection summaries for all TF for both directions
|
2125 |
2130 |
# Get no. of connections per bin, here make sure to also include that have n = 0
|
2126 |
2131 |
|
... |
... |
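The bin-wise FDR assembled above rescales the background count by the ratio of background to foreground peak totals and then divides the rescaled background by itself plus the foreground count. A small worked example with made-up counts:

# Worked toy example of the bin-wise FDR used above (all counts invented)
n_tp = 400     # total foreground (motif-containing) peaks for this TF
n_fp = 8000    # total background peaks

n_tp2 = 50     # foreground peaks passing a given correlation threshold
n_fp2 = 200    # background peaks passing the same threshold

# Rescale the background count so it is comparable to the foreground size
n_fp2_norm = n_fp2 / (n_fp / n_tp)                 # 200 / 20 = 10

TF_peak.fdr = n_fp2_norm / (n_fp2_norm + n_tp2)    # 10 / 60 = ~0.17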
@@ -2159,10 +2164,10 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
2159 |
2164 |
dplyr::rename(n_tp = tpvalue, n_fp = fpvalue, n_fp_norm = fpvalue_norm) %>%
|
2160 |
2165 |
dplyr::distinct() %>%
|
2161 |
2166 |
dplyr::arrange(TF_peak.r_bin)
|
2162 |
|
-
|
2163 |
|
-
|
|
2167 |
+
|
|
2168 |
+
|
2164 |
2169 |
# Collect data for additional QC plots before they are filtered
|
2165 |
|
-
|
|
2170 |
+
|
2166 |
2171 |
# Filter now high FDR connections to save space and time
|
2167 |
2172 |
# DISCARD other rows altogether
|
2168 |
2173 |
# Left join here is what we want, as we need this df only for "real" data
|
... |
... |
@@ -2171,81 +2176,81 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
2171 |
2176 |
dplyr::select(TF.name, TF_peak.r_bin, TF_peak.r, TF_peak.fdr,
|
2172 |
2177 |
TF_peak.fdr_orig, peak.ID, TF_peak.fdr_direction,
|
2173 |
2178 |
TF_peak.connectionType, tidyselect::contains("value"))
|
2174 |
|
-
|
2175 |
|
-
|
|
2179 |
+
|
|
2180 |
+
|
2176 |
2181 |
if (!plotDetails) {
|
2177 |
|
- tblFilt.df = dplyr::select(tblFilt.df, -tidyselect::contains("value"))
|
|
2182 |
+ tblFilt.df = dplyr::select(tblFilt.df, -tidyselect::contains("value"))
|
2178 |
2183 |
}
|
2179 |
2184 |
|
2180 |
2185 |
connections_all.l[[indexStr]] = tblFilt.df
|
2181 |
2186 |
|
2182 |
2187 |
#.printExecutionTime(start, "Interval 4: ")
|
2183 |
2188 |
#start = Sys.time()
|
2184 |
|
-
|
|
2189 |
+
|
2185 |
2190 |
} # end for directionCur in c("pos", "neg")
|
2186 |
|
-
|
|
2191 |
+
|
2187 |
2192 |
} # end for each TF
|
2188 |
|
-
|
|
2193 |
+
|
2189 |
2194 |
.printExecutionTime(start2, prefix = " ")
|
2190 |
2195 |
|
2191 |
2196 |
} # end for each connectionType
|
2192 |
2197 |
|
2193 |
|
-
|
|
2198 |
+
|
2194 |
2199 |
.printExecutionTime(start)
|
2195 |
2200 |
list(main = data.table::rbindlist(connections_all.l),
|
2196 |
2201 |
connectionStats = data.table::rbindlist(connectionStats_all.l),
|
2197 |
2202 |
plots_GC = plots_GC.l
|
2198 |
|
- )
|
|
2203 |
+ )
|
2199 |
2204 |
}
|
2200 |
2205 |
|
2201 |
2206 |
.findMaxBackgroundSize <- function (GC_classes_foreground.df, GC_classes_background.df, peaksBackground, threshold_percentage = 0.05) {
|
|
2207 |
+
|
|
2208 |
+ # Iterate over different background sizes
|
|
2209 |
+ minPerc = 0
|
|
2210 |
+ for (percCur in c(seq(100,10,-5),5)) {
|
2202 |
2211 |
|
2203 |
|
- # Iterate over different background sizes
|
2204 |
|
- minPerc = 0
|
2205 |
|
- for (percCur in c(seq(100,10,-5),5)) {
|
2206 |
|
-
|
2207 |
|
- if (minPerc > 0) next
|
2208 |
|
- targetNoPeaks = percCur/100 * nrow(peaksBackground)
|
2209 |
|
-
|
2210 |
|
- #futile.logger::flog.info(paste0("Percentage of background: ", percCur))
|
2211 |
|
-
|
2212 |
|
- #Check for each GC bin in the foreground, starting with the most abundant, whether we have enough background peaks to sample from
|
2213 |
|
-
|
2214 |
|
- # threshold_percentage: From which percentage of GC bin frequency from the foreground should the mimicking fail?
|
2215 |
|
- #The motivation is that very small bins that have no weight in the foreground will not cause a failure of the mimicking
|
2216 |
|
-
|
2217 |
|
-
|
2218 |
|
- for (i in seq_len(nrow(GC_classes_foreground.df))) {
|
2219 |
|
-
|
2220 |
|
- n_rel = GC_classes_foreground.df$n_rel[i]
|
2221 |
|
- GC_class.cur = GC_classes_foreground.df$GC_class[i]
|
2222 |
|
-
|
2223 |
|
- requiredNoPeaks = round(n_rel * targetNoPeaks, 0)
|
2224 |
|
- # Check in background
|
2225 |
|
- availableNoPeaks = GC_classes_background.df %>%
|
2226 |
|
- dplyr::filter(GC_class == GC_class.cur) %>%
|
2227 |
|
- dplyr::pull(.data$n)
|
2228 |
|
- #futile.logger::flog.info(paste0(" GC.class ", GC.class.cur, ": Required: ", requiredNoPeaks, ", available: ", availableNoPeaks))
|
2229 |
|
- if ( availableNoPeaks < requiredNoPeaks) {
|
2230 |
|
- #futile.logger::flog.info(paste0(" Not enough"))
|
2231 |
|
- }
|
2232 |
|
- if (availableNoPeaks < requiredNoPeaks & n_rel > threshold_percentage) {
|
2233 |
|
- #futile.logger::flog.info(paste0(" Mimicking distribution FAILED (GC class ", GC.class.cur, " could not be mimicked"))
|
2234 |
|
- break
|
2235 |
|
- }
|
2236 |
|
-
|
2237 |
|
- if (i == nrow(GC_classes_foreground.df)) {
|
2238 |
|
- minPerc = percCur
|
2239 |
|
- #futile.logger::flog.info(paste0("Found max. percentage of background that is able to mimick the foreground: ", percCur))
|
2240 |
|
-
|
2241 |
|
- }
|
2242 |
|
-
|
2243 |
|
- } # end of for (i in 1:nrow(GC_classes_foreground.df)) {
|
2244 |
|
-
|
|
2212 |
+ if (minPerc > 0) next
|
|
2213 |
+ targetNoPeaks = percCur/100 * nrow(peaksBackground)
|
|
2214 |
+
|
|
2215 |
+ #futile.logger::flog.info(paste0("Percentage of background: ", percCur))
|
|
2216 |
+
|
|
2217 |
+ #Check for each GC bin in the foreground, starting with the most abundant, whether we have enough background peaks to sample from
|
|
2218 |
+
|
|
2219 |
+ # threshold_percentage: From which percentage of GC bin frequency from the foreground should the mimicking fail?
|
|
2220 |
+ #The motivation is that very small bins that have no weight in the foreground will not cause a failure of the mimicking
|
|
2221 |
+
|
|
2222 |
+
|
|
2223 |
+ for (i in seq_len(nrow(GC_classes_foreground.df))) {
|
|
2224 |
+
|
|
2225 |
+ n_rel = GC_classes_foreground.df$n_rel[i]
|
|
2226 |
+ GC_class.cur = GC_classes_foreground.df$GC_class[i]
|
|
2227 |
+
|
|
2228 |
+ requiredNoPeaks = round(n_rel * targetNoPeaks, 0)
|
|
2229 |
+ # Check in background
|
|
2230 |
+ availableNoPeaks = GC_classes_background.df %>%
|
|
2231 |
+ dplyr::filter(GC_class == GC_class.cur) %>%
|
|
2232 |
+ dplyr::pull(.data$n)
|
|
2233 |
+ #futile.logger::flog.info(paste0(" GC.class ", GC.class.cur, ": Required: ", requiredNoPeaks, ", available: ", availableNoPeaks))
|
|
2234 |
+ if ( availableNoPeaks < requiredNoPeaks) {
|
|
2235 |
+ #futile.logger::flog.info(paste0(" Not enough"))
|
|
2236 |
+ }
|
|
2237 |
+ if (availableNoPeaks < requiredNoPeaks & n_rel > threshold_percentage) {
|
|
2238 |
+ #futile.logger::flog.info(paste0(" Mimicking distribution FAILED (GC class ", GC.class.cur, " could not be mimicked"))
|
|
2239 |
+ break
|
|
2240 |
+ }
|
|
2241 |
+
|
|
2242 |
+ if (i == nrow(GC_classes_foreground.df)) {
|
|
2243 |
+ minPerc = percCur
|
|
2244 |
+ #futile.logger::flog.info(paste0("Found max. percentage of background that is able to mimick the foreground: ", percCur))
|
2245 |
2245 |
|
2246 |
|
- } # end for all percentages
|
|
2246 |
+ }
|
|
2247 |
+
|
|
2248 |
+ } # end of for (i in 1:nrow(GC_classes_foreground.df)) {
|
2247 |
2249 |
|
2248 |
|
- minPerc
|
|
2250 |
+
|
|
2251 |
+ } # end for all percentages
|
|
2252 |
+
|
|
2253 |
+ minPerc
|
2249 |
2254 |
}
|
2250 |
2255 |
|
2251 |
2256 |
|
... |
... |
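The helper above searches from large to small for the biggest background fraction whose GC-class composition can still mimic the foreground, while ignoring classes that carry negligible weight in the foreground. A simplified sketch of that check on invented GC-class tables:

# Simplified sketch of the search in .findMaxBackgroundSize (invented GC-class tables)
gc_fg = data.frame(GC_class = c("low", "mid", "high"),
                   n_rel    = c(0.10, 0.60, 0.30))   # foreground composition
gc_bg = data.frame(GC_class = c("low", "mid", "high"),
                   n        = c(900, 2000, 300))     # available background peaks per class
nPeaksBackground     = sum(gc_bg$n)
threshold_percentage = 0.05   # tiny foreground classes may fail without aborting the match

minPerc = 0
for (percCur in c(seq(100, 10, -5), 5)) {
  if (minPerc > 0) next
  targetNoPeaks = percCur / 100 * nPeaksBackground
  required  = round(gc_fg$n_rel * targetNoPeaks, 0)
  available = gc_bg$n[match(gc_fg$GC_class, gc_bg$GC_class)]
  # The match only fails if a non-negligible foreground class cannot be covered
  if (all(available >= required | gc_fg$n_rel <= threshold_percentage)) {
    minPerc = percCur
  }
}
minPerc   # largest background percentage that still mimics the foreground (30 for these numbers)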
@@ -2271,7 +2276,7 @@ addConnections_TF_peak <- function (GRN, plotDiagnosticPlots = TRUE, plotDetails
|
2271 |
2276 |
#' @examples
|
2272 |
2277 |
#' # See the Workflow vignette on the GRaNIE website for examples
|
2273 |
2278 |
#' GRN = loadExampleObject()
|
2274 |
|
-#' GRN = addConnections_peak_gene(GRN, promoterRange = 10000, nCores = 2, forceRerun = FALSE)
|
|
2279 |
+#' GRN = addConnections_peak_gene(GRN, promoterRange = 10000, outputFolder = ".")
|
2275 |
2280 |
addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "pearson",
|
2276 |
2281 |
promoterRange = 250000, TADs = NULL,
|
2277 |
2282 |
nCores = 4,
|
... |
... |
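As with the TF-peak step, the peak-gene function from the hunk above can be run on the example object. A minimal sketch following the roxygen example above, with the other arguments left at the defaults visible in the signature (values are illustrative only):

# Usage sketch for addConnections_peak_gene(), following the roxygen example above
library(GRaNIE)
GRN = loadExampleObject()
GRN = addConnections_peak_gene(GRN,
                               overlapTypeGene = "TSS",   # overlap peaks against gene TSS
                               corMethod = "pearson",
                               promoterRange = 10000,     # neighborhood size, as in the example above
                               nCores = 2,
                               outputFolder = ".")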
@@ -2282,7 +2287,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2282 |
2287 |
forceRerun = FALSE) {
|
2283 |
2288 |
|
2284 |
2289 |
GRN = .addFunctionLogToObject(GRN)
|
2285 |
|
-
|
|
2290 |
+
|
2286 |
2291 |
checkmate::assertClass(GRN, "GRN")
|
2287 |
2292 |
checkmate::assertChoice(overlapTypeGene, c("TSS", "full"))
|
2288 |
2293 |
checkmate::assertChoice(corMethod, c("pearson", "spearman"))
|
... |
... |
@@ -2297,58 +2302,58 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2297 |
2302 |
|
2298 |
2303 |
# As this is independent of the underlying GRN, it has to be done only once
|
2299 |
2304 |
|
2300 |
|
- if (is.null(GRN@connections$peak_genes[["0"]]) | forceRerun) {
|
|
2305 |
+ if (is.null(GRN@connections$peak_genes[["0"]]) | forceRerun) {
|
|
2306 |
+
|
|
2307 |
+ GRN@config$parameters$promoterRange = promoterRange
|
|
2308 |
+ GRN@config$parameters$corMethod_peak_gene = corMethod
|
|
2309 |
+
|
|
2310 |
+
|
|
2311 |
+ GRN@connections$peak_genes = list()
|
|
2312 |
+
|
|
2313 |
+
|
|
2314 |
+ # Check which gene types are available for the particular genome annotation
|
|
2315 |
+ # Use all of them to collect statistics. Filtering can be done later
|
|
2316 |
+ gene.types = unique(GRN@annotation$genes$gene.type)
|
|
2317 |
+
|
|
2318 |
+
|
|
2319 |
+ for (permutationCur in 0:.getMaxPermutation(GRN)) {
|
2301 |
2320 |
|
2302 |
|
- GRN@config$parameters$promoterRange = promoterRange
|
2303 |
|
- GRN@config$parameters$corMethod_peak_gene = corMethod
|
|
2321 |
+ futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
|
2304 |
2322 |
|
2305 |
|
-
|
2306 |
|
- GRN@connections$peak_genes = list()
|
2307 |
|
-
|
|
2323 |
+ if (permutationCur == 0) {
|
|
2324 |
+ futile.logger::flog.info(paste0("Calculate peak-gene correlations for neighborhood size ", promoterRange))
|
|
2325 |
+ randomizePeakGeneConnections = FALSE
|
|
2326 |
+ } else {
|
|
2327 |
+ futile.logger::flog.info(paste0("Calculate random peak-gene correlations for neighborhood size ", promoterRange))
|
|
2328 |
+ randomizePeakGeneConnections = TRUE
|
|
2329 |
+ }
|
2308 |
2330 |
|
2309 |
|
- # Check which gene types are available for the particular genome annotation
|
2310 |
|
- # Use all of them to collect statistics. Filtering can be done later
|
2311 |
|
- gene.types = unique(GRN@annotation$genes$gene.type)
|
|
2331 |
+ GRN@connections$peak_genes[[as.character(permutationCur)]] =
|
|
2332 |
+ .calculatePeakGeneCorrelations(GRN, permutationCur,
|
|
2333 |
+ TADs = TADs,
|
|
2334 |
+ neighborhoodSize = promoterRange,
|
|
2335 |
+ gene.types = names(gene.types),
|
|
2336 |
+ corMethod = corMethod,
|
|
2337 |
+ randomizePeakGeneConnections = randomizePeakGeneConnections,
|
|
2338 |
+ overlapTypeGene,
|
|
2339 |
+ nCores = nCores,
|
|
2340 |
+ debugMode_nPlots = 0,
|
|
2341 |
+ addRobustRegression = addRobustRegression
|
|
2342 |
+ )
|
2312 |
2343 |
|
2313 |
|
-
|
2314 |
|
- for (permutationCur in 0:.getMaxPermutation(GRN)) {
|
2315 |
|
-
|
2316 |
|
- futile.logger::flog.info(paste0("\n", .getPermStr(permutationCur), "\n"))
|
2317 |
|
-
|
2318 |
|
- if (permutationCur == 0) {
|
2319 |
|
- futile.logger::flog.info(paste0("Calculate peak-gene correlations for neighborhood size ", promoterRange))
|
2320 |
|
- randomizePeakGeneConnections = FALSE
|
2321 |
|
- } else {
|
2322 |
|
- futile.logger::flog.info(paste0("Calculate random peak-gene correlations for neighborhood size ", promoterRange))
|
2323 |
|
- randomizePeakGeneConnections = TRUE
|
2324 |
|
- }
|
2325 |
|
-
|
2326 |
|
- GRN@connections$peak_genes[[as.character(permutationCur)]] =
|
2327 |
|
- .calculatePeakGeneCorrelations(GRN, permutationCur,
|
2328 |
|
- TADs = TADs,
|
2329 |
|
- neighborhoodSize = promoterRange,
|
2330 |
|
- gene.types = names(gene.types),
|
2331 |
|
- corMethod = corMethod,
|
2332 |
|
- randomizePeakGeneConnections = randomizePeakGeneConnections,
|
2333 |
|
- overlapTypeGene,
|
2334 |
|
- nCores = nCores,
|
2335 |
|
- debugMode_nPlots = 0,
|
2336 |
|
- addRobustRegression = addRobustRegression
|
2337 |
|
- )
|
2338 |
|
-
|
2339 |
|
- }
|
2340 |
|
-
|
2341 |
|
- }
|
|
2344 |
+ }
|
|
2345 |
+
|
|
2346 |
+ }
|
2342 |
2347 |
|
2343 |
2348 |
if (plotDiagnosticPlots) {
|
2344 |
2349 |
|
2345 |
2350 |
plotDiagnosticPlots_peakGene(GRN, outputFolder, gene.types = plotGeneTypes, useFiltered = FALSE, forceRerun= forceRerun)
|
2346 |
|
-
|
|
2351 |
+
|
2347 |
2352 |
}
|
2348 |
2353 |
|
2349 |
2354 |
GRN
|
2350 |
|
-
|
2351 |
|
-
|
|
2355 |
+
|
|
2356 |
+
|
2352 |
2357 |
|
2353 |
2358 |
}
|
2354 |
2359 |
|
... |
... |
@@ -2497,7 +2502,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2497 |
2502 |
nCores = 1
|
2498 |
2503 |
}
|
2499 |
2504 |
|
2500 |
|
-
|
|
2505 |
+
|
2501 |
2506 |
#consensusPeaks = .createDF_fromCoordinates(getCounts(GRN, type = "peaks", norm = TRUE, 0)$peakID, "peakID")
|
2502 |
2507 |
|
2503 |
2508 |
consensusPeaks = GRN@data$peaks$consensusPeaks %>% dplyr::filter(!isFiltered)
|
... |
... |
@@ -2505,7 +2510,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2505 |
2510 |
if (!is.null(TADs)) {
|
2506 |
2511 |
|
2507 |
2512 |
futile.logger::flog.info(paste0("Integrate Hi-C data and overlap peaks and HI-C domains"))
|
2508 |
|
-
|
|
2513 |
+
|
2509 |
2514 |
# Check format
|
2510 |
2515 |
checkmate::assertCharacter(TADs$X1)
|
2511 |
2516 |
checkmate::assertIntegerish(TADs$X2, lower = 1)
|
... |
... |
@@ -2518,7 +2523,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2518 |
2523 |
} else {
|
2519 |
2524 |
colnames(TADs)[4] = "ID"
|
2520 |
2525 |
}
|
2521 |
|
-
|
|
2526 |
+
|
2522 |
2527 |
|
2523 |
2528 |
# Construct GRanges
|
2524 |
2529 |
query = .constructGRanges(consensusPeaks, seqlengths = .getChrLengths(genomeAssembly), genomeAssembly)
|
... |
... |
@@ -2549,10 +2554,10 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2549 |
2554 |
|
2550 |
2555 |
# Finally, do the actual overlaps
|
2551 |
2556 |
overlapsAll = GenomicRanges::findOverlaps(query, subject,
|
2552 |
|
- minoverlap = 1,
|
2553 |
|
- type = "any",
|
2554 |
|
- select = "all",
|
2555 |
|
- ignore.strand = TRUE)
|
|
2557 |
+ minoverlap = 1,
|
|
2558 |
+ type = "any",
|
|
2559 |
+ select = "all",
|
|
2560 |
+ ignore.strand = TRUE)
|
2556 |
2561 |
|
2557 |
2562 |
|
2558 |
2563 |
query_row_ids = S4Vectors::queryHits(overlapsAll)
|
... |
... |
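The overlap call above is a plain GenomicRanges::findOverlaps between peak and gene ranges, followed by reading out the hit indices. A self-contained toy sketch of the same call (coordinates invented):

# Toy sketch of the overlap step above (invented coordinates)
library(GenomicRanges)

query   = GRanges("chr1", IRanges(start = c(100, 500, 900), width = 200))   # e.g. consensus peaks
subject = GRanges("chr1", IRanges(start = c(150, 2000),     width = 300))   # e.g. gene windows

overlapsAll = findOverlaps(query, subject,
                           minoverlap = 1, type = "any",
                           select = "all", ignore.strand = TRUE)

queryHits(overlapsAll)     # indices of peaks that overlap a gene window
subjectHits(overlapsAll)   # matching indices in the gene windows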
@@ -2613,7 +2618,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2613 |
2618 |
}
|
2614 |
2619 |
|
2615 |
2620 |
# ITERATE THROUGH ALL PEAK_GENE PAIRS
|
2616 |
|
-
|
|
2621 |
+
|
2617 |
2622 |
permIndex = as.character(perm)
|
2618 |
2623 |
|
2619 |
2624 |
countsPeaks.clean = dplyr::select(getCounts(GRN, type = "peaks", norm = TRUE, permuted = FALSE), -peakID)
|
... |
... |
@@ -2641,7 +2646,7 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2641 |
2646 |
nCores = 1
|
2642 |
2647 |
}
|
2643 |
2648 |
|
2644 |
|
-
|
|
2649 |
+
|
2645 |
2650 |
res.l = .execInParallelGen(nCores, returnAsList = TRUE, listNames = NULL, iteration = 0:startIndexMax, verbose = FALSE, functionName = .correlateData,
|
2646 |
2651 |
chunksize = chunksize, maxRow = maxRow,
|
2647 |
2652 |
counts1 = countsPeaks.clean, counts2 = countsRNA.clean, map1 = map_peaks, map2 = map_rna,
|
... |
... |
@@ -2671,20 +2676,20 @@ addConnections_peak_gene <- function(GRN, overlapTypeGene = "TSS", corMethod = "
|
2671 |
2676 |
dplyr::left_join(overlaps.sub.filt.df, by = c("gene.ENSEMBL", "peak.ID")) %>%
|
2672 |
2677 |
# Integrate TAD IDs also
|
2673 |
2678 |
dplyr::left_join(dplyr::select(peak.TADs.df, peak.ID, tad.ID), by = c("peak.ID")) %>%
|
2674 |
|
-
|
|
2679 |
+
|
2675 |
2680 |
dplyr::select(tidyselect::all_of(selectColumns))) %>%
|
2676 |
2681 |
dplyr::mutate(peak.ID = as.factor(peak.ID),
|
2677 |
2682 |
gene.ENSEMBL = as.factor(gene.ENSEMBL),
|
2678 |
2683 |
tad.ID = as.factor(tad.ID)) %>%
|
2679 |
2684 |
dplyr::rename(peak_gene.r = r,
|
2680 |
2685 |
peak_gene.p_raw = p.raw)
|
2681 |
|
-
|
|
2686 |
+
|
2682 |
2687 |
if (addRobustRegression) {
|
2683 |