... | ... |
@@ -7,7 +7,7 @@ Author: Aaron Taudt, Bjorn Bakker, David Porubsky |
7 | 7 |
Maintainer: Aaron Taudt <aaron.taudt@gmail.com> |
8 | 8 |
Description: This package implements functions for CNV calling, plotting, export and analysis from whole-genome single cell sequencing data. |
9 | 9 |
Depends: R (>= 3.1.0), GenomicRanges, cowplot |
10 |
-Imports: foreach, doParallel, BiocGenerics, IRanges, Rsamtools, Biostrings, GenomicAlignments, preseqR, reshape2, ggdendro, ReorderCluster, mclust |
|
10 |
+Imports: foreach, doParallel, GenomeInfoDb, BiocGenerics, IRanges, Rsamtools, Biostrings, GenomicAlignments, preseqR, reshape2, ggdendro, ReorderCluster, mclust |
|
11 | 11 |
Suggests: knitr, testthat |
12 | 12 |
License: file LICENSE |
13 | 13 |
LazyLoad: yes |
... | ... |
@@ -6,7 +6,7 @@ S3method(plot,aneuHMM) |
6 | 6 |
S3method(plot,character) |
7 | 7 |
export(Aneufinder) |
8 | 8 |
export(bam2binned) |
9 |
-export(bed2GRanges) |
|
9 |
+export(bed2binned) |
|
10 | 10 |
export(clusterByQuality) |
11 | 11 |
export(concGRangesInRange) |
12 | 12 |
export(correctGC) |
... | ... |
@@ -22,13 +22,16 @@ export(getSCEcoordinates) |
22 | 22 |
export(heatmapAneuploidies) |
23 | 23 |
export(heatmapGenomewide) |
24 | 24 |
export(hotspotter) |
25 |
+export(importBed) |
|
25 | 26 |
export(karyotypeMeasures) |
26 | 27 |
export(loadGRangesFromFiles) |
27 | 28 |
export(loadHmmsFromFiles) |
29 |
+export(simulateReads) |
|
28 | 30 |
export(smoothBinning) |
29 | 31 |
export(stateColors) |
30 | 32 |
export(strandColors) |
31 | 33 |
export(subsetByCNVprofile) |
34 |
+export(variableWidthBins) |
|
32 | 35 |
import(GenomicRanges) |
33 | 36 |
import(IRanges) |
34 | 37 |
import(cowplot) |
... | ... |
@@ -38,8 +41,12 @@ import(ggdendro) |
38 | 41 |
import(preseqR) |
39 | 42 |
import(reshape2) |
40 | 43 |
importFrom(BiocGenerics,as.vector) |
44 |
+importFrom(Biostrings,BString) |
|
45 |
+importFrom(Biostrings,BStringSet) |
|
46 |
+importFrom(Biostrings,DNAStringSet) |
|
41 | 47 |
importFrom(Biostrings,Views) |
42 | 48 |
importFrom(Biostrings,alphabetFrequency) |
49 |
+importFrom(GenomeInfoDb,fetchExtendedChromInfoFromUCSC) |
|
43 | 50 |
importFrom(GenomicAlignments,first) |
44 | 51 |
importFrom(GenomicAlignments,readGAlignmentPairsFromBam) |
45 | 52 |
importFrom(GenomicAlignments,readGAlignmentsFromBam) |
... | ... |
@@ -113,7 +113,7 @@ doParallel::registerDoParallel(cl) |
113 | 113 |
if (!file.exists(binpath.uncorrected)) { dir.create(binpath.uncorrected) } |
114 | 114 |
files <- list.files(inputfolder, full.names=TRUE, recursive=TRUE, pattern=paste0('.',conf[['format']],'$')) |
115 | 115 |
### Binning ### |
116 |
-message("Binning the data ...", appendLF=FALSE); ptm <- proc.time() |
|
116 |
+ptm <- startTimedMessage("Binning the data ...") |
|
117 | 117 |
temp <- foreach (file = files, .packages=c('aneufinder')) %dopar% { |
118 | 118 |
existing.binfiles <- grep(basename(file), list.files(binpath.uncorrected), value=TRUE) |
119 | 119 |
existing.binsizes <- as.numeric(unlist(lapply(strsplit(existing.binfiles, split='binsize_|_reads.per.bin_|_\\.RData'), '[[', 2))) |
... | ... |
@@ -143,10 +143,10 @@ temp <- foreach (file = files, .packages=c('aneufinder')) %dopar% { |
143 | 143 |
}) |
144 | 144 |
} |
145 | 145 |
} |
146 |
-time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
146 |
+stopTimedMessage(ptm) |
|
147 | 147 |
|
148 | 148 |
### Export read fragments as browser file ### |
149 |
-message("Exporting data as browser files ...", appendLF=FALSE); ptm <- proc.time() |
|
149 |
+ptm <- startTimedMessage("Exporting data as browser files ...") |
|
150 | 150 |
if (!file.exists(readsbrowserpath)) { dir.create(readsbrowserpath) } |
151 | 151 |
readfiles <- list.files(readspath,pattern='.RData$',full.names=TRUE) |
152 | 152 |
temp <- foreach (file = readfiles, .packages=c('aneufinder')) %dopar% { |
... | ... |
@@ -160,14 +160,14 @@ temp <- foreach (file = readfiles, .packages=c('aneufinder')) %dopar% { |
160 | 160 |
}) |
161 | 161 |
} |
162 | 162 |
} |
163 |
-time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
163 |
+stopTimedMessage(ptm) |
|
164 | 164 |
|
165 | 165 |
#================= |
166 | 166 |
### Correction ### |
167 | 167 |
#================= |
168 | 168 |
if (!is.null(conf[['correction.method']])) { |
169 | 169 |
|
170 |
- message(paste0(conf[['correction.method']]," correction ..."), appendLF=FALSE); ptm <- proc.time() |
|
170 |
+ ptm <- startTimedMessage(paste0(conf[['correction.method']]," correction ...")) |
|
171 | 171 |
if (conf[['correction.method']]=='GC') { |
172 | 172 |
binpath.corrected <- file.path(outputfolder,'binned_GC-corrected') |
173 | 173 |
if (!file.exists(binpath.corrected)) { dir.create(binpath.corrected) } |
... | ... |
@@ -202,7 +202,7 @@ if (!is.null(conf[['correction.method']])) { |
202 | 202 |
} |
203 | 203 |
binpath <- binpath.corrected |
204 | 204 |
} |
205 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
205 |
+ stopTimedMessage(ptm) |
|
206 | 206 |
|
207 | 207 |
} else { |
208 | 208 |
binpath <- binpath.uncorrected |
... | ... |
@@ -213,7 +213,7 @@ if (!is.null(conf[['correction.method']])) { |
213 | 213 |
#=============== |
214 | 214 |
if ('univariate' %in% conf[['method']]) { |
215 | 215 |
|
216 |
- message("Running univariate HMMs ...", appendLF=FALSE); ptm <- proc.time() |
|
216 |
+ ptm <- startTimedMessage("Running univariate HMMs ...") |
|
217 | 217 |
if (!file.exists(CNVpath)) { dir.create(CNVpath) } |
218 | 218 |
|
219 | 219 |
files <- list.files(binpath, full.names=TRUE, recursive=TRUE, pattern='.RData$') |
... | ... |
@@ -228,7 +228,7 @@ if ('univariate' %in% conf[['method']]) { |
228 | 228 |
stop(file,'\n',err) |
229 | 229 |
}) |
230 | 230 |
} |
231 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
231 |
+ stopTimedMessage(ptm) |
|
232 | 232 |
|
233 | 233 |
#=================== |
234 | 234 |
### Plotting CNV ### |
... | ... |
@@ -241,7 +241,7 @@ if ('univariate' %in% conf[['method']]) { |
241 | 241 |
#----------------------- |
242 | 242 |
## Plot distributions ## |
243 | 243 |
#----------------------- |
244 |
- message("Plotting distributions ...", appendLF=FALSE); ptm <- proc.time() |
|
244 |
+ ptm <- startTimedMessage("Plotting distributions ...") |
|
245 | 245 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
246 | 246 |
savename <- file.path(CNVplotpath,paste0('distributions_',sub('_$','',pattern),'.pdf')) |
247 | 247 |
if (!file.exists(savename)) { |
... | ... |
@@ -259,11 +259,11 @@ if ('univariate' %in% conf[['method']]) { |
259 | 259 |
d <- dev.off() |
260 | 260 |
} |
261 | 261 |
} |
262 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
262 |
+ stopTimedMessage(ptm) |
|
263 | 263 |
#------------------ |
264 | 264 |
## Plot heatmaps ## |
265 | 265 |
#------------------ |
266 |
- message("Plotting genomewide heatmaps ...", appendLF=FALSE); ptm <- proc.time() |
|
266 |
+ ptm <- startTimedMessage("Plotting genomewide heatmaps ...") |
|
267 | 267 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
268 | 268 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
269 | 269 |
ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE) |
... | ... |
@@ -276,8 +276,8 @@ if ('univariate' %in% conf[['method']]) { |
276 | 276 |
warning("Plotting genomewide heatmaps: No files for pattern ",pattern," found.") |
277 | 277 |
} |
278 | 278 |
} |
279 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
280 |
- message("Plotting chromosome heatmaps ...", appendLF=FALSE); ptm <- proc.time() |
|
279 |
+ stopTimedMessage(ptm) |
|
280 |
+ ptm <- startTimedMessage("Plotting chromosome heatmaps ...") |
|
281 | 281 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
282 | 282 |
ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE) |
283 | 283 |
ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE) |
... | ... |
@@ -293,11 +293,11 @@ if ('univariate' %in% conf[['method']]) { |
293 | 293 |
warning("Plotting chromosome heatmaps: No files for pattern ",pattern," found.") |
294 | 294 |
} |
295 | 295 |
} |
296 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
296 |
+ stopTimedMessage(ptm) |
|
297 | 297 |
#------------------ |
298 | 298 |
## Plot arrayCGH ## |
299 | 299 |
#------------------ |
300 |
- message("Making arrayCGH plots ...", appendLF=FALSE); ptm <- proc.time() |
|
300 |
+ ptm <- startTimedMessage("Making arrayCGH plots ...") |
|
301 | 301 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
302 | 302 |
savename <- file.path(CNVplotpath,paste0('arrayCGH_',sub('_$','',pattern),'.pdf')) |
303 | 303 |
if (!file.exists(savename)) { |
... | ... |
@@ -315,11 +315,11 @@ if ('univariate' %in% conf[['method']]) { |
315 | 315 |
d <- dev.off() |
316 | 316 |
} |
317 | 317 |
} |
318 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
318 |
+ stopTimedMessage(ptm) |
|
319 | 319 |
#-------------------- |
320 | 320 |
## Plot karyograms ## |
321 | 321 |
#-------------------- |
322 |
- message("Plotting karyograms ...", appendLF=FALSE); ptm <- proc.time() |
|
322 |
+ ptm <- startTimedMessage("Plotting karyograms ...") |
|
323 | 323 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
324 | 324 |
savename <- file.path(CNVplotpath,paste0('karyograms_',sub('_$','',pattern),'.pdf')) |
325 | 325 |
if (!file.exists(savename)) { |
... | ... |
@@ -337,11 +337,11 @@ if ('univariate' %in% conf[['method']]) { |
337 | 337 |
d <- dev.off() |
338 | 338 |
} |
339 | 339 |
} |
340 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
340 |
+ stopTimedMessage(ptm) |
|
341 | 341 |
#------------------------- |
342 | 342 |
## Export browser files ## |
343 | 343 |
#------------------------- |
344 |
- message("Exporting browser files ...", appendLF=FALSE); ptm <- proc.time() |
|
344 |
+ ptm <- startTimedMessage("Exporting browser files ...") |
|
345 | 345 |
if (!file.exists(CNVbrowserpath)) { dir.create(CNVbrowserpath) } |
346 | 346 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
347 | 347 |
savename <- file.path(CNVbrowserpath,sub('_$','',pattern)) |
... | ... |
@@ -351,7 +351,7 @@ if ('univariate' %in% conf[['method']]) { |
351 | 351 |
exportCNVs(ifiles, filename=savename, cluster=conf[['cluster.plots']], export.CNV=TRUE, export.SCE=FALSE) |
352 | 352 |
} |
353 | 353 |
} |
354 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
354 |
+ stopTimedMessage(ptm) |
|
355 | 355 |
} |
356 | 356 |
|
357 | 357 |
#=============== |
... | ... |
@@ -359,7 +359,7 @@ if ('univariate' %in% conf[['method']]) { |
359 | 359 |
#=============== |
360 | 360 |
if ('bivariate' %in% conf[['method']]) { |
361 | 361 |
|
362 |
- message("Running bivariate HMMs ...", appendLF=FALSE); ptm <- proc.time() |
|
362 |
+ ptm <- startTimedMessage("Running bivariate HMMs ...") |
|
363 | 363 |
if (!file.exists(SCEpath)) { dir.create(SCEpath) } |
364 | 364 |
|
365 | 365 |
files <- list.files(binpath, full.names=TRUE, recursive=TRUE, pattern='.RData$') |
... | ... |
@@ -371,24 +371,24 @@ if ('bivariate' %in% conf[['method']]) { |
371 | 371 |
## Add SCE coordinates to model |
372 | 372 |
reads.file <- file.path(readspath, paste0(model$ID,'.RData')) |
373 | 373 |
model$sce <- suppressMessages( getSCEcoordinates(model, resolution=conf[['resolution']], min.segwidth=conf[['min.segwidth']], fragments=reads.file, min.reads=conf[['min.reads']]) ) |
374 |
- message("Saving to file ",savename," ...", appendLF=FALSE); ptm <- proc.time() |
|
374 |
+ ptm <- startTimedMessage("Saving to file ",savename," ...") |
|
375 | 375 |
save(model, file=savename) |
376 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
376 |
+ stopTimedMessage(ptm) |
|
377 | 377 |
# } else { |
378 | 378 |
# model <- get(load(savename)) |
379 | 379 |
# model$sce <- suppressMessages( getSCEcoordinates(model, resolution=conf[['resolution']], min.segwidth=conf[['min.segwidth']]) ) |
380 |
-# message("Saving to file ",savename," ...", appendLF=FALSE); ptm <- proc.time() |
|
380 |
+# ptm <- startTimedMessage("Saving to file ",savename," ...") |
|
381 | 381 |
# save(model, file=savename) |
382 |
-# time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
382 |
+# stopTimedMessage(ptm) |
|
383 | 383 |
} |
384 | 384 |
}, error = function(err) { |
385 | 385 |
stop(file,'\n',err) |
386 | 386 |
}) |
387 | 387 |
} |
388 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
388 |
+ stopTimedMessage(ptm) |
|
389 | 389 |
|
390 | 390 |
### Finding hotspots ### |
391 |
- message("Finding SCE hotspots ...", appendLF=FALSE); ptm <- proc.time() |
|
391 |
+ ptm <- startTimedMessage("Finding SCE hotspots ...") |
|
392 | 392 |
hotspots <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
393 | 393 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
394 | 394 |
ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE) |
... | ... |
@@ -401,7 +401,7 @@ if ('bivariate' %in% conf[['method']]) { |
401 | 401 |
return(hotspot) |
402 | 402 |
} |
403 | 403 |
names(hotspots) <- patterns |
404 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
404 |
+ stopTimedMessage(ptm) |
|
405 | 405 |
|
406 | 406 |
#=================== |
407 | 407 |
### Plotting SCE ### |
... | ... |
@@ -414,7 +414,7 @@ if ('bivariate' %in% conf[['method']]) { |
414 | 414 |
#----------------------- |
415 | 415 |
## Plot distributions ## |
416 | 416 |
#----------------------- |
417 |
- message("Plotting distributions ...", appendLF=FALSE); ptm <- proc.time() |
|
417 |
+ ptm <- startTimedMessage("Plotting distributions ...") |
|
418 | 418 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
419 | 419 |
savename <- file.path(SCEplotpath,paste0('distributions_',sub('_$','',pattern),'.pdf')) |
420 | 420 |
if (!file.exists(savename)) { |
... | ... |
@@ -432,11 +432,11 @@ if ('bivariate' %in% conf[['method']]) { |
432 | 432 |
d <- dev.off() |
433 | 433 |
} |
434 | 434 |
} |
435 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
435 |
+ stopTimedMessage(ptm) |
|
436 | 436 |
#------------------ |
437 | 437 |
## Plot heatmaps ## |
438 | 438 |
#------------------ |
439 |
- message("Plotting heatmaps ...", appendLF=FALSE); ptm <- proc.time() |
|
439 |
+ ptm <- startTimedMessage("Plotting heatmaps ...") |
|
440 | 440 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
441 | 441 |
ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE) |
442 | 442 |
ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE) |
... | ... |
@@ -464,11 +464,11 @@ if ('bivariate' %in% conf[['method']]) { |
464 | 464 |
warning("Plotting chromosome heatmaps: No files for pattern ",pattern," found.") |
465 | 465 |
} |
466 | 466 |
} |
467 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
467 |
+ stopTimedMessage(ptm) |
|
468 | 468 |
#------------------ |
469 | 469 |
## Plot arrayCGH ## |
470 | 470 |
#------------------ |
471 |
- message("Making arrayCGH plots ...", appendLF=FALSE); ptm <- proc.time() |
|
471 |
+ ptm <- startTimedMessage("Making arrayCGH plots ...") |
|
472 | 472 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
473 | 473 |
savename <- file.path(SCEplotpath,paste0('arrayCGH_',sub('_$','',pattern),'.pdf')) |
474 | 474 |
if (!file.exists(savename)) { |
... | ... |
@@ -486,11 +486,11 @@ if ('bivariate' %in% conf[['method']]) { |
486 | 486 |
d <- dev.off() |
487 | 487 |
} |
488 | 488 |
} |
489 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
489 |
+ stopTimedMessage(ptm) |
|
490 | 490 |
#-------------------- |
491 | 491 |
## Plot karyograms ## |
492 | 492 |
#-------------------- |
493 |
- message("Plotting karyograms ...", appendLF=FALSE); ptm <- proc.time() |
|
493 |
+ ptm <- startTimedMessage("Plotting karyograms ...") |
|
494 | 494 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
495 | 495 |
savename <- file.path(SCEplotpath,paste0('karyograms_',sub('_$','',pattern),'.pdf')) |
496 | 496 |
if (!file.exists(savename)) { |
... | ... |
@@ -508,11 +508,11 @@ if ('bivariate' %in% conf[['method']]) { |
508 | 508 |
d <- dev.off() |
509 | 509 |
} |
510 | 510 |
} |
511 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
511 |
+ stopTimedMessage(ptm) |
|
512 | 512 |
#------------------------- |
513 | 513 |
## Export browser files ## |
514 | 514 |
#------------------------- |
515 |
- message("Exporting browser files ...", appendLF=FALSE); ptm <- proc.time() |
|
515 |
+ ptm <- startTimedMessage("Exporting browser files ...") |
|
516 | 516 |
if (!file.exists(SCEbrowserpath)) { dir.create(SCEbrowserpath) } |
517 | 517 |
temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% { |
518 | 518 |
savename <- file.path(SCEbrowserpath,sub('_$','',pattern)) |
... | ... |
@@ -526,7 +526,7 @@ if ('bivariate' %in% conf[['method']]) { |
526 | 526 |
exportGRanges(hotspots[[pattern]], filename=savename, trackname=basename(savename), score=hotspots[[pattern]]$num.events) |
527 | 527 |
} |
528 | 528 |
} |
529 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
529 |
+ stopTimedMessage(ptm) |
|
530 | 530 |
} |
531 | 531 |
|
532 | 532 |
stopCluster(cl) |
... | ... |
@@ -18,14 +18,15 @@ NULL |
18 | 18 |
#' @describeIn binning Bin reads in BAM format |
19 | 19 |
#' @inheritParams align2binned |
20 | 20 |
#' @inheritParams bam2GRanges |
21 |
+#' @param bamfile A file in BAM format. Alternatively a \code{\link{GRanges}} with aligned reads. |
|
21 | 22 |
#' @export |
22 |
-bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, max.fragment.width=1000, outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
23 |
+bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, max.fragment.width=1000, outputfolder.binned="binned_data", binsizes=NULL, variable.width.reference=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
23 | 24 |
call <- match.call() |
24 | 25 |
underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='') |
25 | 26 |
message("\n",call[[1]],"():") |
26 | 27 |
message(underline) |
27 | 28 |
ptm <- proc.time() |
28 |
- binned.data <- align2binned(bamfile, format="bam", ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, max.fragment.width=max.fragment.width, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
29 |
+ binned.data <- align2binned(bamfile, format="bam", assembly=NULL, ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, max.fragment.width=max.fragment.width, outputfolder.binned=outputfolder.binned, binsizes=binsizes, variable.width.reference=variable.width.reference, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
29 | 30 |
time <- proc.time() - ptm |
30 | 31 |
message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s") |
31 | 32 |
return(binned.data) |
... | ... |
@@ -33,14 +34,16 @@ bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEn |
33 | 34 |
|
34 | 35 |
#' @describeIn binning Bin reads in BED format |
35 | 36 |
#' @inheritParams align2binned |
36 |
-#' @param bedfile A file in BED format. |
|
37 |
-bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
37 |
+#' @inheritParams bed2GRanges |
|
38 |
+#' @param bedfile A file in BED format. Alternatively a \code{\link{GRanges}} with aligned reads. |
|
39 |
+#' @export |
|
40 |
+bed2binned <- function(bedfile, assembly, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, variable.width.reference=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
38 | 41 |
call <- match.call() |
39 | 42 |
underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='') |
40 | 43 |
message("\n",call[[1]],"():") |
41 | 44 |
message(underline) |
42 | 45 |
ptm <- proc.time() |
43 |
- binned.data <- align2binned(bedfile, format="bed", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
46 |
+ binned.data <- align2binned(bedfile, format="bed", assembly=assembly, ID=ID, outputfolder.binned=outputfolder.binned, binsizes=binsizes, variable.width.reference=variable.width.reference, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only) |
|
44 | 47 |
time <- proc.time() - ptm |
45 | 48 |
message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s") |
46 | 49 |
return(binned.data) |
... | ... |
@@ -50,13 +53,14 @@ bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputf |
50 | 53 |
#' |
51 | 54 |
#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. |
52 | 55 |
#' |
53 |
-#' @param file A file with aligned reads. |
|
56 |
+#' @param file A file with aligned reads. Alternatively a \code{\link{GRanges}} with aligned reads. |
|
54 | 57 |
#' @param format One of \code{c('bam', 'bed')}. |
55 | 58 |
#' @param ID An identifier that will be used to identify the file throughout the workflow and in plotting. |
56 | 59 |
#' @inheritParams bam2GRanges |
57 |
-#' @param chrom.length.file A file which contains the chromosome lengths in basepairs. The first column contains the chromosome name and the second column the length (see also \code{\link{chrom.length.file}}). |
|
60 |
+#' @inheritParams bed2GRanges |
|
58 | 61 |
#' @param outputfolder.binned Folder to which the binned data will be saved. If the specified folder does not exist, it will be created. |
59 | 62 |
#' @param binsizes An integer vector with bin sizes. If more than one value is given, output files will be produced for each bin size. |
63 |
+#' @param variable.width.reference A BAM file that is used as reference to produce variable width bins. See \code{\link{variableWidthBins}} for details. |
|
60 | 64 |
#' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly. Output files are produced for each value. |
61 | 65 |
#' @param stepsize Fraction of the binsize that the sliding window is offset at each step. Example: If \code{stepsize=0.1} and \code{binsizes=c(200000,500000)}, the actual stepsize in basepairs is 20000 and 50000, respectively. |
62 | 66 |
#' @param chromosomes If only a subset of the chromosomes should be binned, specify them here. |
... | ... |
@@ -69,7 +73,7 @@ bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputf |
69 | 73 |
#' @param reads.overwrite Whether or not an existing file with read fragments should be overwritten. |
70 | 74 |
#' @param reads.only If \code{TRUE} only read fragments are stored and/or returned and no binning is done. |
71 | 75 |
#' @return The function produces a \link{GRanges} object with one meta data column 'reads' that contains the read count. This binned data will be either written to file (\code{save.as.RData=FALSE}) or given as return value (\code{save.as.RData=FALSE}). |
72 |
-align2binned <- function(file, format, ID=basename(file), bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, chrom.length.file, outputfolder.binned="binned_data", binsizes=200000, reads.per.bin=NULL, stepsize=NULL, save.as.RData=FALSE, calc.complexity=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
76 |
+align2binned <- function(file, format, assembly, ID=basename(file), bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, outputfolder.binned="binned_data", binsizes=200000, variable.width.reference=NULL, reads.per.bin=NULL, stepsize=NULL, save.as.RData=FALSE, calc.complexity=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) { |
|
73 | 77 |
|
74 | 78 |
## Check user input |
75 | 79 |
if (reads.return==FALSE & reads.only==FALSE) { |
... | ... |
@@ -83,32 +87,34 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
83 | 87 |
dir.create(outputfolder.binned) |
84 | 88 |
} |
85 | 89 |
|
86 |
- ### Read in the data |
|
87 |
- message("Reading file ",basename(file)," ...", appendLF=FALSE); ptm <- proc.time() |
|
88 |
- ## BED (0-based) |
|
89 |
- if (format == "bed") { |
|
90 |
- # File with chromosome lengths (1-based) |
|
91 |
- chrom.lengths.df <- read.table(chrom.length.file) |
|
92 |
- chrom.lengths <- chrom.lengths.df[,2] |
|
93 |
- names(chrom.lengths) <- chrom.lengths.df[,1] |
|
94 |
- # File with reads, determine classes first for faster import (0-based) |
|
95 |
- tab5rows <- read.table(file, nrows=5) |
|
96 |
- classes.in.bed <- sapply(tab5rows, class) |
|
97 |
- classes <- rep("NULL",length(classes.in.bed)) |
|
98 |
- classes[c(1:3,6)] <- classes.in.bed[c(1:3,6)] |
|
99 |
- data <- read.table(file, colClasses=classes) |
|
100 |
- # Convert to GRanges object |
|
101 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data))) # start+1 to go from [0,x) -> [1,x] |
|
102 |
- seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
|
103 |
- ## BAM (1-based) |
|
104 |
- } else if (format == "bam") { |
|
105 |
- if (calc.complexity || !remove.duplicate.reads) { |
|
106 |
- data <- suppressMessages( bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) ) |
|
107 |
- } else { |
|
108 |
- data <- suppressMessages( bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) ) |
|
90 |
+ if (is.character(file)) { |
|
91 |
+ ### Read in the data |
|
92 |
+ if (format == "bed") { |
|
93 |
+ ## BED (0-based) |
|
94 |
+ if (calc.complexity || !remove.duplicate.reads) { |
|
95 |
+ data <- bed2GRanges(file, assembly=assembly, chromosomes=chromosomes, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) |
|
96 |
+ } else { |
|
97 |
+ data <- bed2GRanges(file, assembly=assembly, chromosomes=chromosomes, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) |
|
98 |
+ } |
|
99 |
+ } else if (format == "bam") { |
|
100 |
+ ## BAM (1-based) |
|
101 |
+ if (calc.complexity || !remove.duplicate.reads) { |
|
102 |
+ data <- bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) |
|
103 |
+ } else { |
|
104 |
+ data <- bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) |
|
105 |
+ } |
|
106 |
+ } |
|
107 |
+ } else if (is(file, 'GRanges')) { |
|
108 |
+ data <- file |
|
109 |
+ err <- tryCatch({ |
|
110 |
+ !is.character(ID) |
|
111 |
+ }, error = function(err) { |
|
112 |
+ TRUE |
|
113 |
+ }) |
|
114 |
+ if (err) { |
|
115 |
+ ID <- 'GRanges' |
|
109 | 116 |
} |
110 | 117 |
} |
111 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
112 | 118 |
|
113 | 119 |
## Select chromosomes to bin |
114 | 120 |
if (is.null(chromosomes)) { |
... | ... |
@@ -119,16 +125,18 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
119 | 125 |
### Complexity estimation ### |
120 | 126 |
complexity <- NULL |
121 | 127 |
if (calc.complexity) { |
122 |
- message(" calculating complexity ...", appendLF=FALSE); ptm <- proc.time() |
|
128 |
+ ptm <- startTimedMessage("Calculating complexity ...") |
|
123 | 129 |
complexity <- suppressMessages( estimateComplexity(data) ) |
124 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
130 |
+ stopTimedMessage(ptm) |
|
125 | 131 |
} |
126 | 132 |
if (remove.duplicate.reads) { |
133 |
+ ptm <- startTimedMessage("Removing duplicate reads ...") |
|
127 | 134 |
sp <- start(data)[as.logical(strand(data)=='+')] |
128 | 135 |
sp1 <- c(sp[length(sp)], sp[-length(sp)]) |
129 | 136 |
sm <- start(data)[as.logical(strand(data)=='-')] |
130 | 137 |
sm1 <- c(sm[length(sm)], sm[-length(sm)]) |
131 | 138 |
data <- c(data[strand(data)=='+'][sp!=sp1], data[strand(data)=='-'][sm!=sm1]) |
139 |
+ stopTimedMessage(ptm) |
|
132 | 140 |
} |
133 | 141 |
|
134 | 142 |
### Return fragments if desired ### |
... | ... |
@@ -136,7 +144,9 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
136 | 144 |
if (!file.exists(outputfolder.reads)) { dir.create(outputfolder.reads) } |
137 | 145 |
filename <- file.path(outputfolder.reads,paste0(ID,'.RData')) |
138 | 146 |
if (reads.overwrite | !file.exists(filename)) { |
147 |
+ ptm <- startTimedMessage(paste0("Saving fragments to ", filename, " ...")) |
|
139 | 148 |
save(data, file=filename) |
149 |
+ stopTimedMessage(ptm) |
|
140 | 150 |
} |
141 | 151 |
} |
142 | 152 |
if (reads.return) { |
... | ... |
@@ -147,20 +157,22 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
147 | 157 |
} |
148 | 158 |
|
149 | 159 |
### Coverage and percentage of genome covered ### |
160 |
+ ptm <- startTimedMessage("Calculating coverage ...") |
|
150 | 161 |
genome.length <- sum(as.numeric(seqlengths(data))) |
151 | 162 |
data.strand <- data |
152 | 163 |
strand(data.strand) <- '*' |
153 | 164 |
coverage <- sum(as.numeric(width(data.strand))) / genome.length |
154 | 165 |
genome.covered <- sum(as.numeric(width(reduce(data.strand)))) / genome.length |
155 | 166 |
## Per chromosome |
156 |
- coverage.per.chrom <- vector() |
|
157 |
- genome.covered.per.chrom <- vector() |
|
167 |
+ coverage.per.chrom <- numeric(length=length(chroms2use)) |
|
168 |
+ genome.covered.per.chrom <- numeric(length=length(chroms2use)) |
|
158 | 169 |
for (chr in chroms2use) { |
159 | 170 |
data.strand.chr <- data.strand[seqnames(data.strand)==chr] |
160 | 171 |
coverage.per.chrom[chr] <- sum(as.numeric(width(data.strand.chr))) / seqlengths(data.strand)[chr] |
161 | 172 |
genome.covered.per.chrom[chr] <- sum(as.numeric(width(reduce(data.strand.chr)))) / seqlengths(data.strand)[chr] |
162 | 173 |
} |
163 | 174 |
coverage <- list(coverage=coverage, genome.covered=genome.covered, coverage.per.chrom=coverage.per.chrom, genome.covered.per.chrom=genome.covered.per.chrom) |
175 |
+ stopTimedMessage(ptm) |
|
164 | 176 |
|
165 | 177 |
|
166 | 178 |
## Pad binsizes and reads.per.bin with each others value |
... | ... |
@@ -171,56 +183,67 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
171 | 183 |
reads.per.bin <- c(reads.per.bin.add, reads.per.bin) |
172 | 184 |
length.binsizes <- length(binsizes) |
173 | 185 |
|
186 |
+ ### Variable width bins ### |
|
187 |
+ if (!is.null(variable.width.reference)) { |
|
188 |
+ message("Making variable width bins:") |
|
189 |
+ bins.list <- variableWidthBins(variable.width.reference, bamindex=variable.width.reference, binsizes=binsizes, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, max.fragment.width=max.fragment.width) |
|
190 |
+ message("Finished making variable width bins.") |
|
191 |
+ } |
|
174 | 192 |
### Loop over all binsizes ### |
175 | 193 |
data.plus <- data[strand(data)=='+'] |
176 | 194 |
data.minus <- data[strand(data)=='-'] |
177 | 195 |
skipped.chroms <- character() |
196 |
+ binned.data.list <- list() |
|
178 | 197 |
for (ibinsize in 1:length.binsizes) { |
179 | 198 |
binsize <- binsizes[ibinsize] |
180 | 199 |
readsperbin <- reads.per.bin[ibinsize] |
181 | 200 |
message("Binning into bin size ",binsize," with on average ",readsperbin," reads per bin") |
182 | 201 |
|
183 |
- ### Loop over chromosomes ### |
|
184 |
- message(" binning genome ...", appendLF=FALSE); ptm <- proc.time() |
|
185 |
- binned.data <- GenomicRanges::GRangesList() |
|
186 |
- for (chromosome in chroms2use) { |
|
187 |
- |
|
188 |
- ## Check last incomplete bin |
|
189 |
- incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0 |
|
190 |
- if (incomplete.bin) { |
|
191 |
- numbin <- floor(seqlengths(data)[chromosome]/binsize) # floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end |
|
192 |
- } else { |
|
193 |
- numbin <- seqlengths(data)[chromosome]/binsize |
|
202 |
+ if (is.null(variable.width.reference)) { |
|
203 |
+ ### Fixed width bins ### |
|
204 |
+ ptm <- startTimedMessage("Making fixed-width bins ...") |
|
205 |
+ binned.data <- GenomicRanges::GRangesList() |
|
206 |
+ ## Loop over chromosomes |
|
207 |
+ for (chromosome in chroms2use) { |
|
208 |
+ ## Check last incomplete bin |
|
209 |
+ incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0 |
|
210 |
+ if (incomplete.bin) { |
|
211 |
+ numbin <- floor(seqlengths(data)[chromosome]/binsize) # floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end |
|
212 |
+ } else { |
|
213 |
+ numbin <- seqlengths(data)[chromosome]/binsize |
|
214 |
+ } |
|
215 |
+ if (numbin == 0) { |
|
216 |
+ skipped.chroms[chromosome] <- chromosome |
|
217 |
+ next |
|
218 |
+ } |
|
219 |
+ ## Initialize vectors |
|
220 |
+ chroms <- rep(chromosome,numbin) |
|
221 |
+ reads <- rep(0,numbin) |
|
222 |
+ start <- seq(from=1, by=binsize, length.out=numbin) |
|
223 |
+ end <- seq(from=binsize, by=binsize, length.out=numbin) |
|
224 |
+ # end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired |
|
225 |
+ |
|
226 |
+ ## Create binned chromosome as GRanges object |
|
227 |
+ i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbin), |
|
228 |
+ ranges = IRanges(start=start, end=end), |
|
229 |
+ strand = Rle(strand("*"), numbin) |
|
230 |
+ ) |
|
231 |
+ seqlengths(i.binned.data) <- seqlengths(data)[chromosome] |
|
232 |
+ |
|
233 |
+ suppressWarnings( |
|
234 |
+ binned.data[[chromosome]] <- i.binned.data |
|
235 |
+ ) |
|
236 |
+ |
|
237 |
+ } ## end loop chromosomes |
|
238 |
+ if (length(skipped.chroms)>0) { |
|
239 |
+ warning("Chromosomes ", paste0(skipped.chroms, collapse=', '), " are smaller than binsize. Skipped.") |
|
194 | 240 |
} |
195 |
- if (numbin == 0) { |
|
196 |
- skipped.chroms[chromosome] <- chromosome |
|
197 |
- next |
|
198 |
- } |
|
199 |
- ## Initialize vectors |
|
200 |
- chroms <- rep(chromosome,numbin) |
|
201 |
- reads <- rep(0,numbin) |
|
202 |
- start <- seq(from=1, by=binsize, length.out=numbin) |
|
203 |
- end <- seq(from=binsize, by=binsize, length.out=numbin) |
|
204 |
-# end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired |
|
205 |
- |
|
206 |
- ## Create binned chromosome as GRanges object |
|
207 |
- i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbin), |
|
208 |
- ranges = IRanges(start=start, end=end), |
|
209 |
- strand = Rle(strand("*"), numbin) |
|
210 |
- ) |
|
211 |
- seqlengths(i.binned.data) <- seqlengths(data)[chromosome] |
|
212 |
- |
|
213 |
- suppressWarnings( |
|
214 |
- binned.data[[chromosome]] <- i.binned.data |
|
215 |
- ) |
|
216 |
- |
|
217 |
- } ### end loop chromosomes ### |
|
218 |
- if (length(skipped.chroms)>0) { |
|
219 |
- warning("Chromosomes ", paste0(skipped.chroms, collapse=', '), " are smaller than binsize. Skipped.") |
|
241 |
+ binned.data <- unlist(binned.data) |
|
242 |
+ names(binned.data) <- NULL |
|
243 |
+ stopTimedMessage(ptm) |
|
244 |
+ } else { |
|
245 |
+ binned.data <- bins.list[[ibinsize]] |
|
220 | 246 |
} |
221 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
222 |
- binned.data <- unlist(binned.data) |
|
223 |
- names(binned.data) <- NULL |
|
224 | 247 |
|
225 | 248 |
### Loop over offsets ### |
226 | 249 |
countmatrices <- list() |
... | ... |
@@ -231,7 +254,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
231 | 254 |
} |
232 | 255 |
for (ioff in offsets) { |
233 | 256 |
## Count overlaps |
234 |
- message(paste0(" counting overlaps for offset ",ioff," ..."), appendLF=FALSE); ptm <- proc.time() |
|
257 |
+ ptm <- startTimedMessage(paste0("Counting overlaps for offset ",ioff," ...")) |
|
235 | 258 |
binned.data.shift <- suppressWarnings( shift(binned.data, shift=ioff) ) |
236 | 259 |
if (format=="bam" | format=="bed") { |
237 | 260 |
mcounts <- suppressWarnings( GenomicRanges::countOverlaps(binned.data.shift, data.minus) ) |
... | ... |
@@ -241,7 +264,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
241 | 264 |
countmatrix <- matrix(c(counts,mcounts,pcounts), ncol=3) |
242 | 265 |
colnames(countmatrix) <- c('counts','mcounts','pcounts') |
243 | 266 |
countmatrices[[as.character(ioff)]] <- countmatrix |
244 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
267 |
+ stopTimedMessage(ptm) |
|
245 | 268 |
} |
246 | 269 |
mcols(binned.data) <- as(countmatrices[['0']],'DataFrame') |
247 | 270 |
attr(binned.data,'offset.counts') <- countmatrices |
... | ... |
@@ -263,17 +286,21 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos |
263 | 286 |
if (save.as.RData==TRUE) { |
264 | 287 |
# Save to file |
265 | 288 |
filename <- paste0(ID,"_binsize_",format(binsize, scientific=TRUE, trim=TRUE),"_reads.per.bin_",readsperbin,"_.RData") |
266 |
- message("Saving to file ...", appendLF=FALSE); ptm <- proc.time() |
|
289 |
+ ptm <- startTimedMessage("Saving to file ...") |
|
267 | 290 |
# attr(binned.data, 'call') <- call # do not store along with GRanges because it inflates disk usage |
268 | 291 |
save(binned.data, file=file.path(outputfolder.binned,filename) ) |
269 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
292 |
+ stopTimedMessage(ptm) |
|
270 | 293 |
} else { |
271 | 294 |
# attr(binned.data, 'call') <- call |
272 |
- return(binned.data) |
|
295 |
+ binned.data.list[[as.character(binsize)]] <- binned.data |
|
273 | 296 |
} |
274 | 297 |
|
275 | 298 |
} ### end loop binsizes ### |
276 | 299 |
|
300 |
+ if (!save.as.RData) { |
|
301 |
+ return(binned.data.list) |
|
302 |
+ } |
|
303 |
+ |
|
277 | 304 |
|
278 | 305 |
} |
279 | 306 |
|
... | ... |
@@ -376,16 +403,17 @@ estimateComplexity <- function(reads) { |
376 | 403 |
#' |
377 | 404 |
#' Import aligned reads from a BAM file into a \code{\link{GRanges}} object. |
378 | 405 |
#' |
379 |
-#' @param bamfile A file in BAM format. |
|
406 |
+#' @param bamfile A sorted BAM file. |
|
380 | 407 |
#' @param bamindex BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued. |
381 | 408 |
#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here. |
382 | 409 |
#' @param pairedEndReads Set to \code{TRUE} if you have paired-end reads in your file. |
383 | 410 |
#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed. |
384 | 411 |
#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=NULL} to keep all reads. |
385 | 412 |
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments due to mapping errors of paired end reads. |
413 |
+#' @param what A character vector of fields that are returned. Type \code{\link[Rsamtools]{scanBamWhat}} to see what is available. |
|
386 | 414 |
#' @importFrom Rsamtools indexBam scanBamHeader ScanBamParam scanBamFlag |
387 | 415 |
#' @importFrom GenomicAlignments readGAlignmentPairsFromBam readGAlignmentsFromBam first |
388 |
-bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000) { |
|
416 |
+bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000, what='mapq') { |
|
389 | 417 |
|
390 | 418 |
## Check if bamindex exists |
391 | 419 |
bamindex.raw <- sub('\\.bai$', '', bamindex) |
... | ... |
@@ -412,21 +440,25 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe |
412 | 440 |
diffs <- paste0(diff, collapse=', ') |
413 | 441 |
warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.')) |
414 | 442 |
} |
443 |
+ |
|
415 | 444 |
## Import the file into GRanges |
445 |
+ ptm <- startTimedMessage("Reading file ",basename(bamfile)," ...") |
|
416 | 446 |
gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
417 | 447 |
if (!remove.duplicate.reads) { |
418 | 448 |
if (pairedEndReads) { |
419 |
- data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq')) |
|
449 |
+ data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what)) |
|
420 | 450 |
} else { |
421 |
- data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq')) |
|
451 |
+ data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what)) |
|
422 | 452 |
} |
423 | 453 |
} else { |
424 | 454 |
if (pairedEndReads) { |
425 |
- data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=FALSE))) |
|
455 |
+ data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=scanBamFlag(isDuplicate=FALSE))) |
|
426 | 456 |
} else { |
427 |
- data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=FALSE))) |
|
457 |
+ data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=scanBamFlag(isDuplicate=FALSE))) |
|
428 | 458 |
} |
429 | 459 |
} |
460 |
+ stopTimedMessage(ptm) |
|
461 |
+ |
|
430 | 462 |
if (length(data.raw) == 0) { |
431 | 463 |
if (pairedEndReads) { |
432 | 464 |
stop(paste0("No reads imported. Does your file really contain paired end reads? Try with 'pairedEndReads=FALSE'")) |
... | ... |
@@ -435,11 +467,11 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe |
435 | 467 |
} |
436 | 468 |
## Filter by mapping quality |
437 | 469 |
if (pairedEndReads) { |
438 |
- message("Converting to GRanges ...", appendLF=FALSE); ptm <- proc.time() |
|
470 |
+ ptm <- startTimedMessage("Converting to GRanges ...") |
|
439 | 471 |
data <- as(data.raw, 'GRanges') # treat as one fragment |
440 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
472 |
+ stopTimedMessage(ptm) |
|
441 | 473 |
|
442 |
- message("Filtering reads ...", appendLF=FALSE); ptm <- proc.time() |
|
474 |
+ ptm <- startTimedMessage("Filtering reads ...") |
|
443 | 475 |
if (!is.null(min.mapq)) { |
444 | 476 |
mapq.first <- mcols(GenomicAlignments::first(data.raw))$mapq |
445 | 477 |
mapq.last <- mcols(GenomicAlignments::last(data.raw))$mapq |
... | ... |
@@ -448,16 +480,16 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe |
448 | 480 |
warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads.")) |
449 | 481 |
} |
450 | 482 |
data <- data[which(mapq.mask)] |
451 |
- # Filter out too long fragments |
|
452 |
- data <- data[width(data)<=max.fragment.width] |
|
453 | 483 |
} |
454 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
484 |
+ # Filter out too long fragments |
|
485 |
+ data <- data[width(data)<=max.fragment.width] |
|
486 |
+ stopTimedMessage(ptm) |
|
455 | 487 |
} else { |
456 |
- message("Converting to GRanges ...", appendLF=FALSE); ptm <- proc.time() |
|
488 |
+ ptm <- startTimedMessage("Converting to GRanges ...") |
|
457 | 489 |
data <- as(data.raw, 'GRanges') |
458 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
490 |
+ stopTimedMessage(ptm) |
|
459 | 491 |
|
460 |
- message("Filtering reads ...", appendLF=FALSE); ptm <- proc.time() |
|
492 |
+ ptm <- startTimedMessage("Filtering reads ...") |
|
461 | 493 |
if (!is.null(min.mapq)) { |
462 | 494 |
if (any(is.na(mcols(data)$mapq))) { |
463 | 495 |
warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads.")) |
... | ... |
@@ -465,8 +497,87 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe |
465 | 497 |
} |
466 | 498 |
data <- data[mcols(data)$mapq >= min.mapq] |
467 | 499 |
} |
468 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
500 |
+ # Filter out too long fragments |
|
501 |
+ data <- data[width(data)<=max.fragment.width] |
|
502 |
+ stopTimedMessage(ptm) |
|
503 |
+ } |
|
504 |
+ |
|
505 |
+ return(data) |
|
506 |
+ |
|
507 |
+} |
|
508 |
+ |
|
509 |
+ |
|
510 |
+#' Import BED file into GRanges |
|
511 |
+#' |
|
512 |
+#' Import aligned reads from a BED file into a \code{\link{GRanges}} object. |
|
513 |
+#' |
|
514 |
+#' @param bedfile A file with aligned reads in BED format. The columns have to be c('chromosome','start','end','description','mapq','strand'). |
|
515 |
+#' @param assembly Please see \code{\link[GenomeInfoDb]{fetchExtendedChromInfoFromUCSC}} for available assemblies. |
|
516 |
+#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here. |
|
517 |
+#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed. |
|
518 |
+#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=NULL} to keep all reads. |
|
519 |
+#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments. |
|
520 |
+#' @importFrom GenomeInfoDb fetchExtendedChromInfoFromUCSC |
|
521 |
+bed2GRanges <- function(bedfile, assembly, chromosomes=NULL, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000) { |
|
522 |
+ |
|
523 |
+ # File with reads, specify classes for faster import (0-based) |
|
524 |
+ ptm <- startTimedMessage("Reading file ",basename(bedfile)," ...") |
|
525 |
+ classes <- c('character','numeric','numeric','NULL','integer','character') |
|
526 |
+ data.raw <- read.table(bedfile, colClasses=classes) |
|
527 |
+ # Convert to GRanges object |
|
528 |
+ data <- GenomicRanges::GRanges(seqnames=Rle(data.raw[,1]), ranges=IRanges(start=data.raw[,2]+1, end=data.raw[,3]), strand=data.raw[,5]) # start+1 to go from [0,x) -> [1,x] |
|
529 |
+ mcols(data)$mapq <- data.raw[,4] |
|
530 |
+ remove(data.raw) |
|
531 |
+ chroms.in.data <- seqlevels(data) |
|
532 |
+ stopTimedMessage(ptm) |
|
533 |
+ # Get chromosome lengths |
|
534 |
+ ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...") |
|
535 |
+ df <- GenomeInfoDb::fetchExtendedChromInfoFromUCSC(assembly) |
|
536 |
+ chrom.lengths <- df$UCSC_seqlengths |
|
537 |
+ if (grepl('^chr',seqlevels(data)[1])) { |
|
538 |
+ names(chrom.lengths) <- df$UCSC_seqlevels |
|
539 |
+ } else { |
|
540 |
+ names(chrom.lengths) <- df$NCBI_seqlevels |
|
541 |
+ } |
|
542 |
+ seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
|
543 |
+ stopTimedMessage(ptm) |
|
544 |
+ |
|
545 |
+ ## Issue warnings and stuff for non-existing chromosomes |
|
546 |
+ chroms.in.data <- names(chrom.lengths) |
|
547 |
+ if (is.null(chromosomes)) { |
|
548 |
+ chromosomes <- chroms.in.data |
|
549 |
+ } |
|
550 |
+ chroms2use <- intersect(chromosomes, chroms.in.data) |
|
551 |
+ if (length(chroms2use)==0) { |
|
552 |
+ chrstring <- paste0(chromosomes, collapse=', ') |
|
553 |
+ stop('The specified chromosomes ', chrstring, ' do not exist in the data.') |
|
554 |
+ } |
|
555 |
+ ## Issue warning for non-existent chromosomes |
|
556 |
+ diff <- setdiff(chromosomes, chroms.in.data) |
|
557 |
+ if (length(diff)>0) { |
|
558 |
+ diffs <- paste0(diff, collapse=', ') |
|
559 |
+ warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.')) |
|
560 |
+ } |
|
561 |
+ |
|
562 |
+ ## Select only desired chromosomes |
|
563 |
+ data <- data[seqnames(data) %in% chroms2use] |
|
564 |
+ if (length(data) == 0) { |
|
565 |
+ stop(paste0('No reads imported!')) |
|
566 |
+ } |
|
567 |
+ |
|
568 |
+ ## Filter by mapping quality |
|
569 |
+ ptm <- startTimedMessage("Filtering reads ...") |
|
570 |
+ if (!is.null(min.mapq)) { |
|
571 |
+ if (any(is.na(mcols(data)$mapq))) { |
|
572 |
+ warning(paste0(bedfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads.")) |
|
573 |
+ mcols(data)$mapq[is.na(mcols(data)$mapq)] <- -1 |
|
574 |
+ } |
|
575 |
+ data <- data[mcols(data)$mapq >= min.mapq] |
|
469 | 576 |
} |
577 |
+ # Filter out too long fragments |
|
578 |
+ data <- data[width(data)<=max.fragment.width] |
|
579 |
+ stopTimedMessage(ptm) |
|
580 |
+ |
|
470 | 581 |
return(data) |
471 | 582 |
|
472 | 583 |
} |
... | ... |
@@ -34,7 +34,7 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.GC.content=FALSE) { |
34 | 34 |
|
35 | 35 |
## Calculate GC content per bin |
36 | 36 |
if (same.GC.content & !same.GC.calculated | !same.GC.content) { |
37 |
- message("Calculating GC content per bin ...", appendLF=FALSE); ptm <- proc.time() |
|
37 |
+ ptm <- startTimedMessage("Calculating GC content per bin ...") |
|
38 | 38 |
GC.content <- list() |
39 | 39 |
for (chrom in seqlevels(binned.data)) { |
40 | 40 |
if (!grepl('^chr',chrom)) { |
... | ... |
@@ -57,12 +57,12 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.GC.content=FALSE) { |
57 | 57 |
} |
58 | 58 |
GC.content <- unlist(GC.content) |
59 | 59 |
same.GC.calculated <- TRUE |
60 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
60 |
+ stopTimedMessage(ptm) |
|
61 | 61 |
} |
62 | 62 |
binned.data$GC <- GC.content |
63 | 63 |
|
64 | 64 |
### GC correction ### |
65 |
- message("GC correction ...", appendLF=FALSE); ptm <- proc.time() |
|
65 |
+ ptm <- startTimedMessage("GC correction ...") |
|
66 | 66 |
counts <- binned.data$counts |
67 | 67 |
mcounts <- binned.data$mcounts |
68 | 68 |
pcounts <- binned.data$pcounts |
... | ... |
@@ -107,7 +107,7 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.GC.content=FALSE) { |
107 | 107 |
# Produce fit to check |
108 | 108 |
ggplt <- ggplot(df) + geom_point(aes_string(x='x', y='y', size='weight')) + geom_line(aes_string(x='x', y='y'), data=data.frame(x=gc.categories[intervals], y=fitted.correction.factors)) + theme_bw() + ggtitle('GC correction') + xlab('GC content') + ylab('correction factor') |
109 | 109 |
attr(binned.data, 'GC.correction.ggplt') <- ggplt |
110 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
110 |
+ stopTimedMessage(ptm) |
|
111 | 111 |
|
112 | 112 |
### Quality measures ### |
113 | 113 |
## Spikyness |
114 | 114 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+ |
|
2 |
+# ============================================================================ |
|
3 |
+# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob) |
|
4 |
+# ============================================================================ |
|
5 |
+dnbinom.size <- function(mean, variance) { |
|
6 |
+ return(mean^2 / (variance - mean)) |
|
7 |
+} |
|
8 |
+ |
|
9 |
+dnbinom.prob <- function(mean, variance) { |
|
10 |
+ return(mean/variance) |
|
11 |
+} |
|
12 |
+ |
|
13 |
+dnbinom.mean <- function(size, prob) { |
|
14 |
+ return(size/prob - size) |
|
15 |
+} |
|
16 |
+ |
|
17 |
+dnbinom.variance <- function(size, prob) { |
|
18 |
+ return( (size - prob*size) / prob^2 ) |
|
19 |
+} |
|
20 |
+ |
|
21 |
+dgeom.mean <- function(prob) { |
|
22 |
+ return( (1-prob)/prob ) |
|
23 |
+} |
|
24 |
+ |
|
25 |
+dgeom.variance <- function(prob) { |
|
26 |
+ return( (1-prob)/prob^2 ) |
|
27 |
+} |
|
28 |
+ |
|
29 |
+dbinom.size <- function(mean, variance) { |
|
30 |
+ return( mean^2/(mean-variance) ) |
|
31 |
+} |
|
32 |
+ |
|
33 |
+dbinom.prob <- function(mean, variance) { |
|
34 |
+ return( (mean-variance)/mean ) |
|
35 |
+} |
|
36 |
+ |
|
37 |
+dbinom.mean <- function(size, prob) { |
|
38 |
+ return( size*prob ) |
|
39 |
+} |
|
40 |
+ |
|
41 |
+dbinom.variance <- function(size, prob) { |
|
42 |
+ return( size*prob * (1-prob) ) |
|
43 |
+} |
|
44 |
+ |
|
45 |
+ |
... | ... |
@@ -581,7 +581,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
581 | 581 |
message("Preparing bivariate HMM\n") |
582 | 582 |
|
583 | 583 |
## Pre-compute z-values for each number of counts |
584 |
- message("Computing pre z-matrix...", appendLF=FALSE); ptm <- proc.time() |
|
584 |
+ ptm <- startTimedMessage("Computing pre z-matrix...") |
|
585 | 585 |
z.per.count <- array(NA, dim=c(maxcounts+1, num.models, num.uni.states), dimnames=list(counts=0:maxcounts, strand=names(distributions), state=uni.states)) |
586 | 586 |
xcounts <- 0:maxcounts |
587 | 587 |
for (istrand in 1:num.models) { |
... | ... |
@@ -602,10 +602,10 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
602 | 602 |
z.per.count[ , istrand, istate] <- qnorm_u |
603 | 603 |
} |
604 | 604 |
} |
605 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
605 |
+ stopTimedMessage(ptm) |
|
606 | 606 |
|
607 | 607 |
## Compute the z matrix |
608 |
- message("Transfering values into z-matrix...", appendLF=FALSE); ptm <- proc.time() |
|
608 |
+ ptm <- startTimedMessage("Transfering values into z-matrix...") |
|
609 | 609 |
z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(distributions), state=uni.states)) |
610 | 610 |
for (istrand in 1:num.models) { |
611 | 611 |
for (istate in 1:num.uni.states) { |
... | ... |
@@ -613,10 +613,10 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
613 | 613 |
} |
614 | 614 |
} |
615 | 615 |
remove(z.per.count) |
616 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
616 |
+ stopTimedMessage(ptm) |
|
617 | 617 |
|
618 | 618 |
## Calculate correlation matrix |
619 |
- message("Computing inverse of correlation matrix...", appendLF=FALSE); ptm <- proc.time() |
|
619 |
+ ptm <- startTimedMessage("Computing inverse of correlation matrix...") |
|
620 | 620 |
correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(distributions), strand=names(distributions), comb.state=comb.states)) |
621 | 621 |
correlationMatrixInverse <- correlationMatrix |
622 | 622 |
determinant <- rep(0, num.comb.states) |
... | ... |
@@ -656,7 +656,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
656 | 656 |
usestateTF[comb.state] <- TRUE |
657 | 657 |
} |
658 | 658 |
} |
659 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
659 |
+ stopTimedMessage(ptm) |
|
660 | 660 |
|
661 | 661 |
# Use only states with valid correlationMatrixInverse (all states are valid in this version) |
662 | 662 |
correlationMatrix = correlationMatrix[,,usestateTF] |
... | ... |
@@ -667,7 +667,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
667 | 667 |
num.comb.states <- length(comb.states) |
668 | 668 |
|
669 | 669 |
### Calculate multivariate densities for each state ### |
670 |
- message("Calculating multivariate densities...", appendLF=FALSE); ptm <- proc.time() |
|
670 |
+ ptm <- startTimedMessage("Calculating multivariate densities...") |
|
671 | 671 |
densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states)) |
672 | 672 |
for (comb.state in comb.states) { |
673 | 673 |
istate <- which(comb.state==comb.states) |
... | ... |
@@ -706,7 +706,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
706 | 706 |
densities[icheck,] <- densities[icheck-1,] |
707 | 707 |
} |
708 | 708 |
} |
709 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
709 |
+ stopTimedMessage(ptm) |
|
710 | 710 |
|
711 | 711 |
### Define cleanup behaviour ### |
712 | 712 |
on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states))) |
... | ... |
@@ -760,7 +760,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
760 | 760 |
mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=uni.states) |
761 | 761 |
} |
762 | 762 |
## Segmentation |
763 |
- message("Making segmentation ...", appendLF=FALSE); ptm <- proc.time() |
|
763 |
+ ptm <- startTimedMessage("Making segmentation ...") |
|
764 | 764 |
gr <- result$bins |
765 | 765 |
red.gr.list <- GRangesList() |
766 | 766 |
for (state in comb.states) { |
... | ... |
@@ -773,7 +773,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m |
773 | 773 |
red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
774 | 774 |
result$segments <- red.gr |
775 | 775 |
seqlengths(result$segments) <- seqlengths(result$bins) |
776 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
776 |
+ stopTimedMessage(ptm) |
|
777 | 777 |
## CNV state for both strands combined |
778 | 778 |
# Bins |
779 | 779 |
state <- multiplicity[result$bins$mstate] + multiplicity[result$bins$pstate] |
... | ... |
@@ -27,7 +27,7 @@ getSegments <- function(hmms, cluster=TRUE, classes=NULL) { |
27 | 27 |
|
28 | 28 |
## Clustering |
29 | 29 |
if (cluster) { |
30 |
- message("Making consensus template ...", appendLF=FALSE); ptm <- proc.time() |
|
30 |
+ ptm <- startTimedMessage("Making consensus template ...") |
|
31 | 31 |
consensus <- disjoin(unlist(grlred)) |
32 | 32 |
constates <- matrix(NA, ncol=length(grlred), nrow=length(consensus)) |
33 | 33 |
for (i1 in 1:length(grlred)) { |
... | ... |
@@ -38,15 +38,15 @@ getSegments <- function(hmms, cluster=TRUE, classes=NULL) { |
38 | 38 |
} |
39 | 39 |
meanstates <- apply(constates, 1, mean, na.rm=TRUE) |
40 | 40 |
mcols(consensus)$meanstate <- meanstates |
41 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
41 |
+ stopTimedMessage(ptm) |
|
42 | 42 |
|
43 | 43 |
# Distance measure |
44 | 44 |
# Use covariance instead of correlation to avoid NaNs for which the hclust fails with error |
45 |
- message("clustering ...", appendLF=FALSE); ptm <- proc.time() |
|
45 |
+ ptm <- startTimedMessage("clustering ...") |
|
46 | 46 |
constates[is.na(constates)] <- 0 |
47 | 47 |
wcor <- cov.wt(constates, wt=as.numeric(width(consensus))) |
48 | 48 |
dist <- as.dist(max(wcor$cov)-wcor$cov) |
49 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
49 |
+ stopTimedMessage(ptm) |
|
50 | 50 |
# Dendrogram |
51 | 51 |
message("reordering ...") |
52 | 52 |
hc <- hclust(dist) |
... | ... |
@@ -14,47 +14,3 @@ class.multivariate.hmm <- "aneuMultiHMM" |
14 | 14 |
class.bivariate.hmm <- "aneuBiHMM" |
15 | 15 |
class.hmm.list <- "aneufinder.hmm.list" |
16 | 16 |
|
17 |
-# ============================================================================ |
|
18 |
-# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob) |
|
19 |
-# ============================================================================ |
|
20 |
-dnbinom.size <- function(mean, variance) { |
|
21 |
- return(mean^2 / (variance - mean)) |
|
22 |
-} |
|
23 |
- |
|
24 |
-dnbinom.prob <- function(mean, variance) { |
|
25 |
- return(mean/variance) |
|
26 |
-} |
|
27 |
- |
|
28 |
-dnbinom.mean <- function(size, prob) { |
|
29 |
- return(size/prob - size) |
|
30 |
-} |
|
31 |
- |
|
32 |
-dnbinom.variance <- function(size, prob) { |
|
33 |
- return( (size - prob*size) / prob^2 ) |
|
34 |
-} |
|
35 |
- |
|
36 |
-dgeom.mean <- function(prob) { |
|
37 |
- return( (1-prob)/prob ) |
|
38 |
-} |
|
39 |
- |
|
40 |
-dgeom.variance <- function(prob) { |
|
41 |
- return( (1-prob)/prob^2 ) |
|
42 |
-} |
|
43 |
- |
|
44 |
-dbinom.size <- function(mean, variance) { |
|
45 |
- return( mean^2/(mean-variance) ) |
|
46 |
-} |
|
47 |
- |
|
48 |
-dbinom.prob <- function(mean, variance) { |
|
49 |
- return( (mean-variance)/mean ) |
|
50 |
-} |
|
51 |
- |
|
52 |
-dbinom.mean <- function(size, prob) { |
|
53 |
- return( size*prob ) |
|
54 |
-} |
|
55 |
- |
|
56 |
-dbinom.variance <- function(size, prob) { |
|
57 |
- return( size*prob * (1-prob) ) |
|
58 |
-} |
|
59 |
- |
|
60 |
- |
61 | 17 |
similarity index 87% |
62 | 18 |
rename from R/convert2GRanges.R |
63 | 19 |
rename to R/importBed.R |
... | ... |
@@ -2,14 +2,14 @@ |
2 | 2 |
|
3 | 3 |
#' Read bed-file into GRanges |
4 | 4 |
#' |
5 |
-#' This is a simple convenience function to read a (compressed) bed-file into a \code{\link{GRanges}} object. The bed-file is expected to have the following fields: \code{chrom, chromStart, chromEnd, name, score, strand}. |
|
5 |
+#' This is a simple convenience function to read a (compressed) bed-file into a \code{\link{GRanges}} object. The bed-file is expected to have the following fields: \code{chromosome, start, end, name, score, strand}. |
|
6 | 6 |
#' |
7 | 7 |
#' @param bedfile Filename of the bed or bed.gz file. |
8 | 8 |
#' @param skip Number of lines to skip at the beginning. |
9 | 9 |
#' @return A \code{\link{GRanges}} object with the contents of the bed-file. |
10 | 10 |
#' @author Aaron Taudt |
11 | 11 |
#' @export |
12 |
-bed2GRanges <- function(bedfile, skip=0) { |
|
12 |
+importBed <- function(bedfile, skip=0) { |
|
13 | 13 |
|
14 | 14 |
# File with reads, determine classes first for faster import (0-based) |
15 | 15 |
tab5rows <- read.table(bedfile, nrows=5, skip=skip) |
... | ... |
@@ -21,7 +21,7 @@ karyotypeMeasures <- function(hmms, normalChromosomeNumbers=NULL) { |
21 | 21 |
hmms <- loadHmmsFromFiles(hmms) |
22 | 22 |
|
23 | 23 |
## If all binsizes are the same the consensus template can be chosen equal to the bins |
24 |
- message("Making consensus template ...", appendLF=FALSE); ptm <- proc.time() |
|
24 |
+ ptm <- startTimedMessage("Making consensus template ...") |
|
25 | 25 |
binsizes <- unlist(lapply(hmms, function(x) { width(x$bins)[1] })) |
26 | 26 |
if (all(binsizes==binsizes[1])) { |
27 | 27 |
consensus <- hmms[[1]]$bins |
... | ... |
@@ -53,7 +53,7 @@ karyotypeMeasures <- function(hmms, normalChromosomeNumbers=NULL) { |
53 | 53 |
} |
54 | 54 |
meanstates <- apply(constates, 1, mean, na.rm=TRUE) |
55 | 55 |
mcols(consensus)$meanstate <- meanstates |
56 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
56 |
+ stopTimedMessage(ptm) |
|
57 | 57 |
|
58 | 58 |
|
59 | 59 |
### Karyotype measures ### |
... | ... |
@@ -14,7 +14,7 @@ loadHmmsFromFiles <- function(hmms, strict=FALSE) { |
14 | 14 |
if (is.hmm(hmms) | is.bihmm(hmms)) { |
15 | 15 |
return(list(hmms)) |
16 | 16 |
} else if (is.character(hmms)) { |
17 |
- message("Loading univariate HMMs from files ...", appendLF=FALSE); ptm <- proc.time() |
|
17 |
+ ptm <- startTimedMessage("Loading univariate HMMs from files ...") |
|
18 | 18 |
mlist <- list() |
19 | 19 |
for (modelfile in hmms) { |
20 | 20 |
tC <- tryCatch({ |
... | ... |
@@ -24,7 +24,7 @@ loadHmmsFromFiles <- function(hmms, strict=FALSE) { |
24 | 24 |
}) |
25 | 25 |
if (!is.hmm(mlist[[modelfile]]) & !is.bihmm(mlist[[modelfile]])) { |
26 | 26 |
if (strict) { |
27 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
27 |
+ stopTimedMessage(ptm) |
|
28 | 28 |
stop("File ",modelfile," does not contain an ",class.univariate.hmm," object.") |
29 | 29 |
} else { |
30 | 30 |
class(mlist[[modelfile]]) <- class.univariate.hmm |
... | ... |
@@ -32,7 +32,7 @@ loadHmmsFromFiles <- function(hmms, strict=FALSE) { |
32 | 32 |
} |
33 | 33 |
} |
34 | 34 |
} |
35 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
35 |
+ stopTimedMessage(ptm) |
|
36 | 36 |
return(mlist) |
37 | 37 |
} else if (is.list(hmms)) { |
38 | 38 |
index <- which(unlist(lapply(hmms, function(hmm) { !is.hmm(hmm) & !is.bihmm(hmm) }))) |
... | ... |
@@ -80,7 +80,7 @@ loadGRangesFromFiles <- function(files) { |
80 | 80 |
if (class(gr.list)=='GRanges') { |
81 | 81 |
return(list(gr.list)) |
82 | 82 |
} else if (is.character(gr.list)) { |
83 |
- message("Loading GRanges from files ...", appendLF=FALSE); ptm <- proc.time() |
|
83 |
+ ptm <- startTimedMessage("Loading GRanges from files ...") |
|
84 | 84 |
mlist <- list() |
85 | 85 |
for (modelfile in gr.list) { |
86 | 86 |
tC <- tryCatch({ |
... | ... |
@@ -89,11 +89,11 @@ loadGRangesFromFiles <- function(files) { |
89 | 89 |
stop(modelfile,'\n',err) |
90 | 90 |
}) |
91 | 91 |
if (class(mlist[[modelfile]])!='GRanges') { |
92 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
92 |
+ stopTimedMessage(ptm) |
|
93 | 93 |
stop("File ",modelfile," does not contain a GRanges object.") |
94 | 94 |
} |
95 | 95 |
} |
96 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
96 |
+ stopTimedMessage(ptm) |
|
97 | 97 |
return(mlist) |
98 | 98 |
} else if (is.list(gr.list)) { |
99 | 99 |
index <- which(unlist(lapply(gr.list, function(hmm) { class(hmm)!='GRanges' }))) |
... | ... |
@@ -636,7 +636,7 @@ heatmapAneuploidies <- function(hmms, ylabels=NULL, cluster=TRUE, as.data.frame= |
636 | 636 |
} |
637 | 637 |
|
638 | 638 |
## Find the most frequent state (mfs) for each chromosome and sample |
639 |
- message("finding most frequent state for each sample and chromosome ...", appendLF=FALSE); ptm <- proc.time() |
|
639 |
+ ptm <- startTimedMessage("finding most frequent state for each sample and chromosome ...") |
|
640 | 640 |
grl.per.chrom <- lapply(grlred, function(x) { split(x, seqnames(x)) }) |
641 | 641 |
mfs.samples <- list() |
642 | 642 |
for (i1 in 1:length(grlred)) { |
... | ... |
@@ -651,7 +651,7 @@ heatmapAneuploidies <- function(hmms, ylabels=NULL, cluster=TRUE, as.data.frame= |
651 | 651 |
attr(mfs.samples[[names(grlred)[i1]]], "varname") <- 'chromosome' |
652 | 652 |
} |
653 | 653 |
attr(mfs.samples, "varname") <- 'sample' |
654 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
654 |
+ stopTimedMessage(ptm) |
|
655 | 655 |
|
656 | 656 |
## Transform to data.frame |
657 | 657 |
# Long format |
... | ... |
@@ -766,15 +766,15 @@ heatmapGenomewide <- function(hmms, ylabels=NULL, classes=NULL, classes.color=NU |
766 | 766 |
} |
767 | 767 |
|
768 | 768 |
## Transform coordinates from "chr, start, end" to "genome.start, genome.end" |
769 |
- message("transforming coordinates ...", appendLF=FALSE); ptm <- proc.time() |
|
769 |
+ ptm <- startTimedMessage("transforming coordinates ...") |
|
770 | 770 |
grlred <- endoapply(grlred, transCoord) |
771 | 771 |
if (plot.SCE) { |
772 | 772 |
sce <- endoapply(sce, transCoord) |
773 | 773 |
} |
774 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
774 |
+ stopTimedMessage(ptm) |
|
775 | 775 |
|
776 | 776 |
## Data.frame for plotting |
777 |
- message("Making the plot ...", appendLF=FALSE); ptm <- proc.time() |
|
777 |
+ ptm <- startTimedMessage("Making the plot ...") |
|
778 | 778 |
# Data |
779 | 779 |
df <- list() |
780 | 780 |
for (i1 in 1:length(grlred)) { |
... | ... |
@@ -837,13 +837,13 @@ heatmapGenomewide <- function(hmms, ylabels=NULL, classes=NULL, classes.color=NU |
837 | 837 |
widths[[length(widths)+1]] <- width.dendro |
838 | 838 |
} |
839 | 839 |
cowplot <- cowplot::plot_grid(plotlist=rev(pltlist), align='h', ncol=length(pltlist), rel_widths=rev(widths)) |
840 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
840 |
+ stopTimedMessage(ptm) |
|
841 | 841 |
|
842 | 842 |
## Plot to file |
843 | 843 |
if (!is.null(file)) { |
844 |
- message("plotting to file ",file," ...", appendLF=FALSE); ptm <- proc.time() |
|
844 |
+ ptm <- startTimedMessage("plotting to file ",file," ...") |
|
845 | 845 |
ggsave(file, cowplot, width=sum(widths)/2.54, height=height/2.54, limitsize=FALSE) |
846 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
846 |
+ stopTimedMessage(ptm) |
|
847 | 847 |
} else { |
848 | 848 |
return(cowplot) |
849 | 849 |
} |
... | ... |
@@ -915,12 +915,12 @@ plot.array <- function(model, both.strands=FALSE, plot.SCE=TRUE, file=NULL) { |
915 | 915 |
} |
916 | 916 |
|
917 | 917 |
## Transform coordinates from "chr, start, end" to "genome.start, genome.end" |
918 |
- message("transforming coordinates ...", appendLF=FALSE); ptm <- proc.time() |
|
918 |
+ ptm <- startTimedMessage("transforming coordinates ...") |
|
919 | 919 |
bins <- transCoord(bins) |
920 | 920 |
if (plot.SCE) { |
921 | 921 |
scecoords <- transCoord(scecoords) |
922 | 922 |
} |
923 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
923 |
+ stopTimedMessage(ptm) |
|
924 | 924 |
|
925 | 925 |
# Quality info |
926 | 926 |
if (is.null(model$qualityInfo$complexity)) { model$qualityInfo$complexity <- NA } |
... | ... |
@@ -97,9 +97,9 @@ clusterByQuality <- function(hmms, G=1:9, itmax=c(100,100), measures=c('spikynes |
97 | 97 |
hmms <- loadHmmsFromFiles(hmms) |
98 | 98 |
df <- getQC(hmms) |
99 | 99 |
df <- df[measures] |
100 |
- message("clustering ...", appendLF=FALSE); ptm <- proc.time() |
|
100 |
+ ptm <- startTimedMessage("clustering ...") |
|
101 | 101 |
fit <- mclust::Mclust(df, G=G, control=emControl(itmax=itmax)) |
102 |
- time <- proc.time() - ptm; message(" ",round(time[3],2),"s") |
|
102 |
+ stopTimedMessage(ptm) |
|
103 | 103 |
params <- t(fit$parameters$mean) |
104 | 104 |
classification <- split(names(fit$classification), fit$classification) |
105 | 105 |
## Reorder clusters |
106 | 106 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,86 @@ |
1 |
+ |
|
2 |
+ |
|
3 |
+#' Simulate reads from genome |
|
4 |
+#' |
|
5 |
+#' Simulate single or paired end reads from any \pkg{\link[BSgenome]{BSgenome}} object. These simulated reads can be mapped to the reference genome using any aligner to produce BAM files that can be used for mappability correction. |
|
6 |
+#' |
|
7 |
+#' Reads are simulated by splitting the genome into reads with the specified \code{readLength}. |
|
8 |
+#' |
|
9 |
+#' @param bsgenome A \pkg{\link[BSgenome]{BSgenome}} object containing the sequence of the reference genome. |
|
10 |
+#' @param readLength The length in base pairs of the simulated reads that are written to file. |
|
11 |
+#' @param bamfile A BAM file. This file is used to estimate the distribution of Phred quality scores. |
|
12 |
+#' @param file The filename that is written to disk. The ending .fastq.gz will be appended. |
|
13 |
+#' @param pairedEndFragmentLength If this option is specified, paired end reads with length \code{readLength} will be simulated coming from both ends of fragments of this size. NOT IMPLEMENTED YET. |
|
14 |
+#' @param every.X.bp Stepsize for simulating reads. A read fragment will be simulated every X bp. |
|
15 |
+#' @return A fastq.gz file is written to disk. |
|
16 |
+#' @author Aaron Taudt |
|
17 |
+#' @importFrom Biostrings DNAStringSet BString BStringSet |
|
18 |
+#' @export |
|
19 |
+simulateReads <- function(bsgenome, readLength, bamfile, file, pairedEndFragmentLength=NULL, every.X.bp=500) { |
|
20 |
+ |
|
21 |
+ if (!is.null(pairedEndFragmentLength)) { |
|
22 |
+ pairedEndReads <- TRUE |
|
23 |
+ } else { |
|
24 |
+ pairedEndReads <- FALSE |
|
25 |
+ } |
|
26 |
+ |
|
27 |
+ ## Estimate quality score distribution |
|
28 |
+ ptm <- startTimedMessage("Estimating quality score distribution ...") |
|
29 |
+ gr <- suppressMessages( bam2GRanges(bamfile, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=NULL, what=c('mapq','seq','qual')) ) |
|
30 |
+ seqbp <- unlist(strsplit(as.character(gr$seq),'')) |
|
31 |
+ mapbp <- unlist(strsplit(as.character(gr$qual),'')) |
|
32 |
+ qualdistr <- table(mapbp, seqbp) |
|
33 |
+ qualdistr <- sweep(qualdistr, MARGIN=2, STATS=colSums(qualdistr), FUN='/') |
|
34 |
+ remove(seqbp, mapbp) |
|
35 |
+ stopTimedMessage(ptm) |
|
36 |
+ |
|
37 |
+ bases <- colnames(qualdistr) |
|
38 |
+ missing.bases <- setdiff(bases, c('A','T','C','G','N')) |
|
39 |
+ if (length(missing.bases)>0) { |
|
40 |
+ stop("'bamfile' does not contain bases ", paste(missing.bases, collapse=', ')) |
|
41 |
+ } |
|
42 |
+ |
|
43 |
+ ## Simulate the reads |
|
44 |
+ file.gz <- gzfile(paste0(file,'.fastq.gz'), 'w') |
|
45 |
+ cat('', file=file.gz) |
|
46 |
+ identifier.string <- attributes(bsgenome)$pkgname |
|
47 |
+ itotal <- 1 |
|
48 |
+ for (chrom in seqlevels(bsgenome)) { |
|
49 |
+ message("Simulating reads for chromosome ",chrom) |
|
50 |
+ ## Simulate the reads |
|
51 |
+ ptm <- startTimedMessage(" Simulating reads ...") |
|
52 |
+ starts <- seq(1, seqlengths(bsgenome)[chrom]-readLength+1L, by=every.X.bp) |
|
53 |
+ reads <- Biostrings::DNAStringSet(bsgenome[[chrom]], start=starts, width=readLength) |
|
54 |
+ mask.N <- reads == paste0(rep('N', readLength), collapse='') |
|
55 |
+ reads <- reads[!mask.N] |
|
56 |
+ stopTimedMessage(ptm) |
|
57 |
+ ## Constructing quality fields |
|
58 |
+ ptm <- startTimedMessage(" Constructing quality fields ...") |
|
59 |
+ reads.full <- unlist(reads) |
|
60 |
+ reads.split <- Biostrings::DNAStringSet(reads.full, start=1:length(reads.full), width=1) |
|
61 |
+ quality.char <- character(length=length(reads.split)) |
|
62 |
+ for (base in bases) { |
|
63 |
+ idx <- which(reads.split == base) |
|
64 |
+ quality.char[idx] <- sample(rownames(qualdistr), size=length(idx), replace=TRUE, prob=qualdistr[,base]) |
|
65 |
+ } |
|
66 |
+ quality <- Biostrings::BString(paste0(quality.char, collapse='')) |
|
67 |
+ starts <- seq(1, length(reads.split)-readLength+1L, by=every.X.bp) |
|
68 |
+ quality <- Biostrings::BStringSet(quality, start=starts, width=readLength) |
|
69 |
+ remove(reads.full, reads.split) |
|
70 |
+ stopTimedMessage(ptm) |
|
71 |
+ ## Construct ID fields |
|
72 |
+ ptm <- startTimedMessage(" Constructing ID fields ...") |
|
73 |
+ id.strings <- paste0(identifier.string, '_', itotal:(itotal+length(reads)-1), ' simulated length=',readLength) |
|
74 |
+ itotal <- itotal + length(reads) |
|
75 |
+ stopTimedMessage(ptm) |
|
76 |
+ ## Rearrange into @ID\nSeq\n+ID\nQual |
|
77 |
+ ptm <- startTimedMessage(" Rearranging for writing ...") |
|
78 |
+ data <- paste0('@',id.strings,'\n', as.character(reads),'\n', '+',id.strings,'\n', as.character(quality),'\n') |
|
79 |
+ stopTimedMessage(ptm) |
|
80 |
+ ptm <- startTimedMessage(" Writing to file ...") |
|
81 |
+ cat(data, file=file.gz, sep='', append=T) |
|
82 |
+ stopTimedMessage(ptm) |
|
83 |
+ } |
|
84 |
+ close(file.gz) |
|
85 |
+ |
|
86 |
+} |
0 | 87 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,18 @@ |
1 |
+ |
|
2 |
+ |
|
3 |
+startTimedMessage <- function(...) { |
|
4 |
+ |
|
5 |
+ x <- paste0(..., collapse='') |
|
6 |
+ message(x, appendLF=FALSE) |
|
7 |
+ ptm <- proc.time() |
|
8 |
+ return(ptm) |
|
9 |
+ |
|
10 |
+} |
|
11 |
+ |
|
12 |
+ |
|
13 |
+stopTimedMessage <- function(ptm) { |
|
14 |
+ |
|
15 |
+ time <- proc.time() - ptm |
|
16 |
+ message(" ", round(time[3],2), "s") |
|
17 |
+ |
|
18 |
+} |
0 | 19 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,42 @@ |
1 |
+ |
|
2 |
+ |
|
3 |
+#' Make variably sized bins |
|
4 |
+#' |
|
5 |
+#' Make variable-width bins based on a reference BAM file. This can be a simulated file (produced by \code{\link{simulateReads}} and aligned with your favourite aligner) or a real reference. |
|
6 |
+#' |
|
7 |
+#' Variable-width bins are produced by first binning the reference BAM file with fixed-width bins and selecting the desired number of reads per bin as the (non-zero) maximum of the histogram. A new set of bins is then generated such that every bin contains the desired number of reads. |
|
8 |
+#' |
|
9 |
+#' @inheritParams bam2GRanges |
|
10 |
+#' @param binsizes A vector with binsizes. Resulting bins will be close to the specified binsizes. |
|
11 |
+#' @return A \code{list} with one \code{\link{GRanges}} object for each element in \code{binsizes}. |
|
12 |
+#' @author Aaron Taudt |
|
13 |
+#' @export |
|
14 |
+variableWidthBins <- function(bamfile, bamindex=bamfile, binsizes, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000) { |
|
15 |
+ |
|
16 |
+ ## Make fixed width bins |
|
17 |
+ reads <- bam2GRanges(bamfile, bamindex=bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, max.fragment.width=max.fragment.width) |
|
18 |
+ ptm <- startTimedMessage(paste0("Binning ", bamfile, " ...")) |
|
19 |
+ binned.list <- suppressMessages( bam2binned(reads, binsizes=binsizes, calc.complexity=FALSE, chromosomes=chromosomes) ) |
|
20 |
+ stopTimedMessage(ptm) |
|
21 |
+ |
|
22 |
+ ## Loop over binsizes |
|
23 |
+ bins.list <- list() |
|
24 |
+ for (i1 in 1:length(binsizes)) { |
|
25 |
+ binsize <- binsizes[i1] |
|
26 |
+ binned <- binned.list[[i1]] |
|
27 |
+ ## Get mode of histogram |
|
28 |
+ tab <- table(binned$counts) |
|
29 |
+ modecount <- as.integer(names(which.max(tab[names(tab)!=0]))) |
|
30 |
+ ## Pick only every modecount read |
|
31 |
+ idx <- seq(modecount, length(reads), by=modecount) |
|
32 |
+ subreads <- reads[idx] |
|
33 |
+ strand(subreads) <- '*' |
|
34 |
+ ## Make new bins |
|
35 |
+ bins <- gaps(subreads) |
|
36 |
+ bins <- bins[strand(bins)=='*'] |
|
37 |
+ bins.list[[as.character(binsize)]] <- bins |
|
38 |
+ } |
|
39 |
+ |
|
40 |
+ return(bins.list) |
|
41 |
+ |
|
42 |
+} |