Browse code

added variable-width bins, timedMessage, simulateReads

chakalakka authored on 17/02/2016 09:09:42
Showing 28 changed files

... ...
@@ -7,7 +7,7 @@ Author: Aaron Taudt, Bjorn Bakker, David Porubsky
7 7
 Maintainer: Aaron Taudt <aaron.taudt@gmail.com>
8 8
 Description: This package implements functions for CNV calling, plotting, export and analysis from whole-genome single cell sequencing data.
9 9
 Depends: R (>= 3.1.0), GenomicRanges, cowplot
10
-Imports: foreach, doParallel, BiocGenerics, IRanges, Rsamtools, Biostrings, GenomicAlignments, preseqR, reshape2, ggdendro, ReorderCluster, mclust
10
+Imports: foreach, doParallel, GenomeInfoDb, BiocGenerics, IRanges, Rsamtools, Biostrings, GenomicAlignments, preseqR, reshape2, ggdendro, ReorderCluster, mclust
11 11
 Suggests: knitr, testthat
12 12
 License: file LICENSE
13 13
 LazyLoad: yes
... ...
@@ -6,7 +6,7 @@ S3method(plot,aneuHMM)
6 6
 S3method(plot,character)
7 7
 export(Aneufinder)
8 8
 export(bam2binned)
9
-export(bed2GRanges)
9
+export(bed2binned)
10 10
 export(clusterByQuality)
11 11
 export(concGRangesInRange)
12 12
 export(correctGC)
... ...
@@ -22,13 +22,16 @@ export(getSCEcoordinates)
22 22
 export(heatmapAneuploidies)
23 23
 export(heatmapGenomewide)
24 24
 export(hotspotter)
25
+export(importBed)
25 26
 export(karyotypeMeasures)
26 27
 export(loadGRangesFromFiles)
27 28
 export(loadHmmsFromFiles)
29
+export(simulateReads)
28 30
 export(smoothBinning)
29 31
 export(stateColors)
30 32
 export(strandColors)
31 33
 export(subsetByCNVprofile)
34
+export(variableWidthBins)
32 35
 import(GenomicRanges)
33 36
 import(IRanges)
34 37
 import(cowplot)
... ...
@@ -38,8 +41,12 @@ import(ggdendro)
38 41
 import(preseqR)
39 42
 import(reshape2)
40 43
 importFrom(BiocGenerics,as.vector)
44
+importFrom(Biostrings,BString)
45
+importFrom(Biostrings,BStringSet)
46
+importFrom(Biostrings,DNAStringSet)
41 47
 importFrom(Biostrings,Views)
42 48
 importFrom(Biostrings,alphabetFrequency)
49
+importFrom(GenomeInfoDb,fetchExtendedChromInfoFromUCSC)
43 50
 importFrom(GenomicAlignments,first)
44 51
 importFrom(GenomicAlignments,readGAlignmentPairsFromBam)
45 52
 importFrom(GenomicAlignments,readGAlignmentsFromBam)
... ...
@@ -113,7 +113,7 @@ doParallel::registerDoParallel(cl)
113 113
 if (!file.exists(binpath.uncorrected)) { dir.create(binpath.uncorrected) }
114 114
 files <- list.files(inputfolder, full.names=TRUE, recursive=TRUE, pattern=paste0('.',conf[['format']],'$'))
115 115
 ### Binning ###
116
-message("Binning the data ...", appendLF=FALSE); ptm <- proc.time()
116
+ptm <- startTimedMessage("Binning the data ...")
117 117
 temp <- foreach (file = files, .packages=c('aneufinder')) %dopar% {
118 118
 	existing.binfiles <- grep(basename(file), list.files(binpath.uncorrected), value=TRUE)
119 119
 	existing.binsizes <- as.numeric(unlist(lapply(strsplit(existing.binfiles, split='binsize_|_reads.per.bin_|_\\.RData'), '[[', 2)))
... ...
@@ -143,10 +143,10 @@ temp <- foreach (file = files, .packages=c('aneufinder')) %dopar% {
143 143
 		})
144 144
 	}
145 145
 }
146
-time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
146
+stopTimedMessage(ptm)
147 147
 
148 148
 ### Export read fragments as browser file ###
149
-message("Exporting data as browser files ...", appendLF=FALSE); ptm <- proc.time()
149
+ptm <- startTimedMessage("Exporting data as browser files ...")
150 150
 if (!file.exists(readsbrowserpath)) { dir.create(readsbrowserpath) }
151 151
 readfiles <- list.files(readspath,pattern='.RData$',full.names=TRUE)
152 152
 temp <- foreach (file = readfiles, .packages=c('aneufinder')) %dopar% {
... ...
@@ -160,14 +160,14 @@ temp <- foreach (file = readfiles, .packages=c('aneufinder')) %dopar% {
160 160
 		})
161 161
 	}
162 162
 }
163
-time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
163
+stopTimedMessage(ptm)
164 164
 
165 165
 #=================
166 166
 ### Correction ###
167 167
 #=================
168 168
 if (!is.null(conf[['correction.method']])) {
169 169
 
170
-	message(paste0(conf[['correction.method']]," correction ..."), appendLF=FALSE); ptm <- proc.time()
170
+	ptm <- startTimedMessage(paste0(conf[['correction.method']]," correction ..."))
171 171
 	if (conf[['correction.method']]=='GC') {
172 172
 		binpath.corrected <- file.path(outputfolder,'binned_GC-corrected')
173 173
 		if (!file.exists(binpath.corrected)) { dir.create(binpath.corrected) }
... ...
@@ -202,7 +202,7 @@ if (!is.null(conf[['correction.method']])) {
202 202
 		}
203 203
 		binpath <- binpath.corrected
204 204
 	}
205
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
205
+	stopTimedMessage(ptm)
206 206
 
207 207
 } else {
208 208
 	binpath <- binpath.uncorrected
... ...
@@ -213,7 +213,7 @@ if (!is.null(conf[['correction.method']])) {
213 213
 #===============
214 214
 if ('univariate' %in% conf[['method']]) {
215 215
 
216
-	message("Running univariate HMMs ...", appendLF=FALSE); ptm <- proc.time()
216
+	ptm <- startTimedMessage("Running univariate HMMs ...")
217 217
 	if (!file.exists(CNVpath)) { dir.create(CNVpath) }
218 218
 
219 219
 	files <- list.files(binpath, full.names=TRUE, recursive=TRUE, pattern='.RData$')
... ...
@@ -228,7 +228,7 @@ if ('univariate' %in% conf[['method']]) {
228 228
 			stop(file,'\n',err)
229 229
 		})
230 230
 	}
231
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
231
+	stopTimedMessage(ptm)
232 232
 
233 233
 	#===================
234 234
 	### Plotting CNV ###
... ...
@@ -241,7 +241,7 @@ if ('univariate' %in% conf[['method']]) {
241 241
 	#-----------------------
242 242
 	## Plot distributions ##
243 243
 	#-----------------------
244
-	message("Plotting distributions ...", appendLF=FALSE); ptm <- proc.time()
244
+	ptm <- startTimedMessage("Plotting distributions ...")
245 245
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
246 246
 		savename <- file.path(CNVplotpath,paste0('distributions_',sub('_$','',pattern),'.pdf'))
247 247
 		if (!file.exists(savename)) {
... ...
@@ -259,11 +259,11 @@ if ('univariate' %in% conf[['method']]) {
259 259
 			d <- dev.off()
260 260
 		}
261 261
 	}
262
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
262
+	stopTimedMessage(ptm)
263 263
 	#------------------
264 264
 	## Plot heatmaps ##
265 265
 	#------------------
266
-	message("Plotting genomewide heatmaps ...", appendLF=FALSE); ptm <- proc.time()
266
+	ptm <- startTimedMessage("Plotting genomewide heatmaps ...")
267 267
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
268 268
 		ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
269 269
 		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE)
... ...
@@ -276,8 +276,8 @@ if ('univariate' %in% conf[['method']]) {
276 276
 			warning("Plotting genomewide heatmaps: No files for pattern ",pattern," found.")
277 277
 		}
278 278
 	}
279
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
280
-	message("Plotting chromosome heatmaps ...", appendLF=FALSE); ptm <- proc.time()
279
+	stopTimedMessage(ptm)
280
+	ptm <- startTimedMessage("Plotting chromosome heatmaps ...")
281 281
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
282 282
 		ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
283 283
 		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE)
... ...
@@ -293,11 +293,11 @@ if ('univariate' %in% conf[['method']]) {
293 293
 			warning("Plotting chromosome heatmaps: No files for pattern ",pattern," found.")
294 294
 		}
295 295
 	}
296
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
296
+	stopTimedMessage(ptm)
297 297
 	#------------------
298 298
 	## Plot arrayCGH ##
299 299
 	#------------------
300
-	message("Making arrayCGH plots ...", appendLF=FALSE); ptm <- proc.time()
300
+	ptm <- startTimedMessage("Making arrayCGH plots ...")
301 301
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
302 302
 		savename <- file.path(CNVplotpath,paste0('arrayCGH_',sub('_$','',pattern),'.pdf'))
303 303
 		if (!file.exists(savename)) {
... ...
@@ -315,11 +315,11 @@ if ('univariate' %in% conf[['method']]) {
315 315
 			d <- dev.off()
316 316
 		}
317 317
 	}
318
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
318
+	stopTimedMessage(ptm)
319 319
 	#--------------------
320 320
 	## Plot karyograms ##
321 321
 	#--------------------
322
-	message("Plotting karyograms ...", appendLF=FALSE); ptm <- proc.time()
322
+	ptm <- startTimedMessage("Plotting karyograms ...")
323 323
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
324 324
 		savename <- file.path(CNVplotpath,paste0('karyograms_',sub('_$','',pattern),'.pdf'))
325 325
 		if (!file.exists(savename)) {
... ...
@@ -337,11 +337,11 @@ if ('univariate' %in% conf[['method']]) {
337 337
 			d <- dev.off()
338 338
 		}
339 339
 	}
340
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
340
+	stopTimedMessage(ptm)
341 341
 	#-------------------------
342 342
 	## Export browser files ##
343 343
 	#-------------------------
344
-	message("Exporting browser files ...", appendLF=FALSE); ptm <- proc.time()
344
+	ptm <- startTimedMessage("Exporting browser files ...")
345 345
 	if (!file.exists(CNVbrowserpath)) { dir.create(CNVbrowserpath) }
346 346
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
347 347
 		savename <- file.path(CNVbrowserpath,sub('_$','',pattern))
... ...
@@ -351,7 +351,7 @@ if ('univariate' %in% conf[['method']]) {
351 351
 			exportCNVs(ifiles, filename=savename, cluster=conf[['cluster.plots']], export.CNV=TRUE, export.SCE=FALSE)
352 352
 		}
353 353
 	}
354
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
354
+	stopTimedMessage(ptm)
355 355
 }
356 356
 
357 357
 #===============
... ...
@@ -359,7 +359,7 @@ if ('univariate' %in% conf[['method']]) {
359 359
 #===============
360 360
 if ('bivariate' %in% conf[['method']]) {
361 361
 
362
-	message("Running bivariate HMMs ...", appendLF=FALSE); ptm <- proc.time()
362
+	ptm <- startTimedMessage("Running bivariate HMMs ...")
363 363
 	if (!file.exists(SCEpath)) { dir.create(SCEpath) }
364 364
 
365 365
 	files <- list.files(binpath, full.names=TRUE, recursive=TRUE, pattern='.RData$')
... ...
@@ -371,24 +371,24 @@ if ('bivariate' %in% conf[['method']]) {
371 371
 				## Add SCE coordinates to model
372 372
 				reads.file <- file.path(readspath, paste0(model$ID,'.RData'))
373 373
 				model$sce <- suppressMessages( getSCEcoordinates(model, resolution=conf[['resolution']], min.segwidth=conf[['min.segwidth']], fragments=reads.file, min.reads=conf[['min.reads']]) )
374
-				message("Saving to file ",savename," ...", appendLF=FALSE); ptm <- proc.time()
374
+				ptm <- startTimedMessage("Saving to file ",savename," ...")
375 375
 				save(model, file=savename)
376
-				time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
376
+				stopTimedMessage(ptm)
377 377
 # 			} else {
378 378
 # 				model <- get(load(savename))
379 379
 # 				model$sce <- suppressMessages( getSCEcoordinates(model, resolution=conf[['resolution']], min.segwidth=conf[['min.segwidth']]) )
380
-# 				message("Saving to file ",savename," ...", appendLF=FALSE); ptm <- proc.time()
380
+# 				ptm <- startTimedMessage("Saving to file ",savename," ...")
381 381
 # 				save(model, file=savename)
382
-# 				time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
382
+# 				stopTimedMessage(ptm)
383 383
 			}
384 384
 		}, error = function(err) {
385 385
 			stop(file,'\n',err)
386 386
 		})
387 387
 	}
388
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
388
+	stopTimedMessage(ptm)
389 389
 
390 390
 	### Finding hotspots ###
391
-	message("Finding SCE hotspots ...", appendLF=FALSE); ptm <- proc.time()
391
+	ptm <- startTimedMessage("Finding SCE hotspots ...")
392 392
 	hotspots <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
393 393
 		ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
394 394
 		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE)
... ...
@@ -401,7 +401,7 @@ if ('bivariate' %in% conf[['method']]) {
401 401
 		return(hotspot)
402 402
 	}
403 403
 	names(hotspots) <- patterns
404
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
404
+	stopTimedMessage(ptm)
405 405
 
406 406
 	#===================
407 407
 	### Plotting SCE ###
... ...
@@ -414,7 +414,7 @@ if ('bivariate' %in% conf[['method']]) {
414 414
 	#-----------------------
415 415
 	## Plot distributions ##
416 416
 	#-----------------------
417
-	message("Plotting distributions ...", appendLF=FALSE); ptm <- proc.time()
417
+	ptm <- startTimedMessage("Plotting distributions ...")
418 418
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
419 419
 		savename <- file.path(SCEplotpath,paste0('distributions_',sub('_$','',pattern),'.pdf'))
420 420
 		if (!file.exists(savename)) {
... ...
@@ -432,11 +432,11 @@ if ('bivariate' %in% conf[['method']]) {
432 432
 			d <- dev.off()
433 433
 		}
434 434
 	}
435
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
435
+	stopTimedMessage(ptm)
436 436
 	#------------------
437 437
 	## Plot heatmaps ##
438 438
 	#------------------
439
-	message("Plotting heatmaps ...", appendLF=FALSE); ptm <- proc.time()
439
+	ptm <- startTimedMessage("Plotting heatmaps ...")
440 440
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
441 441
 		ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
442 442
 		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=TRUE)
... ...
@@ -464,11 +464,11 @@ if ('bivariate' %in% conf[['method']]) {
464 464
 			warning("Plotting chromosome heatmaps: No files for pattern ",pattern," found.")
465 465
 		}
466 466
 	}
467
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
467
+	stopTimedMessage(ptm)
468 468
 	#------------------
469 469
 	## Plot arrayCGH ##
470 470
 	#------------------
471
-	message("Making arrayCGH plots ...", appendLF=FALSE); ptm <- proc.time()
471
+	ptm <- startTimedMessage("Making arrayCGH plots ...")
472 472
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
473 473
 		savename <- file.path(SCEplotpath,paste0('arrayCGH_',sub('_$','',pattern),'.pdf'))
474 474
 		if (!file.exists(savename)) {
... ...
@@ -486,11 +486,11 @@ if ('bivariate' %in% conf[['method']]) {
486 486
 			d <- dev.off()
487 487
 		}
488 488
 	}
489
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
489
+	stopTimedMessage(ptm)
490 490
 	#--------------------
491 491
 	## Plot karyograms ##
492 492
 	#--------------------
493
-	message("Plotting karyograms ...", appendLF=FALSE); ptm <- proc.time()
493
+	ptm <- startTimedMessage("Plotting karyograms ...")
494 494
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
495 495
 		savename <- file.path(SCEplotpath,paste0('karyograms_',sub('_$','',pattern),'.pdf'))
496 496
 		if (!file.exists(savename)) {
... ...
@@ -508,11 +508,11 @@ if ('bivariate' %in% conf[['method']]) {
508 508
 			d <- dev.off()
509 509
 		}
510 510
 	}
511
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
511
+	stopTimedMessage(ptm)
512 512
 	#-------------------------
513 513
 	## Export browser files ##
514 514
 	#-------------------------
515
-	message("Exporting browser files ...", appendLF=FALSE); ptm <- proc.time()
515
+	ptm <- startTimedMessage("Exporting browser files ...")
516 516
 	if (!file.exists(SCEbrowserpath)) { dir.create(SCEbrowserpath) }
517 517
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
518 518
 		savename <- file.path(SCEbrowserpath,sub('_$','',pattern))
... ...
@@ -526,7 +526,7 @@ if ('bivariate' %in% conf[['method']]) {
526 526
 			exportGRanges(hotspots[[pattern]], filename=savename, trackname=basename(savename), score=hotspots[[pattern]]$num.events)
527 527
 		}
528 528
 	}
529
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
529
+	stopTimedMessage(ptm)
530 530
 }
531 531
 
532 532
 stopCluster(cl)
... ...
@@ -18,14 +18,15 @@ NULL
18 18
 #' @describeIn binning Bin reads in BAM format
19 19
 #' @inheritParams align2binned
20 20
 #' @inheritParams bam2GRanges
21
+#' @param bamfile A file in BAM format. Alternatively a \code{\link{GRanges}} with aligned reads.
21 22
 #' @export
22
-bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, max.fragment.width=1000, outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
23
+bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, max.fragment.width=1000, outputfolder.binned="binned_data", binsizes=NULL, variable.width.reference=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
23 24
 	call <- match.call()
24 25
 	underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='')
25 26
 	message("\n",call[[1]],"():")
26 27
 	message(underline)
27 28
 	ptm <- proc.time()
28
-	binned.data <- align2binned(bamfile, format="bam", ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, max.fragment.width=max.fragment.width, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
29
+	binned.data <- align2binned(bamfile, format="bam", assembly=NULL, ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, max.fragment.width=max.fragment.width, outputfolder.binned=outputfolder.binned, binsizes=binsizes, variable.width.reference=variable.width.reference, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
29 30
 	time <- proc.time() - ptm
30 31
 	message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s")
31 32
 	return(binned.data)
... ...
@@ -33,14 +34,16 @@ bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEn
33 34
 
34 35
 #' @describeIn binning Bin reads in BED format
35 36
 #' @inheritParams align2binned
36
-#' @param bedfile A file in BED format.
37
-bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
37
+#' @inheritParams bed2GRanges
38
+#' @param bedfile A file in BED format. Alternatively a \code{\link{GRanges}} with aligned reads.
39
+#' @export
40
+bed2binned <- function(bedfile, assembly, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, variable.width.reference=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
38 41
 	call <- match.call()
39 42
 	underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='')
40 43
 	message("\n",call[[1]],"():")
41 44
 	message(underline)
42 45
 	ptm <- proc.time()
43
-	binned.data <- align2binned(bedfile, format="bed", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
46
+	binned.data <- align2binned(bedfile, format="bed", assembly=assembly, ID=ID, outputfolder.binned=outputfolder.binned, binsizes=binsizes, variable.width.reference=variable.width.reference, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
44 47
 	time <- proc.time() - ptm
45 48
 	message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s")
46 49
 	return(binned.data)
... ...
@@ -50,13 +53,14 @@ bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputf
50 53
 #'
51 54
 #' Convert aligned reads in .bam or .bed format into read counts in equidistant windows.
52 55
 #'
53
-#' @param file A file with aligned reads.
56
+#' @param file A file with aligned reads. Alternatively a \code{\link{GRanges}} with aligned reads.
54 57
 #' @param format One of \code{c('bam', 'bed')}.
55 58
 #' @param ID An identifier that will be used to identify the file throughout the workflow and in plotting.
56 59
 #' @inheritParams bam2GRanges
57
-#' @param chrom.length.file A file which contains the chromosome lengths in basepairs. The first column contains the chromosome name and the second column the length (see also \code{\link{chrom.length.file}}).
60
+#' @inheritParams bed2GRanges
58 61
 #' @param outputfolder.binned Folder to which the binned data will be saved. If the specified folder does not exist, it will be created.
59 62
 #' @param binsizes An integer vector with bin sizes. If more than one value is given, output files will be produced for each bin size.
63
+#' @param variable.width.reference A BAM file that is used as reference to produce variable width bins. See \code{\link{variableWidthBins}} for details.
60 64
 #' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly. Output files are produced for each value.
61 65
 #' @param stepsize Fraction of the binsize that the sliding window is offset at each step. Example: If \code{stepsize=0.1} and \code{binsizes=c(200000,500000)}, the actual stepsize in basepairs is 20000 and 50000, respectively.
62 66
 #' @param chromosomes If only a subset of the chromosomes should be binned, specify them here.
... ...
@@ -69,7 +73,7 @@ bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputf
69 73
 #' @param reads.overwrite Whether or not an existing file with read fragments should be overwritten.
70 74
 #' @param reads.only If \code{TRUE} only read fragments are stored and/or returned and no binning is done.
71 75
 #' @return The function produces a \link{GRanges} object with one meta data column 'reads' that contains the read count. This binned data will be either written to file (\code{save.as.RData=FALSE}) or given as return value (\code{save.as.RData=FALSE}).
72
-align2binned <- function(file, format, ID=basename(file), bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, chrom.length.file, outputfolder.binned="binned_data", binsizes=200000, reads.per.bin=NULL, stepsize=NULL, save.as.RData=FALSE, calc.complexity=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
76
+align2binned <- function(file, format, assembly, ID=basename(file), bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, outputfolder.binned="binned_data", binsizes=200000, variable.width.reference=NULL, reads.per.bin=NULL, stepsize=NULL, save.as.RData=FALSE, calc.complexity=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
73 77
 
74 78
 	## Check user input
75 79
 	if (reads.return==FALSE & reads.only==FALSE) {
... ...
@@ -83,32 +87,34 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
83 87
 		dir.create(outputfolder.binned)
84 88
 	}
85 89
 
86
-	### Read in the data
87
-	message("Reading file ",basename(file)," ...", appendLF=FALSE); ptm <- proc.time()
88
-	## BED (0-based)
89
-	if (format == "bed") {
90
-		# File with chromosome lengths (1-based)
91
-		chrom.lengths.df <- read.table(chrom.length.file)
92
-		chrom.lengths <- chrom.lengths.df[,2]
93
-		names(chrom.lengths) <- chrom.lengths.df[,1]
94
-		# File with reads, determine classes first for faster import (0-based)
95
-		tab5rows <- read.table(file, nrows=5)
96
-		classes.in.bed <- sapply(tab5rows, class)
97
-		classes <- rep("NULL",length(classes.in.bed))
98
-		classes[c(1:3,6)] <- classes.in.bed[c(1:3,6)]
99
-		data <- read.table(file, colClasses=classes)
100
-		# Convert to GRanges object
101
-		data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data)))	# start+1 to go from [0,x) -> [1,x]
102
-		seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
103
-	## BAM (1-based)
104
-	} else if (format == "bam") {
105
-		if (calc.complexity || !remove.duplicate.reads) {
106
-			data <- suppressMessages( bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) )
107
-		} else {
108
-			data <- suppressMessages( bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width) )
90
+	if (is.character(file)) {
91
+		### Read in the data
92
+		if (format == "bed") {
93
+			## BED (0-based)
94
+			if (calc.complexity || !remove.duplicate.reads) {
95
+				data <- bed2GRanges(file, assembly=assembly, chromosomes=chromosomes, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width)
96
+			} else {
97
+				data <- bed2GRanges(file, assembly=assembly, chromosomes=chromosomes, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width)
98
+			}
99
+		} else if (format == "bam") {
100
+			## BAM (1-based)
101
+			if (calc.complexity || !remove.duplicate.reads) {
102
+				data <- bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width)
103
+			} else {
104
+				data <- bam2GRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width)
105
+			}
106
+		}
107
+	} else if (is(file, 'GRanges')) {
108
+		data <- file
109
+		err <- tryCatch({
110
+			!is.character(ID)
111
+		}, error = function(err) {
112
+			TRUE
113
+		})
114
+		if (err) {
115
+			ID <- 'GRanges'
109 116
 		}
110 117
 	}
111
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
112 118
 
113 119
 	## Select chromosomes to bin
114 120
 	if (is.null(chromosomes)) {
... ...
@@ -119,16 +125,18 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
119 125
 	### Complexity estimation ###
120 126
 	complexity <- NULL
121 127
 	if (calc.complexity) {
122
-		message("  calculating complexity ...", appendLF=FALSE); ptm <- proc.time()
128
+		ptm <- startTimedMessage("Calculating complexity ...")
123 129
 		complexity <- suppressMessages( estimateComplexity(data) )
124
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
130
+		stopTimedMessage(ptm)
125 131
 	}
126 132
 	if (remove.duplicate.reads) {
133
+		ptm <- startTimedMessage("Removing duplicate reads ...")
127 134
 		sp <- start(data)[as.logical(strand(data)=='+')]
128 135
 		sp1 <- c(sp[length(sp)], sp[-length(sp)])
129 136
 		sm <- start(data)[as.logical(strand(data)=='-')]
130 137
 		sm1 <- c(sm[length(sm)], sm[-length(sm)])
131 138
 		data <- c(data[strand(data)=='+'][sp!=sp1], data[strand(data)=='-'][sm!=sm1])
139
+		stopTimedMessage(ptm)
132 140
 	}
133 141
 
134 142
 	### Return fragments if desired ###
... ...
@@ -136,7 +144,9 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
136 144
 		if (!file.exists(outputfolder.reads)) { dir.create(outputfolder.reads) }
137 145
 		filename <- file.path(outputfolder.reads,paste0(ID,'.RData'))
138 146
 		if (reads.overwrite | !file.exists(filename)) {
147
+			ptm <- startTimedMessage(paste0("Saving fragments to ", filename, " ..."))
139 148
 			save(data, file=filename)
149
+			stopTimedMessage(ptm)
140 150
 		}
141 151
 	}
142 152
 	if (reads.return) {
... ...
@@ -147,20 +157,22 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
147 157
 	}
148 158
 
149 159
 	### Coverage and percentage of genome covered ###
160
+	ptm <- startTimedMessage("Calculating coverage ...")
150 161
 	genome.length <- sum(as.numeric(seqlengths(data)))
151 162
 	data.strand <- data
152 163
 	strand(data.strand) <- '*'
153 164
 	coverage <- sum(as.numeric(width(data.strand))) / genome.length
154 165
 	genome.covered <- sum(as.numeric(width(reduce(data.strand)))) / genome.length
155 166
 	## Per chromosome
156
-	coverage.per.chrom <- vector()
157
-	genome.covered.per.chrom <- vector()
167
+	coverage.per.chrom <- numeric(length=length(chroms2use))
168
+	genome.covered.per.chrom <- numeric(length=length(chroms2use))
158 169
 	for (chr in chroms2use) {
159 170
 		data.strand.chr <- data.strand[seqnames(data.strand)==chr]
160 171
 		coverage.per.chrom[chr] <- sum(as.numeric(width(data.strand.chr))) / seqlengths(data.strand)[chr]
161 172
 		genome.covered.per.chrom[chr] <- sum(as.numeric(width(reduce(data.strand.chr)))) / seqlengths(data.strand)[chr]
162 173
 	}
163 174
 	coverage <- list(coverage=coverage, genome.covered=genome.covered, coverage.per.chrom=coverage.per.chrom, genome.covered.per.chrom=genome.covered.per.chrom)
175
+	stopTimedMessage(ptm)
164 176
 
165 177
 
166 178
 	## Pad binsizes and reads.per.bin with each others value
... ...
@@ -171,56 +183,67 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
171 183
 	reads.per.bin <- c(reads.per.bin.add, reads.per.bin)
172 184
 	length.binsizes <- length(binsizes)
173 185
 
186
+	### Variable width bins ###
187
+	if (!is.null(variable.width.reference)) {
188
+		message("Making variable width bins:")
189
+		bins.list <- variableWidthBins(variable.width.reference, bamindex=variable.width.reference, binsizes=binsizes, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, max.fragment.width=max.fragment.width)
190
+		message("Finished making variable width bins.")
191
+	}
174 192
 	### Loop over all binsizes ###
175 193
 	data.plus <- data[strand(data)=='+']
176 194
 	data.minus <- data[strand(data)=='-']
177 195
 	skipped.chroms <- character()
196
+	binned.data.list <- list()
178 197
 	for (ibinsize in 1:length.binsizes) {
179 198
 		binsize <- binsizes[ibinsize]
180 199
 		readsperbin <- reads.per.bin[ibinsize]
181 200
 		message("Binning into bin size ",binsize," with on average ",readsperbin," reads per bin")
182 201
 
183
-		### Loop over chromosomes ###
184
-		message("  binning genome ...", appendLF=FALSE); ptm <- proc.time()
185
-		binned.data <- GenomicRanges::GRangesList()
186
-		for (chromosome in chroms2use) {
187
-
188
-			## Check last incomplete bin
189
-			incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0
190
-			if (incomplete.bin) {
191
-				numbin <- floor(seqlengths(data)[chromosome]/binsize)	# floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end
192
-			} else {
193
-				numbin <- seqlengths(data)[chromosome]/binsize
202
+		if (is.null(variable.width.reference)) {
203
+			### Fixed width bins ###
204
+			ptm <- startTimedMessage("Making fixed-width bins ...")
205
+			binned.data <- GenomicRanges::GRangesList()
206
+			## Loop over chromosomes
207
+			for (chromosome in chroms2use) {
208
+				## Check last incomplete bin
209
+				incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0
210
+				if (incomplete.bin) {
211
+					numbin <- floor(seqlengths(data)[chromosome]/binsize)	# floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end
212
+				} else {
213
+					numbin <- seqlengths(data)[chromosome]/binsize
214
+				}
215
+				if (numbin == 0) {
216
+					skipped.chroms[chromosome] <- chromosome
217
+					next
218
+				}
219
+				## Initialize vectors
220
+				chroms <- rep(chromosome,numbin)
221
+				reads <- rep(0,numbin)
222
+				start <- seq(from=1, by=binsize, length.out=numbin)
223
+				end <- seq(from=binsize, by=binsize, length.out=numbin)
224
+	# 			end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired
225
+
226
+				## Create binned chromosome as GRanges object
227
+				i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbin),
228
+								ranges = IRanges(start=start, end=end),
229
+								strand = Rle(strand("*"), numbin)
230
+								)
231
+				seqlengths(i.binned.data) <- seqlengths(data)[chromosome]
232
+
233
+				suppressWarnings(
234
+					binned.data[[chromosome]] <- i.binned.data
235
+				)
236
+
237
+			} ## end loop chromosomes
238
+			if (length(skipped.chroms)>0) {
239
+				warning("Chromosomes ", paste0(skipped.chroms, collapse=', '), " are smaller than binsize. Skipped.")
194 240
 			}
195
-			if (numbin == 0) {
196
-				skipped.chroms[chromosome] <- chromosome
197
-				next
198
-			}
199
-			## Initialize vectors
200
-			chroms <- rep(chromosome,numbin)
201
-			reads <- rep(0,numbin)
202
-			start <- seq(from=1, by=binsize, length.out=numbin)
203
-			end <- seq(from=binsize, by=binsize, length.out=numbin)
204
-# 			end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired
205
-
206
-			## Create binned chromosome as GRanges object
207
-			i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbin),
208
-							ranges = IRanges(start=start, end=end),
209
-							strand = Rle(strand("*"), numbin)
210
-							)
211
-			seqlengths(i.binned.data) <- seqlengths(data)[chromosome]
212
-
213
-			suppressWarnings(
214
-				binned.data[[chromosome]] <- i.binned.data
215
-			)
216
-
217
-		} ### end loop chromosomes ###
218
-		if (length(skipped.chroms)>0) {
219
-			warning("Chromosomes ", paste0(skipped.chroms, collapse=', '), " are smaller than binsize. Skipped.")
241
+			binned.data <- unlist(binned.data)
242
+			names(binned.data) <- NULL
243
+			stopTimedMessage(ptm)
244
+		} else {
245
+			binned.data <- bins.list[[ibinsize]]
220 246
 		}
221
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
222
-		binned.data <- unlist(binned.data)
223
-		names(binned.data) <- NULL
224 247
 
225 248
 		### Loop over offsets ###
226 249
 		countmatrices <- list()
... ...
@@ -231,7 +254,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
231 254
 		}
232 255
 		for (ioff in offsets) {
233 256
 			## Count overlaps
234
-			message(paste0("  counting overlaps for offset ",ioff," ..."), appendLF=FALSE); ptm <- proc.time()
257
+			ptm <- startTimedMessage(paste0("Counting overlaps for offset ",ioff," ..."))
235 258
 			binned.data.shift <- suppressWarnings( shift(binned.data, shift=ioff) )
236 259
 			if (format=="bam" | format=="bed") {
237 260
 				mcounts <- suppressWarnings( GenomicRanges::countOverlaps(binned.data.shift, data.minus) )
... ...
@@ -241,7 +264,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
241 264
 			countmatrix <- matrix(c(counts,mcounts,pcounts), ncol=3)
242 265
 			colnames(countmatrix) <- c('counts','mcounts','pcounts')
243 266
 			countmatrices[[as.character(ioff)]] <- countmatrix
244
-			time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
267
+			stopTimedMessage(ptm)
245 268
 		}
246 269
 		mcols(binned.data) <- as(countmatrices[['0']],'DataFrame')
247 270
 		attr(binned.data,'offset.counts') <- countmatrices
... ...
@@ -263,17 +286,21 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, chromos
263 286
 		if (save.as.RData==TRUE) {
264 287
 			# Save to file
265 288
 			filename <- paste0(ID,"_binsize_",format(binsize, scientific=TRUE, trim=TRUE),"_reads.per.bin_",readsperbin,"_.RData")
266
-			message("Saving to file ...", appendLF=FALSE); ptm <- proc.time()
289
+			ptm <- startTimedMessage("Saving to file ...")
267 290
 # 			attr(binned.data, 'call') <- call # do not store along with GRanges because it inflates disk usage
268 291
 			save(binned.data, file=file.path(outputfolder.binned,filename) )
269
-			time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
292
+			stopTimedMessage(ptm)
270 293
 		} else {
271 294
 # 			attr(binned.data, 'call') <- call
272
-			return(binned.data)
295
+			binned.data.list[[as.character(binsize)]] <- binned.data
273 296
 		}
274 297
 
275 298
 	} ### end loop binsizes ###
276 299
 
300
+	if (!save.as.RData) {
301
+		return(binned.data.list)
302
+	}
303
+
277 304
 
278 305
 }
279 306
 
... ...
@@ -376,16 +403,17 @@ estimateComplexity <- function(reads) {
376 403
 #'
377 404
 #' Import aligned reads from a BAM file into a \code{\link{GRanges}} object.
378 405
 #'
379
-#' @param bamfile A file in BAM format.
406
+#' @param bamfile A sorted BAM file.
380 407
 #' @param bamindex BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued.
381 408
 #' @param chromosomes If only a subset of the chromosomes should be imported, specify them here.
382 409
 #' @param pairedEndReads Set to \code{TRUE} if you have paired-end reads in your file.
383 410
 #' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
384 411
 #' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=NULL} to keep all reads.
385 412
 #' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments due to mapping errors of paired end reads.
413
+#' @param what A character vector of fields that are returned. Type \code{\link[Rsamtools]{scanBamWhat}} to see what is available.
386 414
 #' @importFrom Rsamtools indexBam scanBamHeader ScanBamParam scanBamFlag
387 415
 #' @importFrom GenomicAlignments readGAlignmentPairsFromBam readGAlignmentsFromBam first
388
-bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000) {
416
+bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000, what='mapq') {
389 417
 
390 418
 	## Check if bamindex exists
391 419
 	bamindex.raw <- sub('\\.bai$', '', bamindex)
... ...
@@ -412,21 +440,25 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe
412 440
 		diffs <- paste0(diff, collapse=', ')
413 441
 		warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
414 442
 	}
443
+
415 444
 	## Import the file into GRanges
445
+	ptm <- startTimedMessage("Reading file ",basename(bamfile)," ...")
416 446
 	gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
417 447
 	if (!remove.duplicate.reads) {
418 448
 		if (pairedEndReads) {
419
-			data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq'))
449
+			data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what))
420 450
 		} else {
421
-			data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq'))
451
+			data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what))
422 452
 		}
423 453
 	} else {
424 454
 		if (pairedEndReads) {
425
-			data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=FALSE)))
455
+			data.raw <- GenomicAlignments::readGAlignmentPairsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=scanBamFlag(isDuplicate=FALSE)))
426 456
 		} else {
427
-			data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what='mapq', flag=scanBamFlag(isDuplicate=FALSE)))
457
+			data.raw <- GenomicAlignments::readGAlignmentsFromBam(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=scanBamFlag(isDuplicate=FALSE)))
428 458
 		}
429 459
 	}
460
+	stopTimedMessage(ptm)
461
+
430 462
 	if (length(data.raw) == 0) {
431 463
 		if (pairedEndReads) {
432 464
 			stop(paste0("No reads imported. Does your file really contain paired end reads? Try with 'pairedEndReads=FALSE'"))
... ...
@@ -435,11 +467,11 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe
435 467
 	}
436 468
 	## Filter by mapping quality
437 469
 	if (pairedEndReads) {
438
-		message("Converting to GRanges ...", appendLF=FALSE); ptm <- proc.time()
470
+		ptm <- startTimedMessage("Converting to GRanges ...")
439 471
 		data <- as(data.raw, 'GRanges') # treat as one fragment
440
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
472
+		stopTimedMessage(ptm)
441 473
 
442
-		message("Filtering reads ...", appendLF=FALSE); ptm <- proc.time()
474
+		ptm <- startTimedMessage("Filtering reads ...")
443 475
 		if (!is.null(min.mapq)) {
444 476
 			mapq.first <- mcols(GenomicAlignments::first(data.raw))$mapq
445 477
 			mapq.last <- mcols(GenomicAlignments::last(data.raw))$mapq
... ...
@@ -448,16 +480,16 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe
448 480
 				warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
449 481
 			}
450 482
 			data <- data[which(mapq.mask)]
451
-			# Filter out too long fragments
452
-			data <- data[width(data)<=max.fragment.width]
453 483
 		}
454
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
484
+		# Filter out too long fragments
485
+		data <- data[width(data)<=max.fragment.width]
486
+		stopTimedMessage(ptm)
455 487
 	} else {
456
-		message("Converting to GRanges ...", appendLF=FALSE); ptm <- proc.time()
488
+		ptm <- startTimedMessage("Converting to GRanges ...")
457 489
 		data <- as(data.raw, 'GRanges')
458
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
490
+		stopTimedMessage(ptm)
459 491
 
460
-		message("Filtering reads ...", appendLF=FALSE); ptm <- proc.time()
492
+		ptm <- startTimedMessage("Filtering reads ...")
461 493
 		if (!is.null(min.mapq)) {
462 494
 			if (any(is.na(mcols(data)$mapq))) {
463 495
 				warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
... ...
@@ -465,8 +497,87 @@ bam2GRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndRe
465 497
 			}
466 498
 			data <- data[mcols(data)$mapq >= min.mapq]
467 499
 		}
468
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
500
+		# Filter out too long fragments
501
+		data <- data[width(data)<=max.fragment.width]
502
+		stopTimedMessage(ptm)
503
+	}
504
+
505
+	return(data)
506
+
507
+}
508
+
509
+
510
+#' Import BED file into GRanges
511
+#'
512
+#' Import aligned reads from a BED file into a \code{\link{GRanges}} object.
513
+#'
514
+#' @param bedfile A file with aligned reads in BED format. The columns have to be c('chromosome','start','end','description','mapq','strand').
515
+#' @param assembly Please see \code{\link[GenomeInfoDb]{fetchExtendedChromInfoFromUCSC}} for available assemblies.
516
+#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here.
517
+#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
518
+#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=NULL} to keep all reads.
519
+#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments.
520
+#' @importFrom GenomeInfoDb fetchExtendedChromInfoFromUCSC
521
+bed2GRanges <- function(bedfile, assembly, chromosomes=NULL, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000) {
522
+
523
+	# File with reads, specify classes for faster import (0-based)
524
+	ptm <- startTimedMessage("Reading file ",basename(bedfile)," ...")
525
+	classes <- c('character','numeric','numeric','NULL','integer','character')
526
+	data.raw <- read.table(bedfile, colClasses=classes)
527
+	# Convert to GRanges object
528
+	data <- GenomicRanges::GRanges(seqnames=Rle(data.raw[,1]), ranges=IRanges(start=data.raw[,2]+1, end=data.raw[,3]), strand=data.raw[,5])	# start+1 to go from [0,x) -> [1,x]
529
+	mcols(data)$mapq <- data.raw[,4]
530
+	remove(data.raw)
531
+	chroms.in.data <- seqlevels(data)
532
+	stopTimedMessage(ptm)
533
+	# Get chromosome lengths
534
+	ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...")
535
+	df <- GenomeInfoDb::fetchExtendedChromInfoFromUCSC(assembly)
536
+	chrom.lengths <- df$UCSC_seqlengths
537
+	if (grepl('^chr',seqlevels(data)[1])) {
538
+		names(chrom.lengths) <- df$UCSC_seqlevels
539
+	} else {
540
+		names(chrom.lengths) <- df$NCBI_seqlevels
541
+	}
542
+	seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
543
+	stopTimedMessage(ptm)
544
+
545
+	## Issue warnings and stuff for non-existing chromosomes
546
+	chroms.in.data <- names(chrom.lengths)
547
+	if (is.null(chromosomes)) {
548
+		chromosomes <- chroms.in.data
549
+	}
550
+	chroms2use <- intersect(chromosomes, chroms.in.data)
551
+	if (length(chroms2use)==0) {
552
+		chrstring <- paste0(chromosomes, collapse=', ')
553
+		stop('The specified chromosomes ', chrstring, ' do not exist in the data.')
554
+	}
555
+	## Issue warning for non-existent chromosomes
556
+	diff <- setdiff(chromosomes, chroms.in.data)
557
+	if (length(diff)>0) {
558
+		diffs <- paste0(diff, collapse=', ')
559
+		warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
560
+	}
561
+
562
+	## Select only desired chromosomes
563
+	data <- data[seqnames(data) %in% chroms2use]
564
+	if (length(data) == 0) {
565
+		stop(paste0('No reads imported!'))
566
+	}
567
+
568
+	## Filter by mapping quality
569
+	ptm <- startTimedMessage("Filtering reads ...")
570
+	if (!is.null(min.mapq)) {
571
+		if (any(is.na(mcols(data)$mapq))) {
572
+			warning(paste0(bedfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
573
+			mcols(data)$mapq[is.na(mcols(data)$mapq)] <- -1
574
+		}
575
+		data <- data[mcols(data)$mapq >= min.mapq]
469 576
 	}
577
+	# Filter out too long fragments
578
+	data <- data[width(data)<=max.fragment.width]
579
+	stopTimedMessage(ptm)
580
+
470 581
 	return(data)
471 582
 
472 583
 }
... ...
@@ -34,7 +34,7 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.GC.content=FALSE) {
34 34
 
35 35
 		## Calculate GC content per bin
36 36
 		if (same.GC.content & !same.GC.calculated | !same.GC.content) {
37
-			message("Calculating GC content per bin ...", appendLF=FALSE); ptm <- proc.time()
37
+			ptm <- startTimedMessage("Calculating GC content per bin ...")
38 38
 			GC.content <- list()
39 39
 			for (chrom in seqlevels(binned.data)) {
40 40
 				if (!grepl('^chr',chrom)) {
... ...
@@ -57,12 +57,12 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.GC.content=FALSE) {
57 57
 			}
58 58
 			GC.content <- unlist(GC.content)
59 59
 			same.GC.calculated <- TRUE
60
-			time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
60
+			stopTimedMessage(ptm)
61 61
 		}
62 62
 		binned.data$GC <- GC.content
63 63
 
64 64
 		### GC correction ###
65
-		message("GC correction ...", appendLF=FALSE); ptm <- proc.time()
65
+		ptm <- startTimedMessage("GC correction ...")
66 66
 		counts <- binned.data$counts
67 67
 		mcounts <- binned.data$mcounts
68 68
 		pcounts <- binned.data$pcounts
... ...
@@ -107,7 +107,7 @@ correctGC <- function(binned.data.list, GC.BSgenome, same.GC.content=FALSE) {
107 107
 		# Produce fit to check
108 108
 		ggplt <- ggplot(df) + geom_point(aes_string(x='x', y='y', size='weight')) + geom_line(aes_string(x='x', y='y'), data=data.frame(x=gc.categories[intervals], y=fitted.correction.factors)) + theme_bw() + ggtitle('GC correction') + xlab('GC content') + ylab('correction factor')
109 109
 		attr(binned.data, 'GC.correction.ggplt') <- ggplt
110
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
110
+		stopTimedMessage(ptm)
111 111
 
112 112
 		### Quality measures ###
113 113
 		## Spikyness
114 114
new file mode 100644
... ...
@@ -0,0 +1,45 @@
1
+
2
+# ============================================================================
3
+# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob)
4
+# ============================================================================
5
+dnbinom.size <- function(mean, variance) {
6
+	return(mean^2 / (variance - mean))
7
+}
8
+
9
+dnbinom.prob <- function(mean, variance) {
10
+	return(mean/variance)
11
+}
12
+
13
+dnbinom.mean <- function(size, prob) {
14
+	return(size/prob - size)
15
+}
16
+
17
+dnbinom.variance <- function(size, prob) {
18
+	return( (size - prob*size) / prob^2 )
19
+}
20
+
21
+dgeom.mean <- function(prob) {
22
+	return( (1-prob)/prob )
23
+}
24
+
25
+dgeom.variance <- function(prob) {
26
+	return( (1-prob)/prob^2 )
27
+}
28
+
29
+dbinom.size <- function(mean, variance) {
30
+	return( mean^2/(mean-variance) )
31
+}
32
+
33
+dbinom.prob <- function(mean, variance) {
34
+	return( (mean-variance)/mean )
35
+}
36
+
37
+dbinom.mean <- function(size, prob) {
38
+	return( size*prob )
39
+}
40
+
41
+dbinom.variance <- function(size, prob) {
42
+	return( size*prob * (1-prob) )
43
+}
44
+
45
+
... ...
@@ -581,7 +581,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
581 581
 	message("Preparing bivariate HMM\n")
582 582
 
583 583
 	## Pre-compute z-values for each number of counts
584
-	message("Computing pre z-matrix...", appendLF=FALSE); ptm <- proc.time()
584
+	ptm <- startTimedMessage("Computing pre z-matrix...")
585 585
 	z.per.count <- array(NA, dim=c(maxcounts+1, num.models, num.uni.states), dimnames=list(counts=0:maxcounts, strand=names(distributions), state=uni.states))
586 586
 	xcounts <- 0:maxcounts
587 587
 	for (istrand in 1:num.models) {
... ...
@@ -602,10 +602,10 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
602 602
 			z.per.count[ , istrand, istate] <- qnorm_u
603 603
 		}
604 604
 	}
605
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
605
+	stopTimedMessage(ptm)
606 606
 
607 607
 	## Compute the z matrix
608
-	message("Transfering values into z-matrix...", appendLF=FALSE); ptm <- proc.time()
608
+	ptm <- startTimedMessage("Transfering values into z-matrix...")
609 609
 	z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(distributions), state=uni.states))
610 610
 	for (istrand in 1:num.models) {
611 611
 		for (istate in 1:num.uni.states) {
... ...
@@ -613,10 +613,10 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
613 613
 		}
614 614
 	}
615 615
 	remove(z.per.count)
616
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
616
+	stopTimedMessage(ptm)
617 617
 
618 618
 	## Calculate correlation matrix
619
-	message("Computing inverse of correlation matrix...", appendLF=FALSE); ptm <- proc.time()
619
+	ptm <- startTimedMessage("Computing inverse of correlation matrix...")
620 620
 	correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(distributions), strand=names(distributions), comb.state=comb.states))
621 621
 	correlationMatrixInverse <- correlationMatrix
622 622
 	determinant <- rep(0, num.comb.states)
... ...
@@ -656,7 +656,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
656 656
 			usestateTF[comb.state] <- TRUE
657 657
 		}
658 658
 	}
659
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
659
+	stopTimedMessage(ptm)
660 660
 
661 661
 	# Use only states with valid correlationMatrixInverse (all states are valid in this version)
662 662
 	correlationMatrix = correlationMatrix[,,usestateTF]
... ...
@@ -667,7 +667,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
667 667
 	num.comb.states <- length(comb.states)
668 668
 
669 669
 	### Calculate multivariate densities for each state ###
670
-	message("Calculating multivariate densities...", appendLF=FALSE); ptm <- proc.time()
670
+	ptm <- startTimedMessage("Calculating multivariate densities...")
671 671
 	densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states))
672 672
 	for (comb.state in comb.states) {
673 673
 		istate <- which(comb.state==comb.states)
... ...
@@ -706,7 +706,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
706 706
 			densities[icheck,] <- densities[icheck-1,]
707 707
 		}
708 708
 	}
709
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
709
+	stopTimedMessage(ptm)
710 710
 		
711 711
 	### Define cleanup behaviour ###
712 712
 	on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states)))
... ...
@@ -760,7 +760,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
760 760
 			mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=uni.states)
761 761
 		}
762 762
 	## Segmentation
763
-		message("Making segmentation ...", appendLF=FALSE); ptm <- proc.time()
763
+		ptm <- startTimedMessage("Making segmentation ...")
764 764
 		gr <- result$bins
765 765
 		red.gr.list <- GRangesList()
766 766
 		for (state in comb.states) {
... ...
@@ -773,7 +773,7 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
773 773
 		red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list))
774 774
 		result$segments <- red.gr
775 775
 		seqlengths(result$segments) <- seqlengths(result$bins)
776
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
776
+		stopTimedMessage(ptm)
777 777
 	## CNV state for both strands combined
778 778
 		# Bins
779 779
 		state <- multiplicity[result$bins$mstate] + multiplicity[result$bins$pstate]
... ...
@@ -27,7 +27,7 @@ getSegments <- function(hmms, cluster=TRUE, classes=NULL) {
27 27
 
28 28
 	## Clustering
29 29
 	if (cluster) {
30
-		message("Making consensus template ...", appendLF=FALSE); ptm <- proc.time()
30
+		ptm <- startTimedMessage("Making consensus template ...")
31 31
 		consensus <- disjoin(unlist(grlred))
32 32
 		constates <- matrix(NA, ncol=length(grlred), nrow=length(consensus))
33 33
 		for (i1 in 1:length(grlred)) {
... ...
@@ -38,15 +38,15 @@ getSegments <- function(hmms, cluster=TRUE, classes=NULL) {
38 38
 		}
39 39
 		meanstates <- apply(constates, 1, mean, na.rm=TRUE)
40 40
 		mcols(consensus)$meanstate <- meanstates
41
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
41
+		stopTimedMessage(ptm)
42 42
 
43 43
 		# Distance measure
44 44
 		# Use covariance instead of correlation to avoid NaNs for which the hclust fails with error
45
-		message("clustering ...", appendLF=FALSE); ptm <- proc.time()
45
+		ptm <- startTimedMessage("clustering ...")
46 46
 		constates[is.na(constates)] <- 0
47 47
 		wcor <- cov.wt(constates, wt=as.numeric(width(consensus)))
48 48
 		dist <- as.dist(max(wcor$cov)-wcor$cov)
49
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
49
+		stopTimedMessage(ptm)
50 50
 		# Dendrogram
51 51
 		message("reordering ...")
52 52
 		hc <- hclust(dist)
... ...
@@ -14,47 +14,3 @@ class.multivariate.hmm <- "aneuMultiHMM"
14 14
 class.bivariate.hmm <- "aneuBiHMM"
15 15
 class.hmm.list <- "aneufinder.hmm.list"
16 16
 
17
-# ============================================================================
18
-# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob)
19
-# ============================================================================
20
-dnbinom.size <- function(mean, variance) {
21
-	return(mean^2 / (variance - mean))
22
-}
23
-
24
-dnbinom.prob <- function(mean, variance) {
25
-	return(mean/variance)
26
-}
27
-
28
-dnbinom.mean <- function(size, prob) {
29
-	return(size/prob - size)
30
-}
31
-
32
-dnbinom.variance <- function(size, prob) {
33
-	return( (size - prob*size) / prob^2 )
34
-}
35
-
36
-dgeom.mean <- function(prob) {
37
-	return( (1-prob)/prob )
38
-}
39
-
40
-dgeom.variance <- function(prob) {
41
-	return( (1-prob)/prob^2 )
42
-}
43
-
44
-dbinom.size <- function(mean, variance) {
45
-	return( mean^2/(mean-variance) )
46
-}
47
-
48
-dbinom.prob <- function(mean, variance) {
49
-	return( (mean-variance)/mean )
50
-}
51
-
52
-dbinom.mean <- function(size, prob) {
53
-	return( size*prob )
54
-}
55
-
56
-dbinom.variance <- function(size, prob) {
57
-	return( size*prob * (1-prob) )
58
-}
59
-
60
-
61 17
similarity index 87%
62 18
rename from R/convert2GRanges.R
63 19
rename to R/importBed.R
... ...
@@ -2,14 +2,14 @@
2 2
 
3 3
 #' Read bed-file into GRanges
4 4
 #'
5
-#' This is a simple convenience function to read a (compressed) bed-file into a \code{\link{GRanges}} object. The bed-file is expected to have the following fields: \code{chrom, chromStart, chromEnd, name, score, strand}.
5
+#' This is a simple convenience function to read a (compressed) bed-file into a \code{\link{GRanges}} object. The bed-file is expected to have the following fields: \code{chromosome, start, end, name, score, strand}.
6 6
 #'
7 7
 #' @param bedfile Filename of the bed or bed.gz file.
8 8
 #' @param skip Number of lines to skip at the beginning.
9 9
 #' @return A \code{\link{GRanges}} object with the contents of the bed-file.
10 10
 #' @author Aaron Taudt
11 11
 #' @export
12
-bed2GRanges <- function(bedfile, skip=0) {
12
+importBed <- function(bedfile, skip=0) {
13 13
 
14 14
 	# File with reads, determine classes first for faster import (0-based)
15 15
 	tab5rows <- read.table(bedfile, nrows=5, skip=skip)
... ...
@@ -21,7 +21,7 @@ karyotypeMeasures <- function(hmms, normalChromosomeNumbers=NULL) {
21 21
 	hmms <- loadHmmsFromFiles(hmms)
22 22
 
23 23
 	## If all binsizes are the same the consensus template can be chosen equal to the bins
24
-	message("Making consensus template ...", appendLF=FALSE); ptm <- proc.time()
24
+	ptm <- startTimedMessage("Making consensus template ...")
25 25
 	binsizes <- unlist(lapply(hmms, function(x) { width(x$bins)[1] }))
26 26
 	if (all(binsizes==binsizes[1])) {
27 27
 		consensus <- hmms[[1]]$bins
... ...
@@ -53,7 +53,7 @@ karyotypeMeasures <- function(hmms, normalChromosomeNumbers=NULL) {
53 53
 	}
54 54
 	meanstates <- apply(constates, 1, mean, na.rm=TRUE)
55 55
 	mcols(consensus)$meanstate <- meanstates
56
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
56
+	stopTimedMessage(ptm)
57 57
 	
58 58
 
59 59
 	### Karyotype measures ###
... ...
@@ -14,7 +14,7 @@ loadHmmsFromFiles <- function(hmms, strict=FALSE) {
14 14
 	if (is.hmm(hmms) | is.bihmm(hmms)) {
15 15
 		return(list(hmms))
16 16
 	} else if (is.character(hmms)) {
17
-		message("Loading univariate HMMs from files ...", appendLF=FALSE); ptm <- proc.time()
17
+		ptm <- startTimedMessage("Loading univariate HMMs from files ...")
18 18
 		mlist <- list()
19 19
 		for (modelfile in hmms) {
20 20
 			tC <- tryCatch({
... ...
@@ -24,7 +24,7 @@ loadHmmsFromFiles <- function(hmms, strict=FALSE) {
24 24
 			})
25 25
 			if (!is.hmm(mlist[[modelfile]]) & !is.bihmm(mlist[[modelfile]])) {
26 26
 				if (strict) {
27
-					time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
27
+					stopTimedMessage(ptm)
28 28
 					stop("File ",modelfile," does not contain an ",class.univariate.hmm," object.")
29 29
 				} else {
30 30
 					class(mlist[[modelfile]]) <- class.univariate.hmm
... ...
@@ -32,7 +32,7 @@ loadHmmsFromFiles <- function(hmms, strict=FALSE) {
32 32
 				}
33 33
 			}
34 34
 		}
35
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
35
+		stopTimedMessage(ptm)
36 36
 		return(mlist)
37 37
 	} else if (is.list(hmms)) {
38 38
 		index <- which(unlist(lapply(hmms, function(hmm) { !is.hmm(hmm) & !is.bihmm(hmm) })))
... ...
@@ -80,7 +80,7 @@ loadGRangesFromFiles <- function(files) {
80 80
 	if (class(gr.list)=='GRanges') {
81 81
 		return(list(gr.list))
82 82
 	} else if (is.character(gr.list)) {
83
-		message("Loading GRanges from files ...", appendLF=FALSE); ptm <- proc.time()
83
+		ptm <- startTimedMessage("Loading GRanges from files ...")
84 84
 		mlist <- list()
85 85
 		for (modelfile in gr.list) {
86 86
 			tC <- tryCatch({
... ...
@@ -89,11 +89,11 @@ loadGRangesFromFiles <- function(files) {
89 89
 				stop(modelfile,'\n',err)
90 90
 			})
91 91
 			if (class(mlist[[modelfile]])!='GRanges') {
92
-				time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
92
+				stopTimedMessage(ptm)
93 93
 				stop("File ",modelfile," does not contain a GRanges object.")
94 94
 			}
95 95
 		}
96
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
96
+		stopTimedMessage(ptm)
97 97
 		return(mlist)
98 98
 	} else if (is.list(gr.list)) {
99 99
 		index <- which(unlist(lapply(gr.list, function(hmm) { class(hmm)!='GRanges' })))
... ...
@@ -636,7 +636,7 @@ heatmapAneuploidies <- function(hmms, ylabels=NULL, cluster=TRUE, as.data.frame=
636 636
 	}
637 637
 	
638 638
 	## Find the most frequent state (mfs) for each chromosome and sample
639
-	message("finding most frequent state for each sample and chromosome ...", appendLF=FALSE); ptm <- proc.time()
639
+	ptm <- startTimedMessage("finding most frequent state for each sample and chromosome ...")
640 640
 	grl.per.chrom <- lapply(grlred, function(x) { split(x, seqnames(x)) })
641 641
 	mfs.samples <- list()
642 642
 	for (i1 in 1:length(grlred)) {
... ...
@@ -651,7 +651,7 @@ heatmapAneuploidies <- function(hmms, ylabels=NULL, cluster=TRUE, as.data.frame=
651 651
 		attr(mfs.samples[[names(grlred)[i1]]], "varname") <- 'chromosome'
652 652
 	}
653 653
 	attr(mfs.samples, "varname") <- 'sample'
654
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
654
+	stopTimedMessage(ptm)
655 655
 
656 656
 	## Transform to data.frame
657 657
 	# Long format
... ...
@@ -766,15 +766,15 @@ heatmapGenomewide <- function(hmms, ylabels=NULL, classes=NULL, classes.color=NU
766 766
 	}
767 767
 
768 768
 	## Transform coordinates from "chr, start, end" to "genome.start, genome.end"
769
-	message("transforming coordinates ...", appendLF=FALSE); ptm <- proc.time()
769
+	ptm <- startTimedMessage("transforming coordinates ...")
770 770
 	grlred <- endoapply(grlred, transCoord)
771 771
 	if (plot.SCE) {
772 772
 		sce <- endoapply(sce, transCoord)
773 773
 	}
774
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
774
+	stopTimedMessage(ptm)
775 775
 
776 776
 	## Data.frame for plotting
777
-	message("Making the plot ...", appendLF=FALSE); ptm <- proc.time()
777
+	ptm <- startTimedMessage("Making the plot ...")
778 778
 	# Data
779 779
 	df <- list()
780 780
 	for (i1 in 1:length(grlred)) {
... ...
@@ -837,13 +837,13 @@ heatmapGenomewide <- function(hmms, ylabels=NULL, classes=NULL, classes.color=NU
837 837
 		widths[[length(widths)+1]] <- width.dendro
838 838
 	}
839 839
 	cowplot <- cowplot::plot_grid(plotlist=rev(pltlist), align='h', ncol=length(pltlist), rel_widths=rev(widths))
840
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
840
+	stopTimedMessage(ptm)
841 841
 
842 842
 	## Plot to file
843 843
 	if (!is.null(file)) {
844
-		message("plotting to file ",file," ...", appendLF=FALSE); ptm <- proc.time()
844
+		ptm <- startTimedMessage("plotting to file ",file," ...")
845 845
 		ggsave(file, cowplot, width=sum(widths)/2.54, height=height/2.54, limitsize=FALSE)
846
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
846
+		stopTimedMessage(ptm)
847 847
 	} else {
848 848
 		return(cowplot)
849 849
 	}
... ...
@@ -915,12 +915,12 @@ plot.array <- function(model, both.strands=FALSE, plot.SCE=TRUE, file=NULL) {
915 915
 	}
916 916
 
917 917
 	## Transform coordinates from "chr, start, end" to "genome.start, genome.end"
918
-	message("transforming coordinates ...", appendLF=FALSE); ptm <- proc.time()
918
+	ptm <- startTimedMessage("transforming coordinates ...")
919 919
 	bins <- transCoord(bins)
920 920
 	if (plot.SCE) {
921 921
 		scecoords <- transCoord(scecoords)
922 922
 	}
923
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
923
+	stopTimedMessage(ptm)
924 924
 
925 925
 	# Quality info
926 926
 	if (is.null(model$qualityInfo$complexity)) { model$qualityInfo$complexity <- NA }
... ...
@@ -97,9 +97,9 @@ clusterByQuality <- function(hmms, G=1:9, itmax=c(100,100), measures=c('spikynes
97 97
 	hmms <- loadHmmsFromFiles(hmms)
98 98
 	df <- getQC(hmms)
99 99
 	df <- df[measures]
100
-	message("clustering ...", appendLF=FALSE); ptm <- proc.time()
100
+	ptm <- startTimedMessage("clustering ...")
101 101
 	fit <- mclust::Mclust(df, G=G, control=emControl(itmax=itmax))
102
-	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
102
+	stopTimedMessage(ptm)
103 103
 	params <- t(fit$parameters$mean)
104 104
 	classification <- split(names(fit$classification), fit$classification)
105 105
 	## Reorder clusters
106 106
new file mode 100644
... ...
@@ -0,0 +1,86 @@
1
+
2
+
3
+#' Simulate reads from genome
4
+#' 
5
+#' Simulate single or paired end reads from any \pkg{\link[BSgenome]{BSgenome}} object. These simulated reads can be mapped to the reference genome using any aligner to produce BAM files that can be used for mappability correction.
6
+#' 
7
+#' Reads are simulated by splitting the genome into reads with the specified \code{readLength}.
8
+#' 
9
+#' @param bsgenome A \pkg{\link[BSgenome]{BSgenome}} object containing the sequence of the reference genome.
10
+#' @param readLength The length in base pairs of the simulated reads that are written to file.
11
+#' @param bamfile A BAM file. This file is used to estimate the distribution of Phred quality scores.
12
+#' @param file The filename that is written to disk. The ending .fastq.gz will be appended.
13
+#' @param pairedEndFragmentLength If this option is specified, paired end reads with length \code{readLength} will be simulated coming from both ends of fragments of this size. NOT IMPLEMENTED YET.
14
+#' @param every.X.bp Stepsize for simulating reads. A read fragment will be simulated every X bp.
15
+#' @return A fastq.gz file is written to disk.
16
+#' @author Aaron Taudt
17
+#' @importFrom Biostrings DNAStringSet BString BStringSet
18
+#' @export
19
+simulateReads <- function(bsgenome, readLength, bamfile, file, pairedEndFragmentLength=NULL, every.X.bp=500) {
20
+	
21
+	if (!is.null(pairedEndFragmentLength)) {
22
+		pairedEndReads <- TRUE
23
+	} else {
24
+		pairedEndReads <- FALSE
25
+	}
26
+	
27
+	## Estimate quality score distribution
28
+	ptm <- startTimedMessage("Estimating quality score distribution ...")
29
+	gr <- suppressMessages( bam2GRanges(bamfile, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=NULL, what=c('mapq','seq','qual')) )
30
+	seqbp <- unlist(strsplit(as.character(gr$seq),''))
31
+	mapbp <- unlist(strsplit(as.character(gr$qual),''))
32
+	qualdistr <- table(mapbp, seqbp)
33
+	qualdistr <- sweep(qualdistr, MARGIN=2, STATS=colSums(qualdistr), FUN='/')
34
+	remove(seqbp, mapbp)
35
+	stopTimedMessage(ptm)
36
+	
37
+	bases <- colnames(qualdistr)
38
+	missing.bases <- setdiff(bases, c('A','T','C','G','N'))
39
+	if (length(missing.bases)>0) {
40
+		stop("'bamfile' does not contain bases ", paste(missing.bases, collapse=', '))
41
+	}
42
+	
43
+	## Simulate the reads
44
+	file.gz <- gzfile(paste0(file,'.fastq.gz'), 'w')
45
+	cat('', file=file.gz)
46
+	identifier.string <- attributes(bsgenome)$pkgname
47
+	itotal <- 1
48
+	for (chrom in seqlevels(bsgenome)) {
49
+		message("Simulating reads for chromosome ",chrom)
50
+		## Simulate the reads
51
+		ptm <- startTimedMessage("  Simulating reads ...")
52
+		starts <- seq(1, seqlengths(bsgenome)[chrom]-readLength+1L, by=every.X.bp)
53
+		reads <- Biostrings::DNAStringSet(bsgenome[[chrom]], start=starts, width=readLength)
54
+		mask.N <- reads == paste0(rep('N', readLength), collapse='')
55
+		reads <- reads[!mask.N]
56
+		stopTimedMessage(ptm)
57
+		## Constructing quality fields
58
+		ptm <- startTimedMessage("  Constructing quality fields ...")
59
+		reads.full <- unlist(reads)
60
+		reads.split <- Biostrings::DNAStringSet(reads.full, start=1:length(reads.full), width=1)
61
+		quality.char <- character(length=length(reads.split))
62
+		for (base in bases) {
63
+			idx <- which(reads.split == base)
64
+			quality.char[idx] <- sample(rownames(qualdistr), size=length(idx), replace=TRUE, prob=qualdistr[,base])
65
+		}
66
+		quality <- Biostrings::BString(paste0(quality.char, collapse=''))
67
+		starts <- seq(1, length(reads.split)-readLength+1L, by=every.X.bp)
68
+		quality <- Biostrings::BStringSet(quality, start=starts, width=readLength)
69
+		remove(reads.full, reads.split)
70
+		stopTimedMessage(ptm)
71
+		## Construct ID fields
72
+		ptm <- startTimedMessage("  Constructing ID fields ...")
73
+		id.strings <- paste0(identifier.string, '_', itotal:(itotal+length(reads)-1), ' simulated length=',readLength)
74
+		itotal <- itotal + length(reads)
75
+		stopTimedMessage(ptm)
76
+		## Rearrange into @ID\nSeq\n+ID\nQual
77
+		ptm <- startTimedMessage("  Rearranging for writing ...")
78
+		data <- paste0('@',id.strings,'\n', as.character(reads),'\n', '+',id.strings,'\n', as.character(quality),'\n')
79
+		stopTimedMessage(ptm)
80
+		ptm <- startTimedMessage("  Writing to file ...")
81
+		cat(data, file=file.gz, sep='', append=T)
82
+		stopTimedMessage(ptm)
83
+	}
84
+	close(file.gz)
85
+
86
+}
0 87
new file mode 100644
... ...
@@ -0,0 +1,18 @@
1
+
2
+
3
+startTimedMessage <- function(...) {
4
+
5
+	x <- paste0(..., collapse='')
6
+	message(x, appendLF=FALSE)
7
+	ptm <- proc.time()
8
+	return(ptm)
9
+
10
+}
11
+
12
+
13
+stopTimedMessage <- function(ptm) {
14
+
15
+	time <- proc.time() - ptm
16
+	message(" ", round(time[3],2), "s")
17
+
18
+}
0 19
new file mode 100644
... ...
@@ -0,0 +1,42 @@
1
+
2
+
3
+#' Make variably sized bins
4
+#' 
5
+#' Make variable-width bins based on a reference BAM file. This can be a simulated file (produced by \code{\link{simulateReads}} and aligned with your favourite aligner) or a real reference.
6
+#' 
7
+#' Variable-width bins are produced by first binning the reference BAM file with fixed-width bins and selecting the desired number of reads per bin as the (non-zero) maximum of the histogram. A new set of bins is then generated such that every bin contains the desired number of reads.
8
+#' 
9
+#' @inheritParams bam2GRanges
10
+#' @param binsizes A vector with binsizes. Resulting bins will be close to the specified binsizes.
11
+#' @return A \code{list} with one \code{\link{GRanges}} object for each element in \code{binsizes}.
12
+#' @author Aaron Taudt
13
+#' @export
14
+variableWidthBins <- function(bamfile, bamindex=bamfile, binsizes, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000) {
15
+	
16
+	## Make fixed width bins
17
+	reads <- bam2GRanges(bamfile, bamindex=bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, max.fragment.width=max.fragment.width)
18
+	ptm <- startTimedMessage(paste0("Binning ", bamfile, " ..."))
19
+	binned.list <- suppressMessages( bam2binned(reads, binsizes=binsizes, calc.complexity=FALSE, chromosomes=chromosomes) )
20
+	stopTimedMessage(ptm)
21
+	
22
+	## Loop over binsizes
23
+	bins.list <- list()
24
+	for (i1 in 1:length(binsizes)) {
25
+		binsize <- binsizes[i1]
26
+		binned <- binned.list[[i1]]
27
+		## Get mode of histogram
28
+		tab <- table(binned$counts)
29
+		modecount <- as.integer(names(which.max(tab[names(tab)!=0])))
30
+		## Pick only every modecount read
31
+		idx <- seq(modecount, length(reads), by=modecount)
32
+		subreads <- reads[idx]
33
+		strand(subreads) <- '*'
34
+		## Make new bins
35
+		bins <- gaps(subreads)
36
+		bins <- bins[strand(bins)=='*']
37
+		bins.list[[as.character(binsize)]] <- bins
38
+	}
39
+	
40
+	return(bins.list)
41
+
42
+}
... ...
@@ -1,14 +1,12 @@
1
-!!!THIS PROJECT IS CURRENTLY BEING DEVELOPED!!!