Browse code

improved SCE calling and added baumWelch/EM option

chakalakka authored on 11/10/2015 10:31:37
Showing 26 changed files

... ...
@@ -7,8 +7,6 @@ S3method(plot,character)
7 7
 export(Aneufinder)
8 8
 export(bam2binned)
9 9
 export(bed2GRanges)
10
-export(bed2binned)
11
-export(bedGraph2binned)
12 10
 export(clusterByQuality)
13 11
 export(concGRangesInRange)
14 12
 export(correctGC)
... ...
@@ -9,29 +9,27 @@
9 9
 #' @param configfile A file specifying the parameters of this function (without \code{inputfolder}, \code{outputfolder} and \code{configfile}). Having the parameters in a file can be handy if many samples with the same parameter settings are to be run. If a \code{configfile} is specified, it will take priority over the command line parameters.
10 10
 #' @param numCPU The numbers of CPUs that are used. Should not be more than available on your machine.
11 11
 #' @param reuse.existing.files A logical indicating whether or not existing files in \code{outputfolder} should be reused.
12
-#' @param binsizes An integer vector with bin sizes. If more than one value is given, output files will be produced for each bin size.
13
-#' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly. Output files are produced for each value.
14
-#' @param format Format of the sequencing data. One of \code{c('bam')}.
15
-#' @param chromosomes If only a subset of the chromosomes should be binned, specify them here.
16
-#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
17
-#' @param min.mapq Minimum mapping quality when importing from BAM files.
12
+
13
+#' @param stepsize Fraction of the binsize that the sliding window is offset at each step. Example: If \code{stepsize=0.1} and \code{binsizes=c(200000,500000)}, the actual stepsize in basepairs is 20000 and 50000, respectively.
14
+#' @inheritParams align2binned
15
+
18 16
 #' @param correction.method Correction methods to be used for the binned read counts. Currently any combination of \code{c('GC')}.
19 17
 #' @param GC.BSgenome A \code{BSgenome} object which contains the DNA sequence that is used for the GC correction.
20 18
 #' @param callCNVs A logical indicating whether CNVs should be called.
21 19
 #' @param callSCEs A logical indicating whether SCEs should be called.
22
-#' @param eps Convergence threshold for the Baum-Welch algorithm.
23
-#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. \code{max.time=-1} means no limit.
24
-#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. \code{max.iter=-1} means no limit.
25
-#' @param num.trials The number of trials to find a fit where state \code{most.frequent.state} is most frequent. Each time, the HMM is seeded with different random initial values.
26
-#' @param states A subset or all of \code{c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")}. This vector defines the states that are used in the Hidden Markov Model. The order of the entries should not be changed.
20
+
21
+#' @inheritParams univariate.findCNVs
27 22
 #' @param most.frequent.state One of the states that were given in \code{states}. The specified state is assumed to be the most frequent one when calling CNVs. This can help the fitting procedure to converge into the correct fit. Default is 'disomy'.
28 23
 #' @param most.frequent.state.SCE One of the states that were given in \code{states}. The specified state is assumed to be the most frequent one when calling SCEs. This can help the fitting procedure to converge into the correct fit. Default is 'monosomy'.
24
+
25
+#' @inheritParams getSCEcoordinates
26
+
29 27
 #' @param cluster.plots A logical indicating whether plots should be clustered by similarity.
30 28
 #' @author Aaron Taudt
31 29
 #' @import foreach
32 30
 #' @import doParallel
33 31
 #' @export
34
-Aneufinder <- function(inputfolder, outputfolder, configfile=NULL, numCPU=1, reuse.existing.files=TRUE, binsizes=500000, reads.per.bin=NULL, format='bam', chromosomes=NULL, remove.duplicate.reads=TRUE, min.mapq=10, correction.method=NULL, GC.BSgenome=NULL, callCNVs=TRUE, callSCEs=FALSE, eps=0.1, max.time=60, max.iter=5000, num.trials=15, states=c('zero-inflation','nullsomy','monosomy','disomy','trisomy','tetrasomy','multisomy'), most.frequent.state='disomy', most.frequent.state.SCE='monosomy', cluster.plots=TRUE) {
32
+Aneufinder <- function(inputfolder, outputfolder, configfile=NULL, numCPU=1, reuse.existing.files=TRUE, binsizes=500000, reads.per.bin=NULL, stepsize=NULL, format='bam', chromosomes=NULL, remove.duplicate.reads=TRUE, min.mapq=10, correction.method=NULL, GC.BSgenome=NULL, callCNVs=TRUE, callSCEs=FALSE, eps=0.1, max.time=60, max.iter=5000, num.trials=15, states=c('zero-inflation','nullsomy','monosomy','disomy','trisomy','tetrasomy','multisomy'), most.frequent.state='disomy', most.frequent.state.SCE='monosomy', resolution=c(3,6), min.segwidth=2, min.reads=50, cluster.plots=TRUE) {
35 33
 
36 34
 #=======================
37 35
 ### Helper functions ###
... ...
@@ -63,7 +61,7 @@ if (class(GC.BSgenome)=='BSgenome') {
63 61
 }
64 62
 
65 63
 ## Put options into list and merge with conf
66
-params <- list(numCPU=numCPU, reuse.existing.files=reuse.existing.files, binsizes=binsizes, reads.per.bin=reads.per.bin, format=format, chromosomes=chromosomes, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, correction.method=correction.method, GC.BSgenome=GC.BSgenome, callCNVs=callCNVs, callSCEs=callSCEs, eps=eps, max.time=max.time, max.iter=max.iter, num.trials=num.trials, states=states, most.frequent.state=most.frequent.state, most.frequent.state.SCE=most.frequent.state.SCE, cluster.plots=cluster.plots)
64
+params <- list(numCPU=numCPU, reuse.existing.files=reuse.existing.files, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, format=format, chromosomes=chromosomes, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, correction.method=correction.method, GC.BSgenome=GC.BSgenome, callCNVs=callCNVs, callSCEs=callSCEs, eps=eps, max.time=max.time, max.iter=max.iter, num.trials=num.trials, states=states, most.frequent.state=most.frequent.state, most.frequent.state.SCE=most.frequent.state.SCE, resolution=resolution, min.segwidth=min.segwidth, min.reads=min.reads, cluster.plots=cluster.plots)
67 65
 conf <- c(conf, params[setdiff(names(params),names(conf))])
68 66
 
69 67
 ## Input checks
... ...
@@ -74,7 +72,7 @@ if (! conf[['format']] %in% c('bam')) {
74 72
 ## Helpers
75 73
 binsizes <- conf[['binsizes']]
76 74
 reads.per.bins <- conf[['reads.per.bin']]
77
-patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=F, trim=T),'_'))
75
+patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=T, trim=T),'_'))
78 76
 patterns <- setdiff(patterns, c('reads.per.bin__','binsize__'))
79 77
 pattern <- NULL #ease R CMD check
80 78
 
... ...
@@ -123,7 +121,7 @@ temp <- foreach (file = files, .packages=c('aneufinder')) %dopar% {
123 121
 	if (length(c(binsizes.todo,rpbin.todo)) > 0) {
124 122
 		tC <- tryCatch({
125 123
 			if (conf[['format']]=='bam') {
126
-				bam2binned(bamfile=file, binsizes=binsizes.todo, reads.per.bin=rpbin.todo, chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']], outputfolder.binned=binpath.uncorrected, save.as.RData=TRUE, reads.store=TRUE, outputfolder.reads=readspath)
124
+				bam2binned(bamfile=file, binsizes=binsizes.todo, reads.per.bin=rpbin.todo, stepsize=conf[['stepsize']], chromosomes=conf[['chromosomes']], remove.duplicate.reads=conf[['remove.duplicate.reads']], min.mapq=conf[['min.mapq']], outputfolder.binned=binpath.uncorrected, save.as.RData=TRUE, reads.store=TRUE, outputfolder.reads=readspath)
127 125
 			}
128 126
 		}, error = function(err) {
129 127
 			stop(file,'\n',err)
... ...
@@ -217,7 +215,7 @@ if (conf[['callCNVs']]==TRUE) {
217 215
 	### Plotting CNV ###
218 216
 	#===================
219 217
 	if (!file.exists(CNVplotpath)) { dir.create(CNVplotpath) }
220
-	patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=F, trim=T),'_'))
218
+	patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=T, trim=T),'_'))
221 219
 	patterns <- setdiff(patterns, c('reads.per.bin__','binsize__'))
222 220
 	files <- list.files(CNVpath, full.names=TRUE, recursive=T, pattern='.RData$')
223 221
 
... ...
@@ -230,7 +228,7 @@ if (conf[['callCNVs']]==TRUE) {
230 228
 		if (!file.exists(savename)) {
231 229
 			pdf(file=savename, width=10, height=7)
232 230
 			ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
233
-			ifiles <- grep(pattern, ifiles, value=T)
231
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
234 232
 			for (ifile in ifiles) {
235 233
 				tC <- tryCatch({
236 234
 					model <- get(load(ifile))
... ...
@@ -249,7 +247,7 @@ if (conf[['callCNVs']]==TRUE) {
249 247
 	message("Plotting heatmaps ...", appendLF=F); ptm <- proc.time()
250 248
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
251 249
 		ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
252
-		ifiles <- grep(pattern, ifiles, value=T)
250
+		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
253 251
 		savename=file.path(CNVplotpath,paste0('genomeHeatmap_',sub('_$','',pattern),'.pdf'))
254 252
 		if (!file.exists(savename)) {
255 253
 			suppressMessages(heatmapGenomewide(ifiles, file=savename, plot.SCE=F, cluster=conf[['cluster.plots']]))
... ...
@@ -257,7 +255,7 @@ if (conf[['callCNVs']]==TRUE) {
257 255
 	}
258 256
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
259 257
 		ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
260
-		ifiles <- grep(pattern, ifiles, value=T)
258
+		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
261 259
 		savename=file.path(CNVplotpath,paste0('aneuploidyHeatmap_',sub('_$','',pattern),'.pdf'))
262 260
 		if (!file.exists(savename)) {
263 261
 			ggplt <- suppressMessages(heatmapAneuploidies(ifiles, cluster=conf[['cluster.plots']]))
... ...
@@ -276,7 +274,7 @@ if (conf[['callCNVs']]==TRUE) {
276 274
 		if (!file.exists(savename)) {
277 275
 			pdf(file=savename, width=20, height=5)
278 276
 			ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
279
-			ifiles <- grep(pattern, ifiles, value=T)
277
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
280 278
 			for (ifile in ifiles) {
281 279
 				tC <- tryCatch({
282 280
 					model <- get(load(ifile))
... ...
@@ -298,7 +296,7 @@ if (conf[['callCNVs']]==TRUE) {
298 296
 		if (!file.exists(savename)) {
299 297
 			pdf(file=savename, width=12*1.4, height=2*4.6)
300 298
 			ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
301
-			ifiles <- grep(pattern, ifiles, value=T)
299
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
302 300
 			for (ifile in ifiles) {
303 301
 				tC <- tryCatch({
304 302
 					model <- get(load(ifile))
... ...
@@ -320,7 +318,7 @@ if (conf[['callCNVs']]==TRUE) {
320 318
 		savename <- file.path(CNVbrowserpath,paste0(pattern))
321 319
 		if (!file.exists(paste0(savename,'.bed.gz'))) {
322 320
 			ifiles <- list.files(CNVpath, pattern='RData$', full.names=TRUE)
323
-			ifiles <- grep(pattern, ifiles, value=T)
321
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
324 322
 			exportCNVs(ifiles, filename=savename, cluster=conf[['cluster.plots']], export.CNV=TRUE, export.SCE=FALSE)
325 323
 		}
326 324
 	}
... ...
@@ -340,7 +338,9 @@ if (conf[['callSCEs']]==TRUE) {
340 338
 		tC <- tryCatch({
341 339
 			savename <- file.path(SCEpath,basename(file))
342 340
 			if (!file.exists(savename)) {
343
-				model <- findSCEs(file, eps=conf[['eps']], max.time=conf[['max.time']], max.iter=conf[['max.iter']], num.trials=conf[['num.trials']], states=conf[['states']], most.frequent.state=conf[['most.frequent.state.SCE']]) 
341
+				binned.data <- get(load(file))
342
+				reads.file <- list.files(readspath, full.names=T, pattern=paste0('^',attr(binned.data,'ID'),'.RData$'))
343
+				model <- findSCEs(file, eps=conf[['eps']], max.time=conf[['max.time']], max.iter=conf[['max.iter']], num.trials=conf[['num.trials']], states=conf[['states']], most.frequent.state=conf[['most.frequent.state.SCE']], resolution=conf[['resolution']], min.segwidth=conf[['min.segwidth']], fragments=reads.file, min.reads=conf[['min.reads']]) 
344 344
 				save(model, file=savename)
345 345
 			}
346 346
 		}, error = function(err) {
... ...
@@ -353,7 +353,7 @@ if (conf[['callSCEs']]==TRUE) {
353 353
 	### Plotting SCE ###
354 354
 	#===================
355 355
 	if (!file.exists(SCEplotpath)) { dir.create(SCEplotpath) }
356
-	patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=F, trim=T),'_'))
356
+	patterns <- c(paste0('reads.per.bin_',reads.per.bins,'_'), paste0('binsize_',format(binsizes, scientific=T, trim=T),'_'))
357 357
 	patterns <- setdiff(patterns, c('reads.per.bin__','binsize__'))
358 358
 	files <- list.files(SCEpath, full.names=TRUE, recursive=T, pattern='.RData$')
359 359
 
... ...
@@ -366,7 +366,7 @@ if (conf[['callSCEs']]==TRUE) {
366 366
 		if (!file.exists(savename)) {
367 367
 			pdf(file=savename, width=10, height=7)
368 368
 			ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
369
-			ifiles <- grep(pattern, ifiles, value=T)
369
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
370 370
 			for (ifile in ifiles) {
371 371
 				tC <- tryCatch({
372 372
 					model <- get(load(ifile))
... ...
@@ -385,7 +385,7 @@ if (conf[['callSCEs']]==TRUE) {
385 385
 	message("Plotting heatmaps ...", appendLF=F); ptm <- proc.time()
386 386
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
387 387
 		ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
388
-		ifiles <- grep(pattern, ifiles, value=T)
388
+		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
389 389
 		savename=file.path(SCEplotpath,paste0('genomeHeatmap_',sub('_$','',pattern),'.pdf'))
390 390
 		if (!file.exists(savename)) {
391 391
 			suppressMessages(heatmapGenomewide(ifiles, file=savename, plot.SCE=T, cluster=conf[['cluster.plots']]))
... ...
@@ -393,7 +393,7 @@ if (conf[['callSCEs']]==TRUE) {
393 393
 	}
394 394
 	temp <- foreach (pattern = patterns, .packages=c('aneufinder')) %dopar% {
395 395
 		ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
396
-		ifiles <- grep(pattern, ifiles, value=T)
396
+		ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
397 397
 		savename=file.path(SCEplotpath,paste0('aneuploidyHeatmap_',sub('_$','',pattern),'.pdf'))
398 398
 		if (!file.exists(savename)) {
399 399
 			pdf(savename, width=30, height=0.3*length(ifiles))
... ...
@@ -412,7 +412,7 @@ if (conf[['callSCEs']]==TRUE) {
412 412
 		if (!file.exists(savename)) {
413 413
 			pdf(file=savename, width=20, height=5)
414 414
 			ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
415
-			ifiles <- grep(pattern, ifiles, value=T)
415
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
416 416
 			for (ifile in ifiles) {
417 417
 				tC <- tryCatch({
418 418
 					model <- get(load(ifile))
... ...
@@ -434,7 +434,7 @@ if (conf[['callSCEs']]==TRUE) {
434 434
 		if (!file.exists(savename)) {
435 435
 			pdf(file=savename, width=12*1.4, height=2*4.6)
436 436
 			ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
437
-			ifiles <- grep(pattern, ifiles, value=T)
437
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
438 438
 			for (ifile in ifiles) {
439 439
 				tC <- tryCatch({
440 440
 					model <- get(load(ifile))
... ...
@@ -456,7 +456,7 @@ if (conf[['callSCEs']]==TRUE) {
456 456
 		savename <- file.path(SCEbrowserpath,sub('_$','',pattern))
457 457
 		if (!file.exists(paste0(savename,'.bed.gz'))) {
458 458
 			ifiles <- list.files(SCEpath, pattern='RData$', full.names=TRUE)
459
-			ifiles <- grep(pattern, ifiles, value=T)
459
+			ifiles <- grep(gsub('\\+','\\\\+',pattern), ifiles, value=T)
460 460
 			exportCNVs(ifiles, filename=savename, cluster=conf[['cluster.plots']], export.CNV=TRUE, export.SCE=TRUE)
461 461
 		}
462 462
 	}
... ...
@@ -2,10 +2,9 @@
2 2
 
3 3
 #' Convert aligned reads from various file formats into read counts in equidistant bins
4 4
 #'
5
-#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. Convert signal values in .bedGraph format to signal counts in equidistant windows.
5
+#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows.
6 6
 #'
7 7
 #' Convert aligned reads or signal values from various file formats into read counts in equidistant windows (bins). bam2binned() and bed2binned() use 'GenomicRanges::countOverlaps()' to calculate the read counts. Therefore, with small binsizes and large read lengths, one read fragment will often overlap more than one bin.
8
-#' bedGraph2binned() sets the maximum signal value in a bin as value for that bin.
9 8
 #'
10 9
 #' @name binning
11 10
 #' @examples
... ...
@@ -21,13 +20,13 @@ NULL
21 20
 #' @param bamfile A file in BAM format.
22 21
 #' @param bamindex BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued.
23 22
 #' @export
24
-bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
23
+bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEndReads=FALSE, outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
25 24
 	call <- match.call()
26 25
 	underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='')
27 26
 	message("\n",call[[1]],"():")
28 27
 	message(underline)
29 28
 	ptm <- proc.time()
30
-	binned.data <- align2binned(bamfile, format="bam", ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, numbins=numbins, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
29
+	binned.data <- align2binned(bamfile, format="bam", ID=ID, bamindex=bamindex, pairedEndReads=pairedEndReads, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
31 30
 	time <- proc.time() - ptm
32 31
 	message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s")
33 32
 	return(binned.data)
... ...
@@ -36,30 +35,13 @@ bam2binned <- function(bamfile, bamindex=bamfile, ID=basename(bamfile), pairedEn
36 35
 #' @describeIn binning Bin reads in BED format
37 36
 #' @inheritParams align2binned
38 37
 #' @param bedfile A file in BED format.
39
-#' @export
40
-bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
38
+bed2binned <- function(bedfile, chrom.length.file, ID=basename(bedfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
41 39
 	call <- match.call()
42 40
 	underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='')
43 41
 	message("\n",call[[1]],"():")
44 42
 	message(underline)
45 43
 	ptm <- proc.time()
46
-	binned.data <- align2binned(bedfile, format="bed", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, numbins=numbins, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
47
-	time <- proc.time() - ptm
48
-	message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s")
49
-	return(binned.data)
50
-}
51
-
52
-#' @describeIn binning Bin reads in bedGraph format
53
-#' @inheritParams align2binned
54
-#' @param bedGraphfile A file in bedGraph format.
55
-#' @export
56
-bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGraphfile), outputfolder.binned="binned_data", binsizes=NULL, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
57
-	call <- match.call()
58
-	underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='')
59
-	message("\n",call[[1]],"():")
60
-	message(underline)
61
-	ptm <- proc.time()
62
-	binned.data <- align2binned(bedGraphfile, format="bedGraph", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, numbins=numbins, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
44
+	binned.data <- align2binned(bedfile, format="bed", ID=ID, chrom.length.file=chrom.length.file, outputfolder.binned=outputfolder.binned, binsizes=binsizes, reads.per.bin=reads.per.bin, stepsize=stepsize, chromosomes=chromosomes, save.as.RData=save.as.RData, calc.complexity=calc.complexity, min.mapq=min.mapq, remove.duplicate.reads=remove.duplicate.reads, call=call, reads.store=reads.store, outputfolder.reads=outputfolder.reads, reads.return=reads.return, reads.overwrite=reads.overwrite, reads.only=reads.only)
63 45
 	time <- proc.time() - ptm
64 46
 	message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s")
65 47
 	return(binned.data)
... ...
@@ -67,10 +49,10 @@ bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGrap
67 49
 
68 50
 #' Convert aligned reads from various file formats into read counts in equidistant bins
69 51
 #'
70
-#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows. Convert signal values in .bedGraph format to signal counts in equidistant windows.
52
+#' Convert aligned reads in .bam or .bed format into read counts in equidistant windows.
71 53
 #'
72 54
 #' @param file A file with aligned reads.
73
-#' @param format One of \code{c('bam', 'bed', 'bedGraph')}.
55
+#' @param format One of \code{c('bam', 'bed')}.
74 56
 #' @param ID An identifier that will be used to identify the file throughout the workflow and in plotting.
75 57
 #' @param bamindex Index file if \code{format='bam'} with or without the .bai ending. If this file does not exist it will be created and a warning is issued.
76 58
 #' @param pairedEndReads Set to \code{TRUE} if you have paired-end reads in your file.
... ...
@@ -78,7 +60,7 @@ bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGrap
78 60
 #' @param outputfolder.binned Folder to which the binned data will be saved. If the specified folder does not exist, it will be created.
79 61
 #' @param binsizes An integer vector with bin sizes. If more than one value is given, output files will be produced for each bin size.
80 62
 #' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly. Output files are produced for each value.
81
-#' @param numbins Number of bins per chromosome. Each chromosome will have a different binsize! DO NOT USE UNLESS YOU KNOW WHAT YOU ARE DOING. Output files are produced for each value.
63
+#' @param stepsize Fraction of the binsize that the sliding window is offset at each step. Example: If \code{stepsize=0.1} and \code{binsizes=c(200000,500000)}, the actual stepsize in basepairs is 20000 and 50000, respectively.
82 64
 #' @param chromosomes If only a subset of the chromosomes should be binned, specify them here.
83 65
 #' @param save.as.RData If set to \code{FALSE}, no output file will be written. Instead, a \link{GenomicRanges} object containing the binned data will be returned. Only the first binsize will be processed in this case.
84 66
 #' @param calc.complexity A logical indicating whether or not to estimate library complexity.
... ...
@@ -91,7 +73,7 @@ bedGraph2binned <- function(bedGraphfile, chrom.length.file, ID=basename(bedGrap
91 73
 #' @param reads.overwrite Whether or not an existing file with read fragments should be overwritten.
92 74
 #' @param reads.only If \code{TRUE} only read fragments are stored and/or returned and no binning is done.
93 75
 #' @return The function produces a \link{GRanges} object with one meta data column 'reads' that contains the read count. This binned data will be either written to file (\code{save.as.RData=FALSE}) or given as return value (\code{save.as.RData=FALSE}).
94
-align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedEndReads=FALSE, chrom.length.file, outputfolder.binned="binned_data", binsizes=200000, reads.per.bin=NULL, numbins=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
76
+align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedEndReads=FALSE, chrom.length.file, outputfolder.binned="binned_data", binsizes=200000, reads.per.bin=NULL, stepsize=NULL, chromosomes=NULL, save.as.RData=FALSE, calc.complexity=TRUE, min.mapq=10, remove.duplicate.reads=TRUE, call=match.call(), reads.store=FALSE, outputfolder.reads="data", reads.return=FALSE, reads.overwrite=FALSE, reads.only=FALSE) {
95 77
 
96 78
 	## Check user input
97 79
 	if (reads.return==FALSE & reads.only==FALSE) {
... ...
@@ -137,21 +119,6 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE
137 119
 			}
138 120
 			data <- data[mcols(data)$mapq >= min.mapq]
139 121
 		}
140
-	## BEDGraph (0-based)
141
-	} else if (format == "bedGraph") {
142
-		# File with chromosome lengths (1-based)
143
-		chrom.lengths.df <- read.table(chrom.length.file)
144
-		chrom.lengths <- chrom.lengths.df[,2]
145
-		names(chrom.lengths) <- chrom.lengths.df[,1]
146
-		# File with reads, determine classes first for faster import
147
-		tab5rows <- read.table(file, nrows=5)
148
-		classes.in.bed <- sapply(tab5rows, class)
149
-		classes <- rep("NULL",length(classes.in.bed))
150
-		classes[1:4] <- classes.in.bed[1:4]
151
-		data <- read.table(file, colClasses=classes)
152
-		# Convert to GRanges object
153
-		data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data)), signal=data[,4])	# start+1 to go from [0,x) -> [1,x]
154
-		seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
155 122
 	}
156 123
 	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
157 124
 
... ...
@@ -208,60 +175,44 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE
208 175
 	coverage <- list(coverage=coverage, genome.covered=genome.covered, coverage.per.chrom=coverage.per.chrom, genome.covered.per.chrom=genome.covered.per.chrom)
209 176
 
210 177
 
211
-	### Loop over all binsizes ###
212 178
 	## Pad binsizes and reads.per.bin with each others value
213 179
 	numcountsperbp <- length(data) / sum(as.numeric(seqlengths(data)))
214 180
 	binsizes.add <- round(reads.per.bin / numcountsperbp, -2)
215 181
 	reads.per.bin.add <- round(binsizes * numcountsperbp, 2)
216 182
 	binsizes <- c(binsizes, binsizes.add)
217 183
 	reads.per.bin <- c(reads.per.bin.add, reads.per.bin)
218
-	if (is.null(numbins)) {
219
-		length.binsizes <- length(binsizes)
220
-	} else {
221
-		length.binsizes <- length(numbins)
222
-	}
184
+	length.binsizes <- length(binsizes)
223 185
 
186
+	### Loop over all binsizes ###
187
+	data.plus <- data[strand(data)=='+']
188
+	data.minus <- data[strand(data)=='-']
224 189
 	for (ibinsize in 1:length.binsizes) {
225
-		if (is.null(numbins)) {
226
-			binsize <- binsizes[ibinsize]
227
-			readsperbin <- reads.per.bin[ibinsize]
228
-			message("Binning into bin size ",binsize," with on average ",readsperbin," reads per bin")
229
-		} else {
230
-			numbin <- numbins[ibinsize]
231
-			message("Binning with number of bins ",numbin)
232
-		}
190
+		binsize <- binsizes[ibinsize]
191
+		readsperbin <- reads.per.bin[ibinsize]
192
+		message("Binning into bin size ",binsize," with on average ",readsperbin," reads per bin")
233 193
 
234
-		### Iterate over all chromosomes
194
+		### Loop over chromosomes ###
235 195
 		message("  binning genome ...", appendLF=F); ptm <- proc.time()
236 196
 		binned.data <- GenomicRanges::GRangesList()
237 197
 		for (chromosome in chroms2use) {
238 198
 
239
-			if (is.null(numbins)) {
240
-				## Check last incomplete bin
241
-				incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0
242
-				if (incomplete.bin) {
243
-					numbin <- floor(seqlengths(data)[chromosome]/binsize)	# floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end
244
-				} else {
245
-					numbin <- seqlengths(data)[chromosome]/binsize
246
-				}
247
-				if (numbin == 0) {
248
-					warning("chromosome ",chromosome," is smaller than binsize. Skipped.")
249
-					next
250
-				}
251
-				## Initialize vectors
252
-				chroms <- rep(chromosome,numbin)
253
-				reads <- rep(0,numbin)
254
-				start <- seq(from=1, by=binsize, length.out=numbin)
255
-				end <- seq(from=binsize, by=binsize, length.out=numbin)
256
-# 	 			end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired
199
+			## Check last incomplete bin
200
+			incomplete.bin <- seqlengths(data)[chromosome] %% binsize > 0
201
+			if (incomplete.bin) {
202
+				numbin <- floor(seqlengths(data)[chromosome]/binsize)	# floor: we don't want incomplete bins, ceiling: we want incomplete bins at the end
257 203
 			} else {
258
-				## Initialize vectors
259
-				chroms <- rep(chromosome,numbin)
260
-				start <- round(seq(from=1, to=seqlengths(data)[chromosome], length.out=numbin+1))
261
-				end <- start[-1] - 1
262
-				end[length(end)] <- end[length(end)] + 1
263
-				start <- start[-length(start)]
204
+				numbin <- seqlengths(data)[chromosome]/binsize
264 205
 			}
206
+			if (numbin == 0) {
207
+				warning("chromosome ",chromosome," is smaller than binsize. Skipped.")
208
+				next
209
+			}
210
+			## Initialize vectors
211
+			chroms <- rep(chromosome,numbin)
212
+			reads <- rep(0,numbin)
213
+			start <- seq(from=1, by=binsize, length.out=numbin)
214
+			end <- seq(from=binsize, by=binsize, length.out=numbin)
215
+# 			end[length(end)] <- seqlengths(data)[chromosome] # last ending coordinate is size of chromosome, only if incomplete bins are desired
265 216
 
266 217
 			## Create binned chromosome as GRanges object
267 218
 			i.binned.data <- GenomicRanges::GRanges(seqnames = Rle(chromosome, numbin),
... ...
@@ -273,44 +224,35 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE
273 224
 			suppressWarnings(
274 225
 				binned.data[[chromosome]] <- i.binned.data
275 226
 			)
276
-		}
227
+
228
+		} ### end loop chromosomes ###
277 229
 		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
278 230
 		binned.data <- unlist(binned.data)
279 231
 		names(binned.data) <- NULL
280 232
 
281
-		## Count overlaps
282
-		message("  counting overlaps ...", appendLF=F); ptm <- proc.time()
283
-		if (format=="bam" | format=="bed") {
284
-			mcounts <- GenomicRanges::countOverlaps(binned.data, data[strand(data)=='-'])
285
-			pcounts <- GenomicRanges::countOverlaps(binned.data, data[strand(data)=='+'])
286
-			counts <- mcounts + pcounts
287
-		} else if (format=="bedGraph") {
288
-			# Take the max value from all regions that fall into / overlap a given bin as read counts
289
-			midx <- as.matrix(findOverlaps(binned.data, data))
290
-			counts <- rep(0,length(binned.data))
291
-			signal <- mcols(data)$signal
292
-			rle <- rle(midx[,1])
293
-			read.idx <- rle$values
294
-			max.idx <- cumsum(rle$lengths)
295
-			maxvalues <- rep(NA, length(read.idx))
296
-			maxvalues[1] <- max(signal[midx[1:(max.idx[1]),2]])
297
-			for (i1 in 2:length(read.idx)) {
298
-				maxvalues[i1] <- max(signal[midx[(max.idx[i1-1]+1):(max.idx[i1]),2]])
299
-			}
300
-			counts[read.idx] <- maxvalues
301
-			mcounts <- rep(NA, length(counts))
302
-			pcounts <- rep(NA, length(counts))
303
-		}
304
-		if (is.null(numbins)) {
305
-			mcols(binned.data)$counts <- counts
306
-			mcols(binned.data)$mcounts <- mcounts
307
-			mcols(binned.data)$pcounts <- pcounts
233
+		### Loop over offsets ###
234
+		countmatrices <- list()
235
+		if (is.null(stepsize)) {
236
+			offsets <- 0
308 237
 		} else {
309
-			mcols(binned.data)$counts <- counts / (end - start)
310
-			mcols(binned.data)$mcounts <- mcounts / (end - start)
311
-			mcols(binned.data)$pcounts <- pcounts / (end - start)
238
+			offsets <- seq(from=0, to=binsize, by=as.integer(stepsize*binsize))
312 239
 		}
313
-		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
240
+		for (ioff in offsets) {
241
+			## Count overlaps
242
+			message(paste0("  counting overlaps for offset ",ioff," ..."), appendLF=F); ptm <- proc.time()
243
+			binned.data.shift <- suppressWarnings( shift(binned.data, shift=ioff) )
244
+			if (format=="bam" | format=="bed") {
245
+				mcounts <- suppressWarnings( GenomicRanges::countOverlaps(binned.data.shift, data.minus) )
246
+				pcounts <- suppressWarnings( GenomicRanges::countOverlaps(binned.data.shift, data.plus) )
247
+				counts <- mcounts + pcounts
248
+			}
249
+			countmatrix <- matrix(c(counts,mcounts,pcounts), ncol=3)
250
+			colnames(countmatrix) <- c('counts','mcounts','pcounts')
251
+			countmatrices[[as.character(ioff)]] <- countmatrix
252
+			time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
253
+		}
254
+		mcols(binned.data) <- as(countmatrices[['0']],'DataFrame')
255
+		attr(binned.data,'offset.counts') <- countmatrices
314 256
 
315 257
 		if (length(binned.data) == 0) {
316 258
 			warning(paste0("The bin size of ",binsize," with reads per bin ",reads.per.bin," is larger than any of the chromosomes."))
... ...
@@ -328,11 +270,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE
328 270
 		### Save or return the binned data ###
329 271
 		if (save.as.RData==TRUE) {
330 272
 			# Save to file
331
-			if (is.null(numbins)) {
332
-				filename <- paste0(ID,"_binsize_",format(binsize, scientific=T, trim=T),"_reads.per.bin_",readsperbin,"_.RData")
333
-			} else {
334
-				filename <- paste0(ID,"_numbin_",format(numbin, scientific=T, trim=T),"_.RData")
335
-			}
273
+			filename <- paste0(ID,"_binsize_",format(binsize, scientific=T, trim=T),"_reads.per.bin_",readsperbin,"_.RData")
336 274
 			message("Saving to file ...", appendLF=F); ptm <- proc.time()
337 275
 # 			attr(binned.data, 'call') <- call # do not store along with GRanges because it inflates disk usage
338 276
 			save(binned.data, file=file.path(outputfolder.binned,filename) )
... ...
@@ -342,7 +280,7 @@ align2binned <- function(file, format, ID=basename(file), bamindex=file, pairedE
342 280
 			return(binned.data)
343 281
 		}
344 282
 
345
-	}
283
+	} ### end loop binsizes ###
346 284
 
347 285
 
348 286
 }
... ...
@@ -37,10 +37,15 @@ insertchr <- function(hmm.gr) {
37 37
 exportCNVs <- function(hmm.list, filename, cluster=TRUE, export.CNV=TRUE, export.SCE=TRUE) {
38 38
 
39 39
 	## Get segments and SCE coordinates
40
-	temp <- getSegments(hmm.list, cluster=cluster, getSCE=export.SCE)
40
+	hmm.list <- loadHmmsFromFiles(hmm.list)
41
+	temp <- getSegments(hmm.list, cluster=cluster)
41 42
 	hmm.grl <- temp$segments
43
+	if (cluster) {
44
+		hmm.list <- hmm.list[temp$clustering$order]
45
+	}
42 46
 	if (export.SCE) {
43
-		sce <- temp$sce
47
+		sce <- lapply(hmm.list,'[[','sce')
48
+		names(sce) <- lapply(hmm.list,'[[','ID')
44 49
 		sce <- sce[!unlist(lapply(sce, is.null))]
45 50
 		sce <- sce[lapply(sce, length)!=0]		
46 51
 		if (length(sce)==0) {
... ...
@@ -6,8 +6,8 @@
6 6
 #'
7 7
 #' \code{findCNVs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission densitiy (only zero read counts), 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper
8 8
 #' @author Aaron Taudt
9
-#' @param method Currently only \code{c('univariate')}.
10 9
 #' @inheritParams univariate.findCNVs
10
+#' @param method One of \code{c('univariate','bivariate')}. In the univariate case strand information is discarded, while in the bivariate case strand information is used for the fitting.
11 11
 #' @examples
12 12
 #'## Get an example BAM file with single-cell-sequencing reads
13 13
 #'bamfile <- system.file("extdata/BB140820_I_002.bam", package="aneufinder")
... ...
@@ -18,7 +18,7 @@
18 18
 #'## Check the fit
19 19
 #'plot(model, type='histogram')
20 20
 #' @export
21
-findCNVs <- function(binned.data, ID=NULL, method='univariate', eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy") {
21
+findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy", method="univariate", algorithm="EM", allow.odd.states=TRUE, initial.params=NULL) {
22 22
 
23 23
 	## Intercept user input
24 24
 	if (class(binned.data) != 'GRanges') {
... ...
@@ -37,8 +37,10 @@ findCNVs <- function(binned.data, ID=NULL, method='univariate', eps=0.1, init="s
37 37
 	ptm <- proc.time()
38 38
 	message("Find CNVs for ID = ",ID, ":")
39 39
 
40
-	if (method=='univariate') {
41
-		model <- univariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, strand=strand, states=states, most.frequent.state=most.frequent.state)
40
+	if (method == 'univariate') {
41
+		model <- univariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, strand=strand, states=states, most.frequent.state=most.frequent.state, algorithm=algorithm, initial.params=initial.params)
42
+	} else if (method == 'bivariate') {
43
+		model <- bivariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state, initial.params=initial.params)
42 44
 	}
43 45
 
44 46
 	attr(model, 'call') <- call
... ...
@@ -70,8 +72,10 @@ findCNVs <- function(binned.data, ID=NULL, method='univariate', eps=0.1, init="s
70 72
 #' @param strand Run the HMM only for the specified strand. One of \code{c('+', '-', '*')}.
71 73
 #' @param states A subset or all of \code{c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")}. This vector defines the states that are used in the Hidden Markov Model. The order of the entries should not be changed.
72 74
 #' @param most.frequent.state One of the states that were given in \code{states} or 'none'. The specified state is assumed to be the most frequent one. This can help the fitting procedure to converge into the correct fit.
75
+#' @param algorithm One of \code{c('baumWelch','EM')}. The expectation maximization (\code{'EM'}) will find the most likely states and fit the best parameters to the data, the \code{'baumWelch'} will find the most likely states using the initial parameters.
76
+#' @param initial.params A \code{\link{aneuHMM}} object or file containing such an object from which initial starting parameters will be extracted.
73 77
 #' @return An \code{\link{aneuHMM}} object.
74
-univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy") {
78
+univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="disomy", algorithm="EM", initial.params=NULL) {
75 79
 
76 80
 	### Define cleanup behaviour ###
77 81
 	on.exit(.C("R_univariate_cleanup"))
... ...
@@ -94,6 +98,23 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
94 98
 	if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer")
95 99
 	if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'")
96 100
 	if (!most.frequent.state %in% c(states,'none')) stop("argument 'most.frequent.state' must be one of c(",paste(c(states,'none'), collapse=","),")")
101
+	if (!algorithm %in% c('baumWelch','EM')) {
102
+		stop("argument 'algorithm' expects one of c('baumWelch','EM')")
103
+	}
104
+	if (algorithm == 'baumWelch' & num.trials>1) {
105
+		warning("Set 'num.trials <- 1' because 'algorithm==\"baumWelch\"'.")
106
+		num.trials <- 1
107
+	}
108
+	initial.params <- loadHmmsFromFiles(initial.params)[[1]]
109
+	if (class(initial.params)!=class.univariate.hmm & !is.null(initial.params)) {
110
+		stop("argument 'initial.params' expects a ",class.univariate.hmm," object or file that contains such an object")
111
+	}
112
+	if (algorithm == 'baumWelch' & is.null(initial.params)) {
113
+		warning("'initial.params' should be specified if 'algorithm=\"baumWelch\"")
114
+	}
115
+	if (!is.null(initial.params)) {
116
+		init <- 'initial.params'
117
+	}
97 118
 
98 119
 	warlist <- list()
99 120
 	if (is.null(eps.try) | num.trials==1) eps.try <- eps
... ...
@@ -102,6 +123,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
102 123
 	temp <- initializeStates(states)
103 124
 	state.labels <- temp$labels
104 125
 	state.distributions <- temp$distributions
126
+	multiplicity <- temp$multiplicity
105 127
 	dependent.states.mask <- state.labels %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy")
106 128
 	numstates <- length(states)
107 129
 	numbins <- length(binned.data)
... ...
@@ -114,6 +136,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
114 136
 		select <- 'counts'
115 137
 	}
116 138
 	counts <- mcols(binned.data)[,select]
139
+	algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM'))
117 140
 
118 141
 	### Make return object
119 142
 		result <- list()
... ...
@@ -121,10 +144,10 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
121 144
 		result$ID <- ID
122 145
 		result$bins <- binned.data
123 146
 	## Quality info
124
-		qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins, 'complexity.preseqR'), bhattacharyya=NA)
147
+		qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA)
125 148
 		result$qualityInfo <- qualityInfo
126 149
 
127
-	# Check if there are reads in the data, otherwise HMM will blow up
150
+	# Check if there are counts in the data, otherwise HMM will blow up
128 151
 	if (any(is.na(counts))) {
129 152
 		stop(paste0("ID = ",ID,": NAs found in reads."))
130 153
 	}
... ...
@@ -156,13 +179,24 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
156 179
 		message(paste0("Trial ",i_try," / ",num.trials))
157 180
 
158 181
 		## Initial parameters
159
-		if (init == 'random') {
182
+		if (init == 'initial.params') {
183
+			A.initial <- initial.params$transitionProbs
184
+			proba.initial <- initial.params$startProbs
185
+			size.initial <- initial.params$distributions[,'size']
186
+			prob.initial <- initial.params$distributions[,'prob']
187
+			size.initial[is.na(size.initial)] <- 0
188
+			prob.initial[is.na(prob.initial)] <- 0
189
+		} else if (init == 'random') {
160 190
 			A.initial <- matrix(runif(numstates^2), ncol=numstates)
161 191
 			A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")			
162 192
 			proba.initial <- runif(numstates)
163 193
 			# Distributions for dependent states
164 194
 			size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask)
165 195
 			prob.initial <- runif(1) * dependent.states.mask
196
+			# Assign initials for the nullsomy distribution
197
+			index <- which('nullsomy'==state.labels)
198
+			size.initial[index] <- 1
199
+			prob.initial[index] <- 0.5
166 200
 		} else if (init == 'standard') {
167 201
 			A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
168 202
 			for (irow in 1:numstates) {
... ...
@@ -172,8 +206,9 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
172 206
 				}
173 207
 			}
174 208
 			proba.initial <- rep(1/numstates, numstates)
175
-			mean.initial.monosomy <- mean(counts[counts>0])/2
176
-			var.initial.monosomy <- var(counts[counts>0])/2
209
+			divf <- max(multiplicity[most.frequent.state], 1)
210
+			mean.initial.monosomy <- mean(counts[counts>0])/divf
211
+			var.initial.monosomy <- var(counts[counts>0])/divf
177 212
 			if (is.na(mean.initial.monosomy)) {
178 213
 				mean.initial.monosomy <- 1
179 214
 			}
... ...
@@ -197,11 +232,11 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
197 232
 				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
198 233
 				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
199 234
 			}
235
+			# Assign initials for the nullsomy distribution
236
+			index <- which('nullsomy'==state.labels)
237
+			size.initial[index] <- 1
238
+			prob.initial[index] <- 0.5
200 239
 		}
201
-		# Assign initials for the nullsomy distribution
202
-		index <- which('nullsomy'==state.labels)
203
-		size.initial[index] <- 1
204
-		prob.initial[index] <- 0.5
205 240
 	
206 241
 		hmm <- .C("R_univariate_hmm",
207 242
 			counts = as.integer(counts), # int* O
... ...
@@ -226,7 +261,8 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
226 261
 			use.initial.params = as.logical(1), # bool* use_initial_params
227 262
 			num.threads = as.integer(num.threads), # int* num_threads
228 263
 			error = as.integer(0), # int* error (error handling)
229
-			count.cutoff = as.integer(count.cutoff) # int* count.cutoff
264
+			count.cutoff = as.integer(count.cutoff), # int* count.cutoff
265
+			algorithm = as.integer(algorithm) # int* algorithm
230 266
 		)
231 267
 
232 268
 		hmm$eps <- eps.try
... ...
@@ -237,13 +273,6 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
237 273
 			# Store model in list
238 274
 			hmm$counts <- NULL
239 275
 			modellist[[as.character(i_try)]] <- hmm
240
-# 			# Check if most.frequent.state is more than half of actual most frequent state and stop trials
241
-# 			if (most.frequent.state %in% state.labels) {
242
-# 				imostfrequent <- which(state.labels==most.frequent.state)
243
-# 				if (hmm$weights[imostfrequent] / max(hmm$weights) > 0.5) {
244
-# 					break
245
-# 				}
246
-# 			}
247 276
 			init <- 'random'
248 277
 		} else if (num.trials == 1) {
249 278
 			if (hmm$loglik.delta > eps) {
... ...
@@ -303,7 +332,8 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
303 332
 				use.initial.params = as.logical(1), # bool* use_initial_params
304 333
 				num.threads = as.integer(num.threads), # int* num_threads
305 334
 				error = as.integer(0), # int* error (error handling)
306
-				count.cutoff = as.integer(count.cutoff) # int* count.cutoff
335
+				count.cutoff = as.integer(count.cutoff), # int* count.cutoff
336
+				algorithm = as.integer(algorithm)
307 337
 			)
308 338
 		}
309 339
 
... ...
@@ -346,9 +376,9 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
346 376
 			result$transitionProbs.initial <- transitionProbs.initial
347 377
 			# Initial probs
348 378
 			result$startProbs <- hmm$proba
349
-			names(result$startProbs) <- paste0("P(",state.labels,")")
379
+			names(result$startProbs) <- state.labels
350 380
 			result$startProbs.initial <- hmm$proba.initial
351
-			names(result$startProbs.initial) <- paste0("P(",state.labels,")")
381
+			names(result$startProbs.initial) <- state.labels
352 382
 			# Distributions
353 383
 				distributions <- data.frame()
354 384
 				distributions.initial <- data.frame()
... ...
@@ -376,7 +406,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
376 406
 			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error)
377 407
 			result$convergenceInfo <- convergenceInfo
378 408
 		## Quality info
379
-			qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins, 'complexity.preseqR'), bhattacharyya=qc.bhattacharyya(result))
409
+			qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result))
380 410
 			result$qualityInfo <- qualityInfo
381 411
 		} else if (hmm$error == 1) {
382 412
 			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
... ...
@@ -389,9 +419,426 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
389 419
 	## Issue warnings
390 420
 	result$warnings <- warlist
391 421
 
392
-#   return(list(result, modellist))
393 422
 	## Return results
394 423
 	return(result)
395 424
 }
396 425
 
397 426
 
427
+#' Find copy number variations (bivariate)
428
+#'
429
+#' \code{bivariate.findCNVs} finds CNVs using read count information from both strands.
430
+#'
431
+#' @inheritParams univariate.findCNVs
432
+#' @param allow.odd.states If set to \code{TRUE}, all states are allowed. If \code{FALSE}, only states which have an even multiplicity (plus nullsomy-monosomy states) are allowed, e.g. disomy-disomy, monosomy-trisomy.
433
+bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, allow.odd.states=TRUE, states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy", algorithm='EM', initial.params=NULL) {
434
+
435
+	## Intercept user input
436
+	if (class(binned.data) != 'GRanges') {
437
+		binned.data <- get(load(binned.data))
438
+		if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'counts' or a file that contains such an object")
439
+	}
440
+	if (is.null(ID)) {
441
+		ID <- attr(binned.data, 'ID')
442
+	}
443
+	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
444
+	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
445
+	if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
446
+	if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer")
447
+	if (!is.null(eps.try)) {
448
+		if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric")
449
+	}
450
+	if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer")
451
+	initial.params <- loadHmmsFromFiles(initial.params)[[1]]
452
+	if (class(initial.params)!=class.bivariate.hmm & !is.null(initial.params)) {
453
+		stop("argument 'initial.params' expects a ",class.bivariate.hmm," object or file that contains such an object")
454
+	}
455
+	if (algorithm == 'baumWelch' & is.null(initial.params)) {
456
+		warning("'initial.params' should be specified if 'algorithm=\"baumWelch\"")
457
+	}
458
+	if (!is.null(initial.params)) {
459
+		init <- 'initial.params'
460
+	}
461
+
462
+	warlist <- list()
463
+	if (is.null(eps.try)) eps.try <- eps
464
+
465
+	## Variables
466
+	num.bins <- length(binned.data)
467
+	temp <-  initializeStates(states)
468
+	multiplicity <- temp$multiplicity
469
+	state.labels <- temp$labels
470
+	algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM'))
471
+
472
+	## Get counts
473
+	select <- 'counts'
474
+	counts <- matrix(c(mcols(binned.data)[,paste0('m',select)], mcols(binned.data)[,paste0('p',select)]), ncol=2, dimnames=list(bin=1:num.bins, strand=c('minus','plus')))
475
+	maxcounts <- max(counts)
476
+
477
+	## Filter high counts out, makes HMM faster
478
+	count.cutoff <- quantile(counts, count.cutoff.quantile)
479
+	names.count.cutoff <- names(count.cutoff)
480
+	count.cutoff <- ceiling(count.cutoff)
481
+	mask <- counts > count.cutoff
482
+	counts[mask] <- count.cutoff
483
+	numfiltered <- length(which(mask))
484
+	if (numfiltered > 0) {
485
+		message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n"))
486
+	}
487
+	
488
+	### Make return object
489
+		result <- list()
490
+		class(result) <- class.bivariate.hmm
491
+		result$ID <- ID
492
+		result$bins <- binned.data
493
+	## Quality info
494
+		qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=NA)
495
+		result$qualityInfo <- qualityInfo
496
+
497
+	# Check if there are counts in the data, otherwise HMM will blow up
498
+	if (!any(counts!=0)) {
499
+		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
500
+		result$warnings <- warlist
501
+		return(result)
502
+	} else if (any(counts<0)) {
503
+		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
504
+		result$warnings <- warlist
505
+		return(result)
506
+	}
507
+
508
+	if (init=='initial.params') {
509
+		uni.transitionProbs <- initial.params$univariateParams$transitionProbs
510
+		uni.startProbs <- initial.params$univariateParams$startProbs
511
+		distributions <- initial.params$distributions
512
+		uni.weights <- initial.params$univariateParams$weights
513
+		uni.states <- names(uni.weights)
514
+		num.uni.states <- length(uni.states)
515
+		num.models <- length(distributions)
516
+		num.bins <- length(initial.params$bins)
517
+		comb.states <- factor(names(initial.params$startProbs), levels=names(initial.params$startProbs))
518
+		num.comb.states <- length(comb.states)
519
+		states.list <- list(minus=initial.params$bins$mstate, plus=initial.params$bins$pstate)
520
+		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
521
+		A.initial <- initial.params$transitionProbs
522
+		proba.initial <- initial.params$startProbs
523
+		use.initial <- TRUE
524
+	} else {
525
+		### Stack the strands and run one univariate findCNVs
526
+		message("")
527
+		message(paste(rep('-',getOption('width')), collapse=''))
528
+		message("Running univariate")
529
+		binned.data.minus <- binned.data
530
+		strand(binned.data.minus) <- '-'
531
+		binned.data.minus$counts <- binned.data.minus$mcounts
532
+		binned.data.plus <- binned.data
533
+		strand(binned.data.plus) <- '+'
534
+		binned.data.plus$counts <- binned.data.plus$pcounts
535
+		binned.data.stacked <- c(binned.data.minus, binned.data.plus)
536
+		mask.attributes <- c('qualityInfo', 'ID', 'min.mapq')
537
+		attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes]
538
+
539
+		model.stacked <- univariate.findCNVs(binned.data.stacked, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=1, states=states, most.frequent.state=most.frequent.state)
540
+		model.minus <- model.stacked
541
+		model.minus$bins <- model.minus$bins[strand(model.minus$bins)=='-']
542
+		model.minus$segments <- model.minus$segments[strand(model.minus$segments)=='-']
543
+		model.plus <- model.stacked
544
+		model.plus$bins <- model.plus$bins[strand(model.plus$bins)=='+']
545
+		model.plus$segments <- model.plus$segments[strand(model.plus$segments)=='+']
546
+
547
+		models <- list(minus=model.minus, plus=model.plus)
548
+
549
+		## Extract counts and other stuff
550
+		uni.transitionProbs <- lapply(models, '[[', 'transitionProbs')[[1]]
551
+		uni.startProbs <- lapply(models, '[[', 'startProbs')[[1]]
552
+		distributions <- lapply(models, '[[', 'distributions')
553
+		uni.weights <- lapply(models, '[[', 'weights')[[1]]
554
+		uni.states <- names(uni.weights)
555
+		num.uni.states <- length(uni.states)
556
+		num.models <- length(distributions)
557
+		num.bins <- length(models[[1]]$bins)
558
+		comb.states <- vector()
559
+		levels.state <- levels(models[[1]]$bins$state)
560
+		for (i1 in 1:length(levels.state)) {
561
+			for (i2 in 1:length(levels.state)) {
562
+				comb.state <- paste(levels.state[i1], levels.state[i2])
563
+				if (allow.odd.states) {
564
+					comb.states[length(comb.states)+1] <- comb.state
565
+				} else {
566
+					state.multiplicity <- multiplicity[levels.state[i1]] + multiplicity[levels.state[i2]]
567
+					# Only even states and null-mono states (for sex chromosomes)
568
+					if (state.multiplicity %% 2 == 0 | state.multiplicity == 1) {
569
+						comb.states[length(comb.states)+1] <- comb.state
570
+					}
571
+				}
572
+			}
573
+		}
574
+		comb.states <- factor(comb.states, levels=comb.states)
575
+		num.comb.states <- length(comb.states)
576
+		states.list <- lapply(models, function(x) { x$bins$state })
577
+		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
578
+		A.initial <- double(length=num.comb.states*num.comb.states)
579
+		proba.initial <- double(length=num.comb.states)
580
+		use.initial <- FALSE
581
+	}
582
+
583
+	### Prepare the multivariate HMM
584
+	message("")
585
+	message(paste(rep('-',getOption('width')), collapse=''))
586
+	message("Preparing bivariate HMM\n")
587
+
588
+	## Pre-compute z-values for each number of counts
589
+	message("Computing pre z-matrix...", appendLF=F); ptm <- proc.time()
590
+	z.per.count <- array(NA, dim=c(maxcounts+1, num.models, num.uni.states), dimnames=list(counts=0:maxcounts, strand=names(distributions), state=uni.states))
591
+	xcounts <- 0:maxcounts
592
+	for (istrand in 1:num.models) {
593
+		for (istate in 1:num.uni.states) {
594
+			if (distributions[[istrand]][istate,'type']=='dnbinom') {
595
+				size <- distributions[[istrand]][istate,'size']
596
+				prob <- distributions[[istrand]][istate,'prob']
597
+				u <- pnbinom(xcounts, size, prob)
598
+			} else if (distributions[[istrand]][istate,'type']=='delta') {
599
+				u <- rep(1, length(xcounts))
600
+			} else if (distributions[[istrand]][istate,'type']=='dgeom') {
601
+				prob <- distributions[[istrand]][istate,'prob']
602
+				u <- pgeom(xcounts, prob)
603
+			}
604
+			qnorm_u <- qnorm(u)
605
+			mask <- qnorm_u==Inf
606
+			qnorm_u[mask] <- qnorm(1-1e-16)
607
+			z.per.count[ , istrand, istate] <- qnorm_u
608
+		}
609
+	}
610
+	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
611
+
612
+	## Compute the z matrix
613
+	message("Transfering values into z-matrix...", appendLF=F); ptm <- proc.time()
614
+	z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(distributions), state=uni.states))
615
+	for (istrand in 1:num.models) {
616
+		for (istate in 1:num.uni.states) {
617
+			z.per.bin[ , istrand, istate] = z.per.count[counts[,istrand]+1, istrand, istate]
618
+		}
619
+	}
620
+	remove(z.per.count)
621
+	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
622
+
623
+	## Calculate correlation matrix
624
+	message("Computing inverse of correlation matrix...", appendLF=F); ptm <- proc.time()
625
+	correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(distributions), strand=names(distributions), comb.state=comb.states))
626
+	correlationMatrixInverse <- correlationMatrix
627
+	determinant <- rep(0, num.comb.states)
628
+	names(determinant) <- comb.states
629
+	usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states
630
+	names(usestateTF) <- comb.states
631
+	for (comb.state in comb.states) {
632
+		state <- strsplit(comb.state, ' ')[[1]]
633
+		mask <- which(comb.states.per.bin==comb.state)
634
+		# Subselect z
635
+		z.temp <- matrix(NA, ncol=num.models, nrow=length(mask))
636
+		for (istrand in 1:num.models) {
637
+			z.temp[,istrand] <- z.per.bin[mask, istrand, state[istrand]]
638
+		}
639
+		temp <- tryCatch({
640
+			if (nrow(z.temp) > 1) {
641
+				correlationMatrix[,,comb.state] <- cor(z.temp)
642
+				determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
643
+				correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
644
+				usestateTF[comb.state] <- TRUE
645
+			} else {
646
+				correlationMatrix[,,comb.state] <- diag(num.models)
647
+				determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
648
+				correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
649
+				usestateTF[comb.state] <- TRUE
650
+			}
651
+		}, warning = function(war) {
652
+			correlationMatrix[,,comb.state] <<- diag(num.models)
653
+			determinant[comb.state] <<- det( correlationMatrix[,,comb.state] )
654
+			correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state])
655
+			usestateTF[comb.state] <<- TRUE
656
+			war
657
+		}, error = function(err) {
658
+			correlationMatrix[,,comb.state] <<- diag(num.models)
659
+			determinant[comb.state] <<- det( correlationMatrix[,,comb.state] )
660
+			correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state])
661
+			usestateTF[comb.state] <<- TRUE
662
+			err
663
+		})
664
+	}
665
+	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
666
+
667
+	# Use only states with valid correlationMatrixInverse (all states are valid in this version)
668
+	correlationMatrix = correlationMatrix[,,usestateTF]
669
+	correlationMatrixInverse = correlationMatrixInverse[,,usestateTF]
670
+	comb.states = comb.states[usestateTF]
671
+	comb.states <- droplevels(comb.states)
672
+	determinant = determinant[usestateTF]
673
+	num.comb.states <- length(comb.states)
674
+
675
+	### Calculate multivariate densities for each state ###
676
+	message("Calculating multivariate densities...", appendLF=F); ptm <- proc.time()
677
+	densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states))
678
+	for (comb.state in comb.states) {
679
+		istate <- which(comb.state==comb.states)
680
+		state <- strsplit(comb.state, ' ')[[1]]
681
+		z.temp <- matrix(NA, ncol=num.models, nrow=num.bins)
682
+		product <- 1
683
+		for (istrand in 1:num.models) {
684
+			z.temp[,istrand] <- z.per.bin[, istrand, state[istrand]]
685
+			if (distributions[[istrand]][state[istrand],'type'] == 'dnbinom') {
686
+				size <- distributions[[istrand]][state[istrand],'size']
687
+				prob <- distributions[[istrand]][state[istrand],'prob']
688
+				product <- product * dnbinom(counts[,istrand], size, prob)
689
+			} else if (distributions[[istrand]][state[istrand],'type'] == 'dgeom') {
690
+				prob <- distributions[[istrand]][state[istrand],'prob']
691
+				product <- product * dgeom(counts[,istrand], prob)
692
+			} else if (distributions[[istrand]][state[istrand],'type'] == 'delta') {
693
+				product <- product * ifelse(counts[,istrand]==0, 1, 0)
694
+			}
695
+		}
696
+		exponent <- -0.5 * apply( ( z.temp %*% (correlationMatrixInverse[ , , istate] - diag(num.models)) ) * z.temp, 1, sum)
697
+		densities[,istate] <- product * determinant[istate]^(-0.5) * exp( exponent )
698
+	}
699
+	# Check if densities are > 1
700
+# 	if (any(densities>1)) stop("Densities > 1")
701
+# 	if (any(densities<0)) stop("Densities < 0")
702
+	densities[densities>1] <- 1
703
+	densities[densities<0] <- 0
704
+	# Check if densities are 0 everywhere in some bins
705
+	check <- which(apply(densities, 1, sum) == 0)
706
+	if (length(check)>0) {
707
+		if (check[1]==1) {
708
+			densities[1,] <- rep(1e-10, ncol(densities))
709
+			check <- check[-1]
710
+		}
711
+		for (icheck in check) {
712
+			densities[icheck,] <- densities[icheck-1,]
713
+		}
714
+	}
715
+	time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
716
+		
717
+	### Define cleanup behaviour ###
718
+	on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states)))
719
+
720
+	### Run the multivariate HMM
721
+	# Call the C function
722
+	hmm <- .C("R_multivariate_hmm",
723
+		densities = as.double(densities), # double* D
724
+		num.bins = as.integer(num.bins), # int* T
725
+		num.comb.states = as.integer(num.comb.states), # int* N
726
+		num.strands = as.integer(num.models), # int* Nmod
727
+		comb.states = as.integer(comb.states), # int* comb_states
728
+		num.iterations = as.integer(max.iter), # int* maxiter
729
+		time.sec = as.integer(max.time), # double* maxtime
730
+		loglik.delta = as.double(eps), # double* eps
731
+		states = integer(length=num.bins), # int* states
732
+		A = double(length=num.comb.states*num.comb.states), # double* A
733
+		proba = double(length=num.comb.states), # double* proba
734
+		loglik = double(length=1), # double* loglik
735
+		A.initial = as.vector(A.initial), # double* initial_A
736
+		proba.initial = as.vector(proba.initial), # double* initial_proba
737
+		use.initial.params = as.logical(use.initial), # bool* use_initial_params
738
+		num.threads = as.integer(num.threads), # int* num_threads
739
+		error = as.integer(0), # error handling
740
+		algorithm = as.integer(algorithm) # int* algorithm
741
+		)
742
+			
743
+	### Check convergence ###
744
+	war <- NULL
745
+	if (hmm$loglik.delta > eps) {
746
+		war <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
747
+	}
748
+	if (hmm$error == 1) {
749
+		warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
750
+	} else if (hmm$error == 2) {
751
+		warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
752
+	}
753
+
754
+	### Make return object ###
755
+	if (hmm$error == 0) {
756
+		result <- list()
757
+		class(result) <- class.bivariate.hmm
758
+		result$ID <- ID
759
+	## Bin coordinates and states
760
+		result$bins <- binned.data
761
+		result$bins$state <- comb.states[hmm$states]
762
+		# Get states as factors in data.frame
763
+		matrix.states <- matrix(unlist(strsplit(as.character(result$bins$state), split=' ')), byrow=T, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(distributions)))
764
+		names <- c('mstate','pstate')
765
+		for (i1 in 1:num.models) {
766
+			mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=uni.states)
767
+		}
768
+	## Segmentation
769
+		message("Making segmentation ...", appendLF=F); ptm <- proc.time()
770
+		gr <- result$bins
771
+		red.gr.list <- GRangesList()
772
+		for (state in comb.states) {
773
+			red.gr <- GenomicRanges::reduce(gr[gr$state==state])
774
+			if (length(red.gr)>0) {
775
+				mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr))
776
+				red.gr.list[[length(red.gr.list)+1]] <- red.gr
777
+			}
778
+		}
779
+		red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list))
780
+		result$segments <- red.gr
781
+		seqlengths(result$segments) <- seqlengths(result$bins)
782
+		time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
783
+	## CNV state for both strands combined
784
+		# Bins
785
+		state <- multiplicity[result$bins$mstate] + multiplicity[result$bins$pstate]
786
+		state[state>max(multiplicity)] <- max(multiplicity)
787
+		multiplicity.inverse <- names(multiplicity)
788
+		names(multiplicity.inverse) <- multiplicity
789
+		state <- multiplicity.inverse[as.character(state)]
790
+		state[(result$bins$mstate=='nullsomy' | result$bins$pstate=='nullsomy') & state=='zero-inflation'] <- 'nullsomy'
791
+    result$bins$state <- factor(state, levels=names(multiplicity))
792
+		# Segments
793
+		str <- strsplit(as.character(result$segments$state),' ')
794
+		result$segments$mstate <- factor(unlist(lapply(str, '[[', 1)), levels=state.labels)
795
+		result$segments$pstate <- factor(unlist(lapply(str, '[[', 2)), levels=state.labels)
796
+		state <- multiplicity[result$segments$mstate] + multiplicity[result$segments$pstate]
797
+		state[state>max(multiplicity)] <- max(multiplicity)
798
+		multiplicity.inverse <- names(multiplicity)
799
+		names(multiplicity.inverse) <- multiplicity
800
+		state <- multiplicity.inverse[as.character(state)]
801
+		state[(result$segments$mstate=='nullsomy' | result$segments$pstate=='nullsomy') & state=='zero-inflation'] <- 'nullsomy'
802
+    result$segments$state <- factor(state, levels=names(multiplicity))
803
+		## Parameters
804
+			# Weights
805
+			tstates <- table(result$bins$state)
806
+			result$weights <- tstates/sum(tstates)
807
+			# Transition matrices
808
+			result$transitionProbs <- matrix(hmm$A, ncol=num.comb.states)
809
+			colnames(result$transitionProbs) <- comb.states
810
+			rownames(result$transitionProbs) <- comb.states
811
+			result$transitionProbs.initial <- matrix(hmm$A.initial, ncol=num.comb.states)
812
+			colnames(result$transitionProbs.initial) <- comb.states
813
+			rownames(result$transitionProbs.initial) <- comb.states
814
+			# Initial probs
815
+			result$startProbs <- hmm$proba
816
+			names(result$startProbs) <- comb.states
817
+			result$startProbs.initial <- hmm$proba.initial
818
+			names(result$startProbs.initial) <- comb.states
819
+			# Distributions
820
+			result$distributions <- distributions
821
+		## Convergence info
822
+			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec)
823
+			result$convergenceInfo <- convergenceInfo
824
+		## Quality info
825
+			qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins,'qualityInfo')$complexity$preseqR, bhattacharyya=qc.bhattacharyya(result))
826
+			result$qualityInfo <- qualityInfo
827
+		## Univariate infos
828
+			univariateParams <- list(transitionProbs=uni.transitionProbs, startProbs=uni.startProbs, distributions=distributions[[1]], weights=uni.weights)
829
+			result$univariateParams <- univariateParams
830
+	} else if (hmm$error == 1) {
831
+		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
832
+	} else if (hmm$error == 2) {
833
+		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
834
+	}
835
+
836
+	## Issue warnings
837
+	result$warnings <- warlist
838
+
839
+	## Return results
840
+	return(result)
841
+}
842
+
843
+
844
+
... ...
@@ -4,10 +4,11 @@
4 4
 #'
5 5
 #' \code{findSCEs} classifies the binned read counts into several states which represent the number of chromatids on each strand.
6 6
 #'
7
-#' \code{findSCEs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission densitiy (only zero read counts), 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper
7
+#' \code{findSCEs} uses a Hidden Markov Model to classify the binned read counts: state 'zero-inflation' with a delta function as emission densitiy (only zero read counts), 'nullsomy' with geometric distribution, 'monosomy','disomy','trisomy','tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A expectation-maximization (EM) algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper
8 8
 #' @author Aaron Taudt
9
-#' @inheritParams univariate.findSCEs
10
-#' @inheritParams bivariate.findSCEs
9
+#' @inheritParams univariate.findCNVs
10
+#' @inheritParams bivariate.findCNVs
11
+#' @inheritParams getSCEcoordinates
11 12
 #' @return An \code{\link{aneuBiHMM}} object.
12 13
 #' @examples
13 14
 #'## Get an example BAM file with single-cell-sequencing reads
... ...
@@ -19,7 +20,7 @@
19 20
 #'## Check the fit
20 21
 #'plot(model, type='histogram')
21 22
 #' @export
22
-findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', allow.odd.states=TRUE, states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy") {
23
+findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=5, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', allow.odd.states=TRUE, states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy", algorithm="EM", initial.params=NULL, resolution=c(3,6), min.segwidth=2, fragments=NULL, min.reads=50) {
23 24
 
24 25
 	## Intercept user input
25 26
 	if (class(binned.data) != 'GRanges') {
... ...
@@ -38,7 +39,23 @@ findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1
38 39
 	ptm <- proc.time()
39 40
 	message("Find CNVs for ID = ",ID, ":")
40 41
 
41
-	model <- bivariate.findSCEs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state)
42
+	model <- bivariate.findCNVs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state, algorithm=algorithm, initial.params=initial.params)
43
+	model$sce <- getSCEcoordinates(model, resolution=resolution, min.segwidth=min.segwidth, fragments=fragments, min.reads=min.reads)
44
+	
45
+# 	## Find CNV calls for offset counts using the parameters from the normal run
46
+# 	offsets <- setdiff(names(attr(binned.data,'offset.counts')), 0)
47
+# 	if (!is.null(offsets)) {
48
+# 		offset.models <- list()
49
+# 		for (ioff in offsets) {
50
+# 			message(paste0("Finding SCE for offset ",ioff))
51
+# 			off.counts <- attr(binned.data,'offset.counts')[[as.character(ioff)]]
52
+# 			off.binned.data <- binned.data
53
+# 			mcols(off.binned.data)[names(mcols(binned.data)) %in% names(off.counts)] <- as(off.counts, 'DataFrame')
54
+# 			off.model <- suppressMessages( bivariate.findCNVs(off.binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=count.cutoff.quantile, allow.odd.states=allow.odd.states, states=states, most.frequent.state=most.frequent.state, algorithm='baumWelch', initial.params=model) )
55
+# 			offset.models[[as.character(ioff)]] <- off.model
56
+# 		}
57
+# 	}
58
+# 	return(offset.models)
42 59
 
43 60
 	attr(model, 'call') <- call
44 61
 	time <- proc.time() - ptm
... ...
@@ -48,740 +65,6 @@ findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1
48 65
 }
49 66
 
50 67
 
51
-#' Find copy number variations (univariate)
52
-#'
53
-#' \code{univariate.findSCEs} classifies the input binned read counts into several states which represent copy-number-variation.
54
-#'
55
-#' \code{univariate.findSCEs} is almost the same as \code{findCNVs}. However, it has slighlty different parameter settings and variable preparations because it is used by \code{bivariate.findSCEs} for CNV calling on both strands separately.
56
-#'
57
-#' @param binned.data A \link{GRanges} object with binned read counts.
58
-#' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
59
-#' @param eps Convergence threshold for the Baum-Welch algorithm.
60
-#' @param init One of the following initialization procedures:
61
-#'	\describe{
62
-#'		\item{\code{standard}}{The negative binomial of state 'disomy' will be initialized with \code{mean=mean(counts)}, \code{var=var(counts)}. This procedure usually gives good convergence.}
63
-#'		\item{\code{random}}{Mean and variance of the negative binomial of state 'disomy' will be initialized with random values (in certain boundaries, see source code). Try this if the \code{standard} procedure fails to produce a good fit.}
64
-#'	}
65
-#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. The default -1 is no limit.
66
-#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. The default -1 is no limit.
67
-#' @param num.trials The number of trials to find a fit where state \code{most.frequent.state} is most frequent. Each time, the HMM is seeded with different random initial values.
68
-#' @param eps.try If code num.trials is set to greater than 1, \code{eps.try} is used for the trial runs. If unset, \code{eps} is used.
69
-#' @param num.threads Number of threads to use. Setting this to >1 may give increased performance.
70
-#' @param count.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. Set \code{count.cutoff.quantile=1} in this case.
71
-#' @param strand Run the HMM only for the specified strand. One of \code{c('+', '-', '*')}.
72
-#' @param states A subset or all of \code{c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")}. This vector defines the states that are used in the Hidden Markov Model. The order of the entries should not be changed.
73
-#' @param most.frequent.state One of the states that were given in \code{states} or 'none'. The specified state is assumed to be the most frequent one. This can help the fitting procedure to converge into the correct fit. If set to 'none', no state is assumed to be most frequent.
74
-#' @return An \code{\link{aneuHMM}} object.
75
-univariate.findSCEs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), most.frequent.state="monosomy") {
76
-
77
-	### Define cleanup behaviour ###
78
-	on.exit(.C("R_univariate_cleanup"))
79
-
80
-	## Intercept user input
81
-	if (class(binned.data) != 'GRanges') {
82
-		binned.data <- get(load(binned.data))
83
-		if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'counts' or a file that contains such an object")
84
-	}
85
-	if (is.null(ID)) {
86
-		ID <- attr(binned.data, 'ID')
87
-	}
88
-	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
89
-	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
90
-	if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
91
-	if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer")
92
-	if (!is.null(eps.try)) {
93
-		if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric")
94
-	}
95
-	if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer")
96
-	if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'")
97
-	if (!most.frequent.state %in% c(states,'none')) stop("argument 'most.frequent.state' must be one of c(",paste(c(states,'none'), collapse=","),")")
98
-
99
-	warlist <- list()
100
-	if (is.null(eps.try) | num.trials==1) eps.try <- eps
101
-
102
-	## Assign variables
103
-	temp <- initializeStates(states)
104
-	state.labels <- temp$labels
105
-	state.distributions <- temp$distributions
106
-	dependent.states.mask <- state.labels %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy")
107
-	numstates <- length(states)
108
-	numbins <- length(binned.data)
109
-	iniproc <- which(init==c("standard","random")) # transform to int
110
-	if (strand=='+') {
111
-		select <- 'pcounts'
112
-	} else if (strand=='-') {
113
-		select <- 'mcounts'
114
-	} else if (strand=='*') {
115
-		select <- 'counts'
116
-	}
117
-	counts <- mcols(binned.data)[,select]
118
-
119
-	### Make return object
120
-		result <- list()
121
-		class(result) <- class.univariate.hmm
122
-		result$ID <- ID
123
-		result$bins <- binned.data
124
-	## Quality info
125
-		qualityInfo <- list(shannon.entropy=qc.entropy(counts), spikyness=qc.spikyness(counts), complexity=attr(result$bins, 'complexity.preseqR'), bhattacharyya=NA)
126
-		result$qualityInfo <- qualityInfo
127
-
128
-	# Check if there are counts in the data, otherwise HMM will blow up
129
-	if (any(is.na(counts))) {
130
-		stop(paste0("ID = ",ID,": NAs found in reads."))
131
-	}
132
-	if (!any(counts!=0)) {
133
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
134
-		result$warnings <- warlist
135
-		return(result)
136
-	} else if (any(counts<0)) {
137
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
138
-		result$warnings <- warlist
139
-		return(result)
140
-	}
141
-		
142
-
143
-	# Filter high counts out, makes HMM faster
144
-	count.cutoff <- quantile(counts, count.cutoff.quantile)
145
-	names.count.cutoff <- names(count.cutoff)
146
-	count.cutoff <- ceiling(count.cutoff)
147
-	mask <- counts > count.cutoff
148
-	counts[mask] <- count.cutoff
149
-	numfiltered <- length(which(mask))
150
-	if (numfiltered > 0) {
151
-		message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n"))
152
-	}
153
-	
154
-	## Call univariate in a for loop to enable multiple trials
155
-	modellist <- list()
156
-	for (i_try in 1:num.trials) {
157
-		message(paste0("Trial ",i_try," / ",num.trials))
158
-
159
-		## Initial parameters
160
-		if (init == 'random') {
161
-			A.initial <- matrix(runif(numstates^2), ncol=numstates)
162
-			A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")			
163
-			proba.initial <- runif(numstates)
164
-			# Distributions for dependent states
165
-			size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask)
166
-			prob.initial <- runif(1) * dependent.states.mask
167
-		} else if (init == 'standard') {
168
-			A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
169
-			for (irow in 1:numstates) {
170
-				for (icol in 1:numstates) {
171
-					if (irow==icol) { A.initial[irow,icol] <- 0.9 }
172
-					else { A.initial[irow,icol] <- 0.1/(numstates-1) }
173
-				}
174
-			}
175
-			proba.initial <- rep(1/numstates, numstates)
176
-			mean.initial.monosomy <- mean(counts[counts>0])
177
-			var.initial.monosomy <- var(counts[counts>0])
178
-			if (is.na(mean.initial.monosomy)) {
179
-				mean.initial.monosomy <- 1
180
-			}
181
-			if (is.na(var.initial.monosomy)) {
182
-				var.initial.monosomy <- mean.initial.monosomy + 1
183
-			}
184
-			if (mean.initial.monosomy >= var.initial.monosomy) {
185
-				mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
186
-				var.initial <- (mean.initial.monosomy+1) * cumsum(dependent.states.mask)
187
-				size.initial <- rep(0,numstates)
188
-				prob.initial <- rep(0,numstates)
189
-				mask <- dependent.states.mask
190
-				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
191
-				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
192
-			} else {
193
-				mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
194
-				var.initial <- var.initial.monosomy * cumsum(dependent.states.mask)
195
-				size.initial <- rep(0,numstates)
196
-				prob.initial <- rep(0,numstates)
197
-				mask <- dependent.states.mask
198
-				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
199
-				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
200
-			}
201
-		}
202
-		# Assign initials for the nullsomy distribution
203