
added findSCEs

chakalakka authored on 28/04/2015 09:51:45
Showing 19 changed files

... ...
@@ -1,6 +1,7 @@
1 1
 # Generated by roxygen2 (4.1.0): do not edit by hand
2 2
 
3 3
 S3method(plot,GRanges)
4
+S3method(plot,aneuBiHMM)
4 5
 S3method(plot,aneuHMM)
5 6
 S3method(plot,character)
6 7
 export(bam2binned)
... ...
@@ -8,7 +9,9 @@ export(bed2binned)
8 9
 export(bedGraph2binned)
9 10
 export(exportCNVs)
10 11
 export(exportReadCounts)
12
+export(filterSegments)
11 13
 export(findCNVs)
14
+export(findSCEs)
12 15
 export(getChromLengths)
13 16
 export(heatmapAneuploidies)
14 17
 export(heatmapGenomeWide)
... ...
@@ -137,18 +137,8 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l
137 137
 		classes[c(1:3,6)] <- classes.in.bed[c(1:3,6)]
138 138
 		data <- read.table(file, colClasses=classes)
139 139
 		# Convert to GRanges object
140
-		data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]+1), strand=Rle(strand("*"), nrow(data)))	# +1 to match coordinate systems
141
-		tC <- tryCatch({
142
-			seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
143
-		}, warning = function(war) {
144
-			suppressWarnings(seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]))
145
-			offending.chroms <- unique(names(which(end(data) > seqlengths(data)[as.character(seqnames(data))])))
146
-			if (war$message=="'ranges' contains values outside of sequence bounds. See ?trim to subset ranges.") {
147
-				warning(paste0("File \"",file,"\" contains reads outside of sequence bounds. The offending chromosomes were \"",paste(offending.chroms, collapse=', '),"\". Please consider using a different reference assembly."))
148
-			} else {
149
-				print(war)
150
-			}
151
-		})
140
+		data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data)))	# start+1 to go from [0,x) -> [1,x]
141
+		seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])
152 142
 		chroms.in.data <- seqlevels(data)
153 143
 	## BAM (1-based)
154 144
 	} else if (format == "bam") {
... ...
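A side note on the coordinate change above: BED intervals are 0-based and half-open, while GRanges uses 1-based closed intervals, so only the start coordinate needs the +1 shift; the old code also shifted the end, making every range one base too long. A minimal sketch (illustrative only, not part of the commit):

    library(GenomicRanges)
    # A hypothetical BED record "chr1 0 100" covers bases 1..100 in 1-based closed coordinates.
    bed <- data.frame(chrom='chr1', start=0, end=100)
    gr <- GRanges(seqnames=bed$chrom, ranges=IRanges(start=bed$start+1, end=bed$end))
    width(gr)  # 100, matching end-start of the BED record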
@@ -159,8 +149,11 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l
159 149
 			chromosomes <- chroms.in.data
160 150
 		}
161 151
 		chroms2use <- intersect(chromosomes, chroms.in.data)
162
-		gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use),
163
-																ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
152
+		if (length(chroms2use)==0) {
153
+			chrstring <- paste0(chromosomes, collapse=', ')
154
+			stop('The specified chromosomes ', chrstring, ' do not exist in the data.')
155
+		}
156
+		gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
164 157
 		if (calc.complexity || !remove.duplicate.reads) {
165 158
 			if (pairedEndReads) {
166 159
 				data <- GenomicAlignments::readGAlignmentPairsFromBam(file, index=index, param=ScanBamParam(which=range(gr)))
... ...
@@ -202,12 +195,12 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l
202 195
 	diff <- setdiff(chromosomes, chroms.in.data)
203 196
 	if (length(diff)>0) {
204 197
 		diffs <- paste0(diff, collapse=', ')
205
-		warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data'))
198
+		warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
206 199
 	}
207 200
 	diff <- setdiff(chromosomes, names(chrom.lengths))
208 201
 	if (length(diff)>0) {
209 202
 		diffs <- paste0(diff, collapse=', ')
210
-		warning(paste0('Not using chromosomes ', diffs, ' because no lengths could be found'))
203
+		warning(paste0('Not using chromosomes ', diffs, ' because no lengths could be found.'))
211 204
 	}
212 205
 	chroms2use <- intersect(chromosomes, chroms.in.data)
213 206
 	chroms2use <- intersect(chroms2use, names(chrom.lengths))
... ...
@@ -282,11 +275,13 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l
282 275
 
283 276
 		## Complexity estimation with preseqR
284 277
 		complexity.preseqR.fit <- preseqR::preseqR.rfa.curve(multiplicity[['1']])
285
-		complexity.preseqR <- as.numeric(preseqR::preseqR.rfa.estimate(complexity.preseqR.fit$continued.fraction, 1e100) + total.reads.unique['1'])
286
-		if (!is.null(complexity.preseqR.fit)) {
287
-			complexity.preseqR.ggplt <- ggplot(as.data.frame(complexity.preseqR.fit$estimates)) + geom_line(aes_string(x='sample.size', y='yield.estimate')) + geom_point(data=df, aes_string(x='x', y='y'), size=5) + ggtitle('preseqR complexity estimation') + theme_bw() + xlab('total number of reads') + ylab('unique reads')
288
-		} else {
278
+		if (is.null(complexity.preseqR.fit)) {
279
+			complexity.preseqR <- NA
280
+			complexity.preseqR.ggplt <- NA
289 281
 			warning("Complexity estimation with preseqR failed.")
282
+		} else {
283
+			complexity.preseqR <- as.numeric(preseqR::preseqR.rfa.estimate(complexity.preseqR.fit$continued.fraction, 1e100) + total.reads.unique['1'])
284
+			complexity.preseqR.ggplt <- ggplot(as.data.frame(complexity.preseqR.fit$estimates)) + geom_line(aes_string(x='sample.size', y='yield.estimate')) + geom_point(data=df, aes_string(x='x', y='y'), size=5) + ggtitle('preseqR complexity estimation') + theme_bw() + xlab('total number of reads') + ylab('unique reads')
290 285
 		}
291 286
 
292 287
 		## Complexity estimation with Michaelis-Menten
... ...
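The reordering in this hunk guards against a NULL fit: preseqR::preseqR.rfa.curve() apparently returns NULL on failure (hence the is.null() check), and the old code extrapolated from the fit before testing for that. A minimal sketch of the guard-before-use pattern, reusing only the calls visible in this hunk (variable names as in the surrounding function):

    fit <- preseqR::preseqR.rfa.curve(multiplicity[['1']])
    if (is.null(fit)) {
      complexity <- NA  # fail gracefully instead of extrapolating from a NULL fit
    } else {
      complexity <- as.numeric(preseqR::preseqR.rfa.estimate(fit$continued.fraction, 1e100) + total.reads.unique['1'])
    }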
@@ -1,6 +1,6 @@
1 1
 #' Find copy number variations
2 2
 #'
3
-#' \code{findCNVs} classifies the input binned read counts into several states which represent copy-number-variation.
3
+#' \code{findCNVs} classifies the binned read counts into several states which represent copy-number variation.
4 4
 #'
5 5
 #' \code{findCNVs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission density (only zero read counts), 'monosomy', 'disomy', 'trisomy', 'tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper
6 6
 #' @author Aaron Taudt
... ...
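The paragraph above specifies the emission model: a point mass at zero for 'nullsomy' and negative binomial densities for the remaining states, with means scaling with copy number (compare the mean.initial initialisation with cumsum() in the 'standard' branch further down). A rough, purely illustrative sketch of such emission densities, using a made-up disomy mean of 50 reads per bin and a made-up dispersion:

    mu <- 50/2 * 1:5                    # monosomy ... multisomy means, multiples of half the disomy mean
    size <- 10                          # hypothetical dispersion
    prob <- size/(size + mu)            # dnbinom parameterisation: mean = size*(1-prob)/prob
    x <- 0:150
    dens.nullsomy <- as.numeric(x == 0)                   # delta function: only zero read counts
    dens.disomy   <- dnbinom(x, size=size, prob=prob[2])
    dens.trisomy  <- dnbinom(x, size=size, prob=prob[3])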
@@ -42,7 +42,7 @@ findCNVs <- function(binned.data, ID, method='univariate', eps=0.001, init="stan
42 42
 
43 43
 #' Find copy number variations (univariate)
44 44
 #'
45
-#' \code{findCNVs} classifies the input binned read counts into several states which represent copy-number-variation.
45
+#' \code{findCNVs} classifies the binned read counts into several states which represent copy-number variation.
46 46
 #'
47 47
 #' @param binned.data A \link{GRanges} object with binned read counts.
48 48
 #' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
... ...
@@ -149,10 +149,9 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
149 149
 			A.initial <- matrix(runif(numstates^2), ncol=numstates)
150 150
 			A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")			
151 151
 			proba.initial <- runif(numstates)
152
-			size.initial <- runif(1, min=0, max=100) * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F))
153
-			prob.initial <- runif(1) * ifelse(state.distributions=='dnbinom', T, F)
154
-			prob.initial[state.distributions=='dgeom'] <- runif(1)
155
-			lambda.initial <- runif(1, min=0, max=100) * cumsum(ifelse(state.distributions=='dpois', T, F))
152
+			# Distributions for dependent states
153
+			size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask)
154
+			prob.initial <- runif(1) * dependent.states.mask
156 155
 		} else if (init == 'standard') {
157 156
 			A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
158 157
 			for (irow in 1:numstates) {
... ...
@@ -162,18 +161,13 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
162 161
 				}
163 162
 			}
164 163
 			proba.initial <- rep(1/numstates, numstates)
165
-			mean.initial <- mean(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F))
166
-			var.initial <- var(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F))
164
+			mean.initial <- mean(reads[reads>0])/2 * cumsum(dependent.states.mask)
165
+			var.initial <- var(reads[reads>0])/2 * cumsum(dependent.states.mask)
167 166
 			size.initial <- rep(0,numstates)
168 167
 			prob.initial <- rep(0,numstates)
169
-			mask <- state.distributions=='dnbinom'
168
+			mask <- dependent.states.mask
170 169
 			size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
171 170
 			prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
172
-			mask <- state.distributions=='dbinom'
173
-			size.initial[mask] <- dbinom.size(mean.initial[mask], var.initial[mask])
174
-			prob.initial[mask] <- dbinom.prob(mean.initial[mask], var.initial[mask])
175
-			prob.initial[state.distributions=='dgeom'] <- 0.9
176
-			lambda.initial <- mean(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dpois', T, F))
177 171
 		}
178 172
 	
179 173
 		hmm <- .C("R_univariate_hmm",
... ...
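dnbinom.size() and dnbinom.prob() are package-internal helpers not shown in this diff. Assuming they do standard method-of-moments matching for R's dnbinom parameterisation (mean = size*(1-prob)/prob, variance = size*(1-prob)/prob^2), the conversion amounts to the following sketch (helper names here are hypothetical):

    nb.size.guess <- function(mean, variance) mean^2 / (variance - mean)
    nb.prob.guess <- function(mean, variance) mean / variance
    # e.g. a state initialised with mean 50 and variance 200 reads per bin:
    nb.size.guess(50, 200)  # 16.67
    nb.prob.guess(50, 200)  # 0.25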
@@ -183,7 +177,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
183 177
 			state.labels = as.integer(state.labels), # int* state_labels
184 178
 			size = double(length=numstates), # double* size
185 179
 			prob = double(length=numstates), # double* prob
186
-			lambda = double(length=numstates), # double* lambda
187 180
 			num.iterations = as.integer(max.iter), #  int* maxiter
188 181
 			time.sec = as.integer(max.time), # double* maxtime
189 182
 			loglik.delta = as.double(eps.try), # double* eps
... ...
@@ -195,7 +188,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
195 188
 			distr.type = as.integer(state.distributions), # int* distr_type
196 189
 			size.initial = as.vector(size.initial), # double* initial_size
197 190
 			prob.initial = as.vector(prob.initial), # double* initial_prob
198
-			lambda.initial = as.vector(lambda.initial), # double* initial_lambda
199 191
 			A.initial = as.vector(A.initial), # double* initial_A
200 192
 			proba.initial = as.vector(proba.initial), # double* initial_proba
201 193
 			use.initial.params = as.logical(1), # bool* use_initial_params
... ...
@@ -242,7 +234,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
242 234
 			state.labels = as.integer(state.labels), # int* state_labels
243 235
 			size = double(length=numstates), # double* size
244 236
 			prob = double(length=numstates), # double* prob
245
-			lambda = double(length=numstates), # double* lambda
246 237
 			num.iterations = as.integer(max.iter), #  int* maxiter
247 238
 			time.sec = as.integer(max.time), # double* maxtime
248 239
 			loglik.delta = as.double(eps), # double* eps
... ...
@@ -254,7 +245,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
254 245
 			distr.type = as.integer(state.distributions), # int* distr_type
255 246
 			size.initial = as.vector(hmm$size), # double* initial_size
256 247
 			prob.initial = as.vector(hmm$prob), # double* initial_prob
257
-			lambda.initial = as.vector(hmm$lambda), # double* initial_lambda
258 248
 			A.initial = as.vector(hmm$A), # double* initial_A
259 249
 			proba.initial = as.vector(hmm$proba), # double* initial_proba
260 250
 			use.initial.params = as.logical(1), # bool* use_initial_params
... ...
@@ -310,20 +300,17 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max
310 300
 				for (idistr in 1:length(hmm$distr.type)) {
311 301
 					distr <- levels(state.distributions)[hmm$distr.type[idistr]]
312 302
 					if (distr == 'dnbinom') {
313
-						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
314
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
303
+						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
304
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
315 305
 					} else if (distr == 'dgeom') {
316
-						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], lambda=NA, mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
317
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], lambda=NA, mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
306
+						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
307
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
318 308
 					} else if (distr == 'delta') {
319
-						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0))
320
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0))
321
-					} else if (distr == 'dpois') {
322
-						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda[idistr], mu=hmm$lambda[idistr], variance=hmm$lambda[idistr]))
323
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda.initial[idistr], mu=hmm$lambda.initial[idistr], variance=hmm$lambda.initial[idistr]))
309
+						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
310
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
324 311
 					} else if (distr == 'dbinom') {
325
-						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
326
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
312
+						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
313
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
327 314
 					}
328 315
 				}
329 316
 				rownames(distributions) <- state.labels
... ...
@@ -426,7 +413,7 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.
426 413
 
427 414
 		## Pre-compute z-values for each number of reads
428 415
 		message("Computing pre z-matrix...", appendLF=F)
429
-		z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states))
416
+		z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states), dimnames=list(reads=0:maxreads, model=c('+','-'), state=state.labels))
430 417
 		xreads <- 0:maxreads
431 418
 		for (istrand in 1:num.models) {
432 419
 			model <- models[[istrand]]
... ...
@@ -437,6 +424,9 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.
437 424
 					u <- pnbinom(xreads, size, prob)
438 425
 				} else if (model$distributions[istate,'type']=='delta') {
439 426
 					u <- rep(1, length(xreads))
427
+				} else if (model$distributions[istate,'type']=='dgeom') {
428
+					prob <- model$distributions[istate,'prob']
429
+					u <- pgeom(xreads, prob)
440 430
 				}
441 431
 				qnorm_u <- qnorm(u)
442 432
 				mask <- qnorm_u==Inf
... ...
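The pre-z-matrix built above is a probability integral transform: each read count is pushed through its state-specific CDF and then through qnorm(), so the two strands can later be coupled through a per-state correlation matrix; u = 1 would map to qnorm(u) = Inf, hence the clamping. A stand-alone sketch with made-up parameters (the real size/prob come from the univariate fits):

    size <- 10; prob <- 0.2
    xreads <- 0:50
    u <- pnbinom(xreads, size, prob)   # CDF of the emission distribution
    z <- qnorm(u)                      # standard normal quantiles
    z[z == Inf] <- qnorm(1 - 1e-16)    # clamp the u == 1 case, as in the code above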
@@ -478,7 +468,7 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.
478 468
 				correlationMatrix[,,state] <- cor(z.temp)
479 469
 				determinant[state] <- det( correlationMatrix[,,state] )
480 470
 				correlationMatrixInverse[,,state] <- solve(correlationMatrix[,,state])
481
-				if (length(z.temp)>0) {
471
+				if (!any(is.na(correlationMatrixInverse[,,state]))) {
482 472
 					usestateTF[state] <- TRUE
483 473
 				} else {
484 474
 					usestateTF[state] <- FALSE
485 475
new file mode 100644
... ...
@@ -0,0 +1,792 @@
1
+#' Find sister chromatid exchanges
2
+#'
3
+#' \code{findSCEs} classifies the binned read counts into several states which represent the number of chromatids on each strand.
4
+#'
5
+#' \code{findSCEs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission density (only zero read counts), 'monosomy', 'disomy', 'trisomy', 'tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper
6
+#' @author Aaron Taudt
7
+#' @inheritParams univariate.findSCEs
8
+#' @return An \code{\link{aneuBiHMM}} object.
9
+#' @examples
10
+#'## Get an example BAM file with single-cell-sequencing reads
11
+#'bamfile <- system.file("extdata/BB140820_I_002.bam", package="aneufinder")
12
+#'## Bin the BAM file into bin size 200000bp
13
+#'binned.data <- bam2binned(bamfile, binsize=200000, chromosomes=c(1:22,'X','Y'), GC.correction=FALSE,
14
+#'                          save.as.RData=FALSE)
15
+#'## Fit the Hidden Markov Model
16
+#'model <- findSCEs(binned.data, ID=basename(bamfile), eps=0.1, max.time=60)
17
+#'## Check the fit
18
+#'plot(model, type='histogram')
19
+#' @export
20
+findSCEs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=10, eps.try=10*eps, num.threads=1, read.cutoff.quantile=0.999, GC.correction=TRUE, strand='*', allow.odd.states=FALSE) {
21
+
22
+	call <- match.call()
23
+	underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='')
24
+	message("\n",call[[1]],"():")
25
+	message(underline)
26
+	ptm <- proc.time()
27
+	message("Find SCEs for ID = ",ID, ":")
28
+
29
+	model <- bivariate.findSCEs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction, allow.odd.states=allow.odd.states)
30
+
31
+	attr(model, 'call') <- call
32
+	time <- proc.time() - ptm
33
+	message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s")
34
+	return(model)
35
+
36
+}
37
+
38
+
39
+#' Find sister chromatid exchanges (univariate)
40
+#'
41
+#' \code{findSCEs} classifies the binned read counts into several states which represent the number of chromatids on the respective strand.
42
+#'
43
+#' @param binned.data A \link{GRanges} object with binned read counts.
44
+#' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
45
+#' @param eps Convergence threshold for the Baum-Welch algorithm.
46
+#' @param init One of the following initialization procedures:
47
+#'	\describe{
48
+#'		\item{\code{standard}}{The negative binomial of state 'disomy' will be initialized with \code{mean=mean(reads)}, \code{var=var(reads)}. This procedure usually gives good convergence.}
49
+#'		\item{\code{random}}{Mean and variance of the negative binomial of state 'disomy' will be initialized with random values (in certain boundaries, see source code). Try this if the \code{standard} procedure fails to produce a good fit.}
50
+#'	}
51
+#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. The default -1 is no limit.
52
+#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. The default -1 is no limit.
53
+#' @param num.trials The number of trials to find a fit where state 'disomy' is most frequent. Each time, the HMM is seeded with different random initial values.
54
+#' @param eps.try If \code{num.trials} is set to a value greater than 1, \code{eps.try} is used for the trial runs. If unset, \code{eps} is used.
55
+#' @param num.threads Number of threads to use. Setting this to >1 may give increased performance.
56
+#' @param read.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. Set \code{read.cutoff.quantile=1} in this case.
57
+#' @param GC.correction Either \code{TRUE} or \code{FALSE}. If \code{GC.correction=TRUE}, the GC corrected reads have to be present in the input \code{binned.data}, otherwise a warning is thrown and no GC correction is done.
58
+#' @param strand Run the HMM only for the specified strand. One of \code{c('+', '-', '*')}.
59
+#' @return An \code{\link{aneuHMM}} object.
60
+univariate.findSCEs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff.quantile=0.999, GC.correction=TRUE, strand='*') {
61
+
62
+	### Define cleanup behaviour ###
63
+	on.exit(.C("R_univariate_cleanup"))
64
+
65
+	## Intercept user input
66
+	IDcheck <- ID  #trigger error if not defined
67
+	if (class(binned.data) != 'GRanges') {
68
+		binned.data <- get(load(binned.data))
69
+		if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'reads' or a file that contains such an object")
70
+	}
71
+	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
72
+	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
73
+	if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
74
+	if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer")
75
+	if (!is.null(eps.try)) {
76
+		if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric")
77
+	}
78
+	if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer")
79
+	if (check.logical(GC.correction)!=0) stop("argument 'GC.correction' expects a logical (TRUE or FALSE)")
80
+	if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'")
81
+
82
+	warlist <- list()
83
+	if (is.null(eps.try)) eps.try <- eps
84
+
85
+	## Assign variables
86
+# 	state.labels.SCE # assigned in global.R
87
+	numstates <- length(state.labels.SCE)
88
+	numbins <- length(binned.data)
89
+	iniproc <- which(init==c("standard","random")) # transform to int
90
+	if (strand=='+') {
91
+		select <- 'preads'
92
+	} else if (strand=='-') {
93
+		select <- 'mreads'
94
+	} else if (strand=='*') {
95
+		select <- 'reads'
96
+	}
97
+	if (GC.correction) {
98
+		if (!(paste0(select,'.gc') %in% names(mcols(binned.data)))) {
99
+			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction."))
100
+		} else if (any(is.na(mcols(binned.data)[,paste0(select,'.gc')]))) {
101
+			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because there are NAs. GC-correction may not be reliable. Continuing without GC-correction."))
102
+		} else if (any(mcols(binned.data)[,paste0(select,'.gc')]<0)) {
103
+			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because there are negative values. GC-correction is not reliable. Continuing without GC-correction."))
104
+		} else {
105
+			select <- paste0(select,'.gc')
106
+		}
107
+	}
108
+	reads <- mcols(binned.data)[,select]
109
+
110
+	## Make return object
111
+	result <- list()
112
+	class(result) <- class.univariate.hmm
113
+	result$ID <- ID
114
+	result$bins <- binned.data
115
+
116
+	# Check if there are reads in the data, otherwise HMM will blow up
117
+	if (!any(reads!=0)) {
118
+		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All reads in data are zero. No HMM done."))
119
+		result$warnings <- warlist
120
+		return(result)
121
+	} else if (any(reads<0)) {
122
+		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some reads in data are negative. No HMM done."))
123
+		result$warnings <- warlist
124
+		return(result)
125
+	}
126
+		
127
+
128
+	# Filter high reads out, makes HMM faster
129
+	read.cutoff <- quantile(reads, read.cutoff.quantile)
130
+	names.read.cutoff <- names(read.cutoff)
131
+	read.cutoff <- ceiling(read.cutoff)
132
+	mask <- reads > read.cutoff
133
+	reads[mask] <- read.cutoff
134
+	numfiltered <- length(which(mask))
135
+	if (numfiltered > 0) {
136
+		message(paste0("Replaced read counts > ",read.cutoff," (",names.read.cutoff," quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n"))
137
+	}
138
+	
139
+	## Call univariate in a for loop to enable multiple trials
140
+	modellist <- list()
141
+	for (i_try in 1:num.trials) {
142
+		message(paste0("Trial ",i_try," / ",num.trials))
143
+
144
+		## Initial parameters
145
+		if (init == 'random') {
146
+			A.initial <- matrix(runif(numstates^2), ncol=numstates)
147
+			A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")			
148
+			proba.initial <- runif(numstates)
149
+			# Distributions for dependent states
150
+			size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask.SCE)
151
+			prob.initial <- runif(1) * dependent.states.mask.SCE
152
+		} else if (init == 'standard') {
153
+			A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
154
+			for (irow in 1:numstates) {
155
+				for (icol in 1:numstates) {
156
+					if (irow==icol) { A.initial[irow,icol] <- 0.9 }
157
+					else { A.initial[irow,icol] <- 0.1/(numstates-1) }
158
+				}
159
+			}
160
+			proba.initial <- rep(1/numstates, numstates)
161
+			mean.initial <- mean(reads[reads>0]) * cumsum(dependent.states.mask.SCE)
162
+			var.initial <- var(reads[reads>0]) * cumsum(dependent.states.mask.SCE)
163
+			size.initial <- rep(0,numstates)
164
+			prob.initial <- rep(0,numstates)
165
+			mask <- dependent.states.mask.SCE
166
+			size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
167
+			prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
168
+		}
169
+		# Assign initials for the nullsomy distribution
170
+		size.initial[2] <- 1
171
+		prob.initial[2] <- 0.5
172
+	
173
+		hmm <- .C("R_univariate_hmm",
174
+			reads = as.integer(reads), # int* O
175
+			num.bins = as.integer(numbins), # int* T
176
+			num.states = as.integer(numstates), # int* N
177
+			state.labels.SCE = as.integer(state.labels.SCE), # int* state.labels.SCE
178
+			size = double(length=numstates), # double* size
179
+			prob = double(length=numstates), # double* prob
180
+			num.iterations = as.integer(max.iter), #  int* maxiter
181
+			time.sec = as.integer(max.time), # double* maxtime
182
+			loglik.delta = as.double(eps.try), # double* eps
183
+			states = integer(length=numbins), # int* states
184
+			A = double(length=numstates*numstates), # double* A
185
+			proba = double(length=numstates), # double* proba
186
+			loglik = double(length=1), # double* loglik
187
+			weights = double(length=numstates), # double* weights
188
+			distr.type = as.integer(state.distributions.SCE), # int* distr_type
189
+			size.initial = as.vector(size.initial), # double* initial_size
190
+			prob.initial = as.vector(prob.initial), # double* initial_prob
191
+			A.initial = as.vector(A.initial), # double* initial_A
192
+			proba.initial = as.vector(proba.initial), # double* initial_proba
193
+			use.initial.params = as.logical(1), # bool* use_initial_params
194
+			num.threads = as.integer(num.threads), # int* num_threads
195
+			error = as.integer(0), # int* error (error handling)
196
+			read.cutoff = as.integer(read.cutoff) # int* read_cutoff
197
+		)
198
+
199
+		hmm$eps <- eps.try
200
+		if (num.trials > 1) {
201
+			if (hmm$loglik.delta > hmm$eps) {
202
+				warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n"))
203
+			}
204
+			# Store model in list
205
+			hmm$reads <- NULL
206
+			modellist[[i_try]] <- hmm
207
+			# Check if monosomic is more frequent than trisomic
208
+			imono <- which(state.labels.SCE=='monosomy')
209
+			itri <- which(state.labels.SCE=='trisomy')
210
+			if (hmm$weights[imono]>hmm$weights[itri]) {
211
+				break
212
+			}
213
+			init <- 'random'
214
+		} else if (num.trials == 1) {
215
+			if (hmm$loglik.delta > eps) {
216
+				warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
217
+			}
218
+		}
219
+	}
220
+
221
+	if (num.trials > 1) {
222
+
223
+		# Select fit with highest weight in state disomy
224
+		# Mathematically we should select the fit with the highest loglikelihood. If we think that fit is incorrect, we should change the underlying model. However, this is very complex, so we choose a fit that we think is (more) correct, although it does not have the highest support given our (imperfect) model.
225
+		istate <- which(state.labels.SCE=='disomy')
226
+		indexmax <- which.max(unlist(lapply(lapply(modellist,'[[','weights'), '[', istate)))
227
+		hmm <- modellist[[indexmax]]
228
+
229
+		# Rerun the HMM with different epsilon and initial parameters from trial run
230
+		message(paste0("Rerunning trial ",indexmax," with eps = ",eps))
231
+		hmm <- .C("R_univariate_hmm",
232
+			reads = as.integer(reads), # int* O
233
+			num.bins = as.integer(numbins), # int* T
234
+			num.states = as.integer(numstates), # int* N
235
+			state.labels.SCE = as.integer(state.labels.SCE), # int* state.labels.SCE
236
+			size = double(length=numstates), # double* size
237
+			prob = double(length=numstates), # double* prob
238
+			num.iterations = as.integer(max.iter), #  int* maxiter
239
+			time.sec = as.integer(max.time), # double* maxtime
240
+			loglik.delta = as.double(eps), # double* eps
241
+			states = integer(length=numbins), # int* states
242
+			A = double(length=numstates*numstates), # double* A
243
+			proba = double(length=numstates), # double* proba
244
+			loglik = double(length=1), # double* loglik
245
+			weights = double(length=numstates), # double* weights
246
+			distr.type = as.integer(state.distributions.SCE), # int* distr_type
247
+			size.initial = as.vector(hmm$size), # double* initial_size
248
+			prob.initial = as.vector(hmm$prob), # double* initial_prob
249
+			A.initial = as.vector(hmm$A), # double* initial_A
250
+			proba.initial = as.vector(hmm$proba), # double* initial_proba
251
+			use.initial.params = as.logical(1), # bool* use_initial_params
252
+			num.threads = as.integer(num.threads), # int* num_threads
253
+			error = as.integer(0), # int* error (error handling)
254
+			read.cutoff = as.integer(read.cutoff) # int* read_cutoff
255
+		)
256
+	}
257
+
258
+	### Make return object ###
259
+	## Check for errors
260
+		if (hmm$error == 0) {
261
+		## Bin coordinates and states ###
262
+			result$bins$state <- state.labels.SCE[hmm$states]
263
+		## Segmentation
264
+			message("Making segmentation ...", appendLF=F)
265
+			ptm <- proc.time()
266
+			gr <- result$bins
267
+			red.gr.list <- GRangesList()
268
+			for (state in state.labels.SCE) {
269
+				red.gr <- GenomicRanges::reduce(gr[gr$state==state])
270
+				mcols(red.gr)$state <- rep(factor(state, levels=levels(state.labels.SCE)),length(red.gr))
271
+				if (length(red.gr) > 0) {
272
+					red.gr.list[[length(red.gr.list)+1]] <- red.gr
273
+				}
274
+			}
275
+			red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list))
276
+			result$segments <- red.gr
277
+			seqlengths(result$segments) <- seqlengths(binned.data)
278
+			time <- proc.time() - ptm
279
+			message(" ",round(time[3],2),"s")
280
+		## Parameters
281
+			# Weights
282
+			result$weights <- hmm$weights
283
+			names(result$weights) <- state.labels.SCE
284
+			# Transition matrices
285
+			transitionProbs <- matrix(hmm$A, ncol=hmm$num.states)
286
+			rownames(transitionProbs) <- state.labels.SCE
287
+			colnames(transitionProbs) <- state.labels.SCE
288
+			result$transitionProbs <- transitionProbs
289
+			transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states)
290
+			rownames(transitionProbs.initial) <- state.labels.SCE
291
+			colnames(transitionProbs.initial) <- state.labels.SCE
292
+			result$transitionProbs.initial <- transitionProbs.initial
293
+			# Initial probs
294
+			result$startProbs <- hmm$proba
295
+			names(result$startProbs) <- paste0("P(",state.labels.SCE,")")
296
+			result$startProbs.initial <- hmm$proba.initial
297
+			names(result$startProbs.initial) <- paste0("P(",state.labels.SCE,")")
298
+			# Distributions
299
+				distributions <- data.frame()
300
+				distributions.initial <- data.frame()
301
+				for (idistr in 1:length(hmm$distr.type)) {
302
+					distr <- levels(state.distributions.SCE)[hmm$distr.type[idistr]]
303
+					if (distr == 'dnbinom') {
304
+						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
305
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
306
+					} else if (distr == 'dgeom') {
307
+						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
308
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
309
+					} else if (distr == 'delta') {
310
+						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
311
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
312
+					} else if (distr == 'dbinom') {
313
+						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
314
+						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
315
+					}
316
+				}
317
+				rownames(distributions) <- state.labels.SCE
318
+				rownames(distributions.initial) <- state.labels.SCE
319
+				result$distributions <- distributions
320
+				result$distributions.initial <- distributions.initial
321
+		## Convergence info
322
+			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error)
323
+			result$convergenceInfo <- convergenceInfo
324
+		## GC correction
325
+			result$GC.correction <- grepl('gc', select)
326
+		## Quality info
327
+			qualityInfo <- list(shannon.entropy=qc.entropy(reads), spikyness=qc.spikyness(reads), complexity=attr(result$bins, 'complexity.preseqR'))
328
+			result$qualityInfo <- qualityInfo
329
+		} else if (hmm$error == 1) {
330
+			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'read.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
331
+		} else if (hmm$error == 2) {
332
+			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
333
+		}
334
+
335
+	## Issue warnings
336
+	result$warnings <- warlist
337
+
338
+	## Return results
339
+	return(result)
340
+}
341
+
342
+
343
+bivariate.findSCEs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff.quantile=0.999, GC.correction=TRUE, allow.odd.states=FALSE) {
344
+
345
+	## Debugging
346
+# 	ID = 'test'
347
+# 	eps = 1
348
+# 	init='standard'
349
+# 	max.time=-1
350
+# 	max.iter=-1
351
+# 	num.trials=1
352
+# 	eps.try=NULL
353
+# 	num.threads=1
354
+# 	read.cutoff.quantile=0.999
355
+# 	GC.correction=FALSE
356
+
357
+	## Intercept user input
358
+	IDcheck <- ID  #trigger error if not defined
359
+	if (class(binned.data) != 'GRanges') {
360
+		binned.data <- get(load(binned.data))
361
+		if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'reads' or a file that contains such an object")
362
+	}
363
+	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
364
+	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
365
+	if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
366
+	if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer")
367
+	if (!is.null(eps.try)) {
368
+		if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric")
369
+	}
370
+	if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer")
371
+	if (check.logical(GC.correction)!=0) stop("argument 'GC.correction' expects a logical (TRUE or FALSE)")
372
+
373
+	war <- NULL
374
+	if (is.null(eps.try)) eps.try <- eps
375
+
376
+	### Stack the strands and run one univariate findSCEs
377
+	message("")
378
+	message(paste(rep('-',getOption('width')), collapse=''))
379
+	message("Running univariate")
380
+	binned.data.minus <- binned.data
381
+	strand(binned.data.minus) <- '-'
382
+	binned.data.minus$reads <- binned.data.minus$mreads
383
+	binned.data.minus$reads.gc <- binned.data.minus$mreads.gc
384
+	binned.data.plus <- binned.data
385
+	strand(binned.data.plus) <- '+'
386
+	binned.data.plus$reads <- binned.data.plus$preads
387
+	binned.data.plus$reads.gc <- binned.data.plus$preads.gc
388
+	binned.data.stacked <- c(binned.data.minus, binned.data.plus)
389
+	mask.attributes <- c(grep('complexity', names(attributes(binned.data)), value=T), 'spikyness', 'shannon.entropy')
390
+	attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes]
391
+
392
+	model.stacked <- univariate.findSCEs(binned.data.stacked, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction)
393
+	model.minus <- model.stacked
394
+	model.minus$bins <- model.minus$bins[strand(model.minus$bins)=='-']
395
+	model.minus$segments <- model.minus$segments[strand(model.minus$segments)=='-']
396
+	model.plus <- model.stacked
397
+	model.plus$bins <- model.plus$bins[strand(model.plus$bins)=='+']
398
+	model.plus$segments <- model.plus$segments[strand(model.plus$segments)=='+']
399
+
400
+	models <- list(minus=model.minus, plus=model.plus)
401
+
402
+
403
+# 	### Split into strands and run univariate findSCEs
404
+# 	message("")
405
+# 	message(paste(rep('-',getOption('width')), collapse=''))
406
+# 	message("Running '-'-strand")
407
+# 	minus.model <- univariate.findSCEs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction, strand='-')
408
+# 	message("")
409
+# 	message(paste(rep('-',getOption('width')), collapse=''))
410
+# 	message("Running '+'-strand")
411
+# 	plus.model <- univariate.findSCEs(binned.data, ID, eps=10*eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction, strand='+')
412
+# 	models <- list(plus=plus.model, minus=minus.model)
413
+
414
+	### Prepare the multivariate HMM
415
+	message("")
416
+	message(paste(rep('-',getOption('width')), collapse=''))
417
+	message("Preparing bivariate HMM\n")
418
+
419
+		## Extract reads and other stuff
420
+		distributions <- lapply(models, '[[', 'distributions')
421
+		weights <- lapply(models, '[[', 'weights')
422
+		num.uni.states <- length(models[[1]]$weights)
423
+		num.models <- length(models)
424
+		num.bins <- length(models[[1]]$bins)
425
+		comb.states <- vector()
426
+		levels.state <- levels(models[[1]]$bins$state)
427
+		for (i1 in 1:length(levels.state)) {
428
+			for (i2 in 1:length(levels.state)) {
429
+				comb.state <- paste(levels.state[i1], levels.state[i2])
430
+				if (allow.odd.states) {
431
+					comb.states[length(comb.states)+1] <- comb.state
432
+				} else {
433
+					state.multiplicity <- multiplicity.SCE[levels.state[i1]] + multiplicity.SCE[levels.state[i2]]
434
+					# Only even states and null-mono states (for sex chromosomes)
435
+					if (state.multiplicity %% 2 == 0 | state.multiplicity == 1) {
436
+						comb.states[length(comb.states)+1] <- comb.state
437
+					}
438
+				}
439
+			}
440
+		}
441
+		comb.states <- factor(comb.states, levels=comb.states)
442
+		states.list <- list()
443
+		for (model in models) { states.list[[length(states.list)+1]] <- model$bins$state }
444
+		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
445
+		num.comb.states <- length(comb.states)
446
+		select <- 'reads'
447
+		if (GC.correction) {
448
+			if (paste0(select,'.gc') %in% names(mcols(binned.data))) {
449
+				select <- paste0(select,'.gc')
450
+			} else {
451
+				warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction."))
452
+			}
453
+		}
454
+		reads <- matrix(c(mcols(models$minus$bins)[,paste0('m',select)], mcols(models$plus$bins)[,paste0('p',select)]), ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models)))
455
+		maxreads <- max(reads)
456
+
457
+		## Pre-compute z-values for each number of reads
458
+		message("Computing pre z-matrix...", appendLF=F)
459
+		z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states), dimnames=list(reads=0:maxreads, strand=names(models), state=levels(models[[1]]$bins$state)))
460
+		xreads <- 0:maxreads
461
+		for (istrand in 1:num.models) {
462
+			model <- models[[istrand]]
463
+			for (istate in 1:num.uni.states) {
464
+				if (model$distributions[istate,'type']=='dnbinom') {
465
+					size <- model$distributions[istate,'size']
466
+					prob <- model$distributions[istate,'prob']
467
+					u <- pnbinom(xreads, size, prob)
468
+				} else if (model$distributions[istate,'type']=='delta') {
469
+					u <- rep(1, length(xreads))
470
+				} else if (model$distributions[istate,'type']=='dgeom') {
471
+					prob <- model$distributions[istate,'prob']
472
+					u <- pgeom(xreads, prob)
473
+				}
474
+				qnorm_u <- qnorm(u)
475
+				mask <- qnorm_u==Inf
476
+				qnorm_u[mask] <- qnorm(1-1e-16)
477
+				z.per.read[ , istrand, istate] <- qnorm_u
478
+			}
479
+		}
480
+		message(" done")
481
+
482
+		## Compute the z matrix
483
+		message("Transferring values into z-matrix...", appendLF=F)
484
+		z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(models), state=levels(models[[1]]$bins$state)))
485
+		for (istrand in 1:num.models) {
486
+			model <- models[[istrand]]
487
+			for (istate in 1:num.uni.states) {
488
+				z.per.bin[ , istrand, istate] = z.per.read[reads[,istrand]+1, istrand, istate]
489
+			}
490
+		}
491
+		remove(z.per.read)
492
+		message(" done")
493
+
494
+		## Calculate correlation matrix
495
+		message("Computing inverse of correlation matrix...", appendLF=F)
496
+		correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(models), strand=names(models), comb.state=comb.states))
497
+		correlationMatrixInverse <- correlationMatrix
498
+		determinant <- rep(0, num.comb.states)
499
+		names(determinant) <- comb.states
500
+		usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states
501
+		names(usestateTF) <- comb.states
502
+		for (comb.state in comb.states) {
503
+			state <- strsplit(comb.state, ' ')[[1]]
504
+			mask <- which(comb.states.per.bin==comb.state)
505
+			# Subselect z
506
+			z.temp <- matrix(NA, ncol=num.models, nrow=length(mask))
507
+			for (istrand in 1:num.models) {
508
+				z.temp[,istrand] <- z.per.bin[mask, istrand, state[istrand]]
509
+			}
510
+			temp <- tryCatch({
511
+				if (nrow(z.temp) > 1) {
512
+					correlationMatrix[,,comb.state] <- cor(z.temp)
513
+					determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
514
+					correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
515
+					usestateTF[comb.state] <- TRUE
516
+				} else {
517
+					correlationMatrix[,,comb.state] <- diag(num.models)
518
+					determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
519
+					correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
520
+					usestateTF[comb.state] <- TRUE
521
+				}
522
+			}, warning = function(war) {
523
+				correlationMatrix[,,comb.state] <<- diag(num.models)
524
+				determinant[comb.state] <<- det( correlationMatrix[,,comb.state] )
525
+				correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state])
526
+				usestateTF[comb.state] <<- TRUE
527
+				war
528
+			}, error = function(err) {
529
+				correlationMatrix[,,comb.state] <<- diag(num.models)
530
+				determinant[comb.state] <<- det( correlationMatrix[,,comb.state] )
531
+				correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state])
532
+				usestateTF[comb.state] <<- TRUE
533
+				err
534
+			})
535
+		}
536
+		message(" done")
537
+
538
+		# Use only states with valid correlationMatrixInverse
539
+		correlationMatrix = correlationMatrix[,,usestateTF]
540
+		correlationMatrixInverse = correlationMatrixInverse[,,usestateTF]
541
+		comb.states = comb.states[usestateTF]
542
+		comb.states <- droplevels(comb.states)
543
+		determinant = determinant[usestateTF]
544
+		num.comb.states <- length(comb.states)
545
+
546
+		## Calculate multivariate densities for each state
547
+		message("Calculating multivariate densities...", appendLF=F)
548
+		densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states))
549
+		for (comb.state in comb.states) {
550
+			istate <- which(comb.state==comb.states)
551
+			state <- strsplit(comb.state, ' ')[[1]]
552
+			z.temp <- matrix(NA, ncol=num.models, nrow=num.bins)
553
+			product <- 1
554
+			for (istrand in 1:num.models) {
555
+				z.temp[,istrand] <- z.per.bin[, istrand, state[istrand]]
556
+				if (models[[istrand]]$distributions[state[istrand],'type'] == 'dnbinom') {
557
+					size <- models[[istrand]]$distributions[state[istrand],'size']
558
+					prob <- models[[istrand]]$distributions[state[istrand],'prob']
559
+					product <- product * dnbinom(reads[,istrand], size, prob)
560
+				} else if (models[[istrand]]$distributions[state[istrand],'type'] == 'dgeom') {
561
+					prob <- models[[istrand]]$distributions[state[istrand],'prob']
562
+					product <- product * dgeom(reads[,istrand], prob)
563
+				} else if (models[[istrand]]$distributions[state[istrand],'type'] == 'delta') {
564
+					product <- product * ifelse(reads[,istrand]==0, 1, 0)
565
+				}
566
+			}
567
+			exponent <- -0.5 * apply( ( z.temp %*% (correlationMatrixInverse[ , , istate] - diag(num.models)) ) * z.temp, 1, sum)
568
+			densities[,istate] <- product * determinant[istate]^(-0.5) * exp( exponent )
569
+		}
570
+		# Check if densities are > 1
571
+# 		if (any(densities>1)) stop("Densities > 1")
572
+# 		if (any(densities<0)) stop("Densities < 0")
573
+		densities[densities>1] <- 1
574
+		densities[densities<0] <- 0
575
+		# Check if densities are 0 everywhere in some bins
576
+		check <- which(apply(densities, 1, sum) == 0)
577
+		if (length(check)>0) {
578
+			if (check[1]==1) {
579
+				densities[1,] <- rep(1e-10, ncol(densities))
580
+				check <- check[-1]
581
+			}
582
+			for (icheck in check) {
583
+				densities[icheck,] <- densities[icheck-1,]
584
+			}
585
+		}
586
+		message(" done\n", appendLF=F)
587
+		
588
+	### Define cleanup behaviour ###
589
+	on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states)))
590
+
591
+	### Run the multivariate HMM
592
+	# Call the C function
593
+	use.initial <- FALSE
594
+	hmm <- .C("R_multivariate_hmm",
595
+		densities = as.double(densities), # double* D
596
+		num.bins = as.integer(num.bins), # int* T
597
+		num.comb.states = as.integer(num.comb.states), # int* N
598
+		num.strands = as.integer(num.models), # int* Nmod
599
+		comb.states = as.integer(comb.states), # int* comb_states
600
+		num.iterations = as.integer(max.iter), # int* maxiter
601
+		time.sec = as.integer(max.time), # double* maxtime
602
+		loglik.delta = as.double(eps), # double* eps
603
+		states = integer(length=num.bins), # int* states
604
+		A = double(length=num.comb.states*num.comb.states), # double* A
605
+		proba = double(length=num.comb.states), # double* proba
606
+		loglik = double(length=1), # double* loglik
607
+		A.initial = double(length=num.comb.states*num.comb.states), # double* initial_A
608
+		proba.initial = double(length=num.comb.states), # double* initial_proba
609
+		use.initial.params = as.logical(use.initial), # bool* use_initial_params
610
+		num.threads = as.integer(num.threads), # int* num_threads
611
+		error = as.integer(0) # error handling
612
+		)
613
+			
614
+	### Check convergence ###
615
+	war <- NULL
616
+	if (hmm$loglik.delta > eps) {
617
+		war <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
618
+	}
619
+	if (hmm$error == 1) {
620
+		warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'read.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
621
+	} else if (hmm$error == 2) {
622
+		warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
623
+	}
624
+
625
+	### Make return object ###
626
+		result <- list()
627
+		result$ID <- ID
628
+	## Bin coordinates and states
629
+		result$bins <- binned.data
630
+		result$bins$state <- comb.states[hmm$states]
631
+		# Get states as factors in data.frame
632
+		matrix.states <- matrix(unlist(strsplit(as.character(result$bins$state), split=' ')), byrow=T, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models)))
633
+		names <- c('mstate','pstate')
634
+		for (i1 in 1:num.models) {
635
+			mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=levels(models[[i1]]$bins$state))
636
+		}
637
+	## Segmentation
638
+		message("Making segmentation ...", appendLF=F)
639
+		ptm <- proc.time()
640
+		gr <- result$bins
641
+		red.gr.list <- GRangesList()
642
+		for (state in comb.states) {
643
+			red.gr <- GenomicRanges::reduce(gr[gr$state==state])
644
+			mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr))
645
+			red.gr.list[[length(red.gr.list)+1]] <- red.gr
646
+		}
647
+		red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list))
648
+		result$segments <- red.gr
649
+		seqlengths(result$segments) <- seqlengths(result$bins)
650
+		time <- proc.time() - ptm
651
+		message(" ",round(time[3],2),"s")
652
+		## Parameters
653
+			# Weights
654
+			tstates <- table(result$bins$state)
655
+			result$weights <- tstates/sum(tstates)
656
+			result$weights.univariate <- weights
657
+			# Transition matrices
658
+			result$transitionProbs <- matrix(hmm$A, ncol=num.comb.states)
659
+			colnames(result$transitionProbs) <- comb.states
660
+			rownames(result$transitionProbs) <- comb.states
661
+			result$transitionProbs.initial <- matrix(hmm$A.initial, ncol=num.comb.states)
662
+			colnames(result$transitionProbs.initial) <- comb.states
663
+			rownames(result$transitionProbs.initial) <- comb.states
664
+			# Initial probs
665
+			result$startProbs <- hmm$proba
666
+			names(result$startProbs) <- paste0("P(",comb.states,")")
667
+			result$startProbs.initial <- hmm$proba.initial
668
+			names(result$startProbs.initial) <- paste0("P(",comb.states,")")
669
+			# Distributions
670
+			result$distributions <- distributions
671
+			names(result$distributions) <- names(models)
672
+		## Convergence info
673
+			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec)
674
+			result$convergenceInfo <- convergenceInfo
675
+		## Quality info
676
+			qualityInfo <- list(shannon.entropy=qc.entropy(reads), spikyness=qc.spikyness(reads), complexity=attr(result$bins, 'complexity.preseqR'))
677
+			result$qualityInfo <- qualityInfo
678
+		## Correlation matrices
679
+# 			result$correlation.matrix <- correlationMatrix2use
680
+		## Add class
681
+			class(result) <- class.bivariate.hmm
682
+
683
+	return(result)
684
+
685
+}
686
+
687
+
688
+#' Filter segments by minimal size
689
+#'
690
+#' \code{filterSegments} filters out segments below a specified minimal segment size. This can be useful to get rid of boundary effects from the Hidden Markov approach.
691
+#'
692
+#' @param model An \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} object.
693
+#' @param min.seg.width The minimum segment width in base-pairs. The default \code{NULL} will be twice the bin size of the given \code{model}.
694
+#' @return The input \code{model} with adjusted segments.
695
+#' @author Aaron Taudt
696
+#' @export
697
+filterSegments <- function(model, min.seg.width=NULL) {
698
+	
699
+	if (is.null(min.seg.width)) {
700
+		min.seg.width <- 2*width(model$bins)[1]
701
+	}
702
+
703
+	grl <- GRangesList()
704
+	for (chrom in seqlevels(model$segments)) {
705
+		gr <- model$segments[seqnames(model$segments)==chrom]
706
+		## At the beginning of each chromosome, assign the state of the following segment to segments smaller than min.seg.width
707
+		index <- which(width(gr) < min.seg.width)
708
+		if (length(index)==0) {
709
+			grl[[chrom]] <- gr
710
+			next
711
+		}
712
+		post.index <- index+1
713
+		# Filter consecutive indices
714
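+		# (runs of consecutive small segments all inherit the index of the first segment after the run)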
+		if (length(index)>1) {
715
+			for (i1 in (length(index)-1):1) {
716
+				if (index[i1]==index[i1+1]-1) {
717
+					post.index[i1] <- post.index[i1+1]
718
+				}
719
+			}
720
+		}
721
+		# Only beginning of chromosome
722
+		mask <- post.index==post.index[1]
723
+		post.index <- post.index[mask]
724
+		index <- index[mask]
725
+		if (index[1]==1) {
726
+			gr$state[index] <- gr$state[post.index]
727
+			## Redo segmentation
728
+			red.gr.list <- GRangesList()
729
+			for (state in levels(gr$state)) {
730
+				red.gr <- GenomicRanges::reduce(gr[gr$state==state])
731
+				mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr))
732
+				if (length(red.gr)>0) {
733
+					red.gr.list[[length(red.gr.list)+1]] <- red.gr
734
+				}
735
+			}
736
+			red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list))
737
+			gr <- red.gr
738
+			seqlengths(gr) <- seqlengths(model$bins)
739
+		}
740
+		## For the rest of each chromosome, assign the state of the preceding segment to segments smaller than min.seg.width
741
+		index <- which(width(gr) < min.seg.width)
742
+		prev.index <- index-1
743
+		# Filter consecutive indices
744
+		if (length(index)>1) {
745
+			for (i1 in 2:length(index)) {
746
+				if (index[i1]==index[i1-1]+1) {
747
+					prev.index[i1] <- prev.index[i1-1]
748
+				}
749
+			}
750
+		}
751
+		gr$state[index] <- gr$state[prev.index]
752
+
753
+		grl[[chrom]] <- gr
754
+	}
755
+		
756
+	## Redo segmentation
757
+	gr <- unlist(grl)
758
+	red.gr.list <- GRangesList()
759
+	for (state in levels(gr$state)) {
760
+		red.gr <- GenomicRanges::reduce(gr[gr$state==state])
761
+		mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr))
762
+		red.gr.list[[length(red.gr.list)+1]] <- red.gr
763
+	}
764
+	red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list))
765
+	model$segments <- red.gr
766
+	seqlengths(model$segments) <- seqlengths(model$bins)
767
+
768
+	return(model)
769
+}
770
+
771
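+# =================================================================
+# Extract SCE coordinates from a bivariate HMM
+# =================================================================
+# Candidate sister-chromatid-exchange (SCE) breakpoints are reported as the
+# segments at which the combined strand state switches between 'balanced'
+# (same state on both strands) and unbalanced, after discarding
+# zero-inflation, nullsomy and multisomy segments.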
+getSCEcoordinates <- function(model) {
772
+
773
+	segments <- model$segments
774
+	segments <- segments[segments$state != 'zero-inflation zero-inflation' & segments$state != 'nullsomy nullsomy' & !grepl('multisomy',segments$state)]
775
+	segments.split <- split(segments, seqnames(segments))
776
+	sce <- GRanges()
777
+	for (chrom in names(segments.split)) {
778
+		segments <- segments.split[[chrom]]
779
+		if (length(segments)>1) {
780
+			balanced.state <- segments$state %in% paste(state.labels.SCE, state.labels.SCE)
781
+			for (i1 in 2:length(balanced.state)) {
782
+				if(balanced.state[i1] != balanced.state[i1-1]) {
783
+					sce <- c(sce, segments[i1])
784
+				}
785
+			}
786
+		}
787
+	}
788
+	mcols(sce) <- NULL
789
+
790
+	return(sce)
791
+}
792
+
... ...
@@ -6,17 +6,27 @@ NULL
6 6
 # =======================================================
7 7
 # Some global variables that can be used in all functions
8 8
 # =======================================================
9
-state.labels <- factor(c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"))
10
-state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dpois','dbinom'))
11
-coordinate.names <- c("chrom","start","end")
12
-binned.data.names <- c(coordinate.names,"reads")
13 9
 class.univariate.hmm <- "aneuHMM"
14 10
 class.multivariate.hmm <- "aneuMultiHMM"
11
+class.bivariate.hmm <- "aneuBiHMM"
15 12
 class.hmm.list <- "aneufinder.hmm.list"
16
-state.colors <- c("mapped"="gray68","nullsomy"="gray90","null-mixed"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black")
13
+
14
+state.labels <- factor(c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"))
15
+dependent.states.mask <- state.labels %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy")
16
+state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dbinom'))
17
+state.colors <- c("mapped"="gray68","nullsomy"="gray90","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black")
17 18
 get.state.labels <- function() { return(state.labels) }
18 19
 get.state.colors <- function() { return(state.colors[as.character(state.labels)]) }
19 20
  
21
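+# State definitions, emission distributions and colors for the bivariate (SCE) HMM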
+state.labels.SCE <- factor(c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"))
22
+multiplicity.SCE <- c(0,0,1,2,3,4,5)
23
+names(multiplicity.SCE) <- state.labels.SCE
24
+dependent.states.mask.SCE <- state.labels.SCE %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy")
25
+state.distributions.SCE <- factor(c('delta','dgeom','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dbinom'))
26
+state.colors.SCE <- c("mapped"="gray68","zero-inflation"="gray90","nullsomy"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black")
27
+get.state.labels.SCE <- function() { return(state.labels.SCE) }
28
+get.state.colors.SCE <- function() { return(state.colors.SCE[as.character(state.labels.SCE)]) }
29
+
20 30
 # ============================================================================
21 31
 # Functions for a Negative Binomial to transform (mean,variance)<->(size,prob)
22 32
 # ============================================================================
... ...
@@ -65,6 +65,28 @@ plot.aneuHMM <- function(x, type='chromosomes', ...) {
65 65
 	}
66 66
 }
67 67
 
68
+#' Plotting function for \code{\link{aneuBiHMM}} objects
69
+#'
70
+#' Make different types of plots for \code{\link{aneuBiHMM}} objects.
71
+#'
72
+#' @param x An \code{\link{aneuBiHMM}} object.
73
+#' @param type Type of the plot, one of \code{c('chromosomes', 'histogram')}. You can also specify the type with an integer number.
74
+#' \describe{
75
+#'   \item{\code{chromosomes}}{An overview of all chromosomes with CNV state.}
76
+#'   \item{\code{histogram}}{A histogram of binned read counts with fitted mixture distribution.}
77
+#' }
78
+#' @param ... Additional arguments for the different plot types.
79
+#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object.
80
+#' @method plot aneuBiHMM
81
+#' @export
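+#' @examples
+#' ## Minimal usage sketch; assumes 'model' is an aneuBiHMM object,
+#' ## e.g. as returned by findSCEs().
+#' \dontrun{
+#' plot(model, type='chromosomes')
+#' plot(model, type='histogram')
+#' }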
82
+plot.aneuBiHMM <- function(x, type='chromosomes', ...) {
83
+	if (type == 'chromosomes' | type==1) {
84
+		plotChromosomes(x, both.strands=TRUE, ...)
85
+	} else if (type == 'histogram' | type==2) {
86
+		plotBivariateHistograms(x, ...)
87
+	}
88
+}
89
+
68 90
 # ============================================================
69 91
 # Plot a read histogram
70 92
 # ============================================================
... ...
@@ -75,7 +97,7 @@ plot.aneuHMM <- function(x, type='chromosomes', ...) {
75 97
 #' @param binned.data A \code{\link{binned.data}} object containing binned read counts in meta-column 'reads'.
76 98
 #' @param chromosome,start,end Plot the histogram only for the specified chromosome, start and end position.
77 99
 #' @return A \code{\link[ggplot2:ggplot]{ggplot}} object.
78
-plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, end=NULL) {
100
+plotBinnedDataHistogram <- function(binned.data, strand='*', chromosome=NULL, start=NULL, end=NULL) {
79 101
 
80 102
 	# -----------------------------------------
81 103
 	# Get right x limit
... ...
@@ -109,10 +131,17 @@ plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, en
109 131
 			selectmask <- selectmask & selectend
110 132
 		}
111 133
 	}
134
+	if (strand=='+') {
135
+		select <- 'preads'
136
+	} else if (strand=='-') {
137
+		select <- 'mreads'
138
+	} else if (strand=='*') {
139
+		select <- 'reads'
140
+	}
112 141
 	if (length(which(selectmask)) != length(binned.data$reads)) {
113
-		reads <- binned.data$reads[selectmask]
142
+		reads <- mcols(binned.data)[,select][as.logical(selectmask)]
114 143
 	} else {
115
-		reads <- binned.data$reads
144
+		reads <- mcols(binned.data)[,select]
116 145
 	}
117 146
 
118 147
 	# Find the x limits
... ...
@@ -127,6 +156,57 @@ plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, en
127 156
 
128 157
 }
129 158
 
159
+# =================================================================
160
+# Plot a read histogram with univariate fits for a bivariate HMM
161
+# =================================================================
162
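+# Plus- and minus-strand read counts are stacked into one set of bins and
+# passed to plotUnivariateHistogram() as a mock univariate model.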
+plotBivariateHistograms <- function(bihmm) {
163
+
164
+	binned.data <- bihmm$bins
165
+	## Stack the two strands
166
+	binned.data.minus <- binned.data
167
+	strand(binned.data.minus) <- '-'
168
+	binned.data.minus$reads <- binned.data.minus$mreads
169
+	binned.data.minus$reads.gc <- binned.data.minus$mreads.gc
170
+	binned.data.plus <- binned.data
171
+	strand(binned.data.plus) <- '+'
172
+	binned.data.plus$reads <- binned.data.plus$preads
173
+	binned.data.plus$reads.gc <- binned.data.plus$preads.gc
174
+	binned.data.stacked <- c(binned.data.minus, binned.data.plus)
175
+	mask.attributes <- c(grep('complexity', names(attributes(binned.data)), value=T), 'spikyness', 'shannon.entropy')
176
+	attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes]
177
+
178
+	## Make fake uni.hmm and plot
179
+	strand <- 'minus'
180
+	uni.hmm <- list()
181
+	uni.hmm$ID <- bihmm$ID
182
+	uni.hmm$bins <- binned.data.stacked
183
+	uni.hmm$bins$state <- uni.hmm$bins$pstate
184
+	uni.hmm$bins$pstate <- NULL
185
+	uni.hmm$bins$mstate <- NULL
186
+	uni.hmm$weights <- bihmm$weights.univariate[[strand]]
187
+	uni.hmm$distributions <- bihmm$distributions[[strand]]
188
+	class(uni.hmm) <- class.univariate.hmm
189
+	ggplts <- plotUnivariateHistogram(uni.hmm, strand='*')
190
+
191
+# 	## Make fake uni.hmm and plot
192
+# 	ggplts <- list()
193
+# 	for (strand in c('plus','minus')) {
194
+# 		uni.hmm <- list()
195
+# 		uni.hmm$ID <- bihmm$ID
196
+# 		uni.hmm$bins <- bihmm$bins
197
+# 		uni.hmm$bins$state <- uni.hmm$bins$pstate
198
+# 		uni.hmm$bins$pstate <- NULL
199
+# 		uni.hmm$bins$mstate <- NULL
200
+# 		uni.hmm$weights <- bihmm$weights.univariate[[strand]]
201
+# 		uni.hmm$distributions <- bihmm$distributions[[strand]]
202
+# 		class(uni.hmm) <- class.univariate.hmm
203
+# 		ggplts[[strand]] <- plotUnivariateHistogram(uni.hmm, strand=strand)
204
+# 	}
205
+	
206
+	return(ggplts)
207
+
208
+}
209
+
130 210
 # ============================================================
131 211
 # Plot a read histogram with univariate fits
132 212
 # ============================================================
... ...
@@ -138,7 +218,7 @@ plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, en
138 218
 #' @param state Plot the histogram only for the specified CNV-state.
139 219
 #' @param chromosome,start,end Plot the histogram only for the specified chromosome, start and end position.
140 220
 #' @return A \code{\link[ggplot2:ggplot]{ggplot}} object.
141
-plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NULL, end=NULL) {
221
+plotUnivariateHistogram <- function(model, state=NULL, strand='*', chromosome=NULL, start=NULL, end=NULL) {
142 222
 
143 223
 	# -----------------------------------------
144 224
 	# Get right x limit
... ...
@@ -175,8 +255,19 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU
175 255
 	if (!is.null(state)) {
176 256
 		selectmask <- selectmask & model$bins$state==state
177 257
 	}
178
-	reads <- model$bins$reads[selectmask]
179
-	states <- model$bins$state[selectmask]
258
+	if (strand=='+' | strand=='plus') {
259
+		select <- 'preads'
260
+	} else if (strand=='-' | strand=='minus') {
261
+		select <- 'mreads'
262
+	} else if (strand=='*' | strand=='both') {
263
+		select <- 'reads'
264
+	}
265
+	if (length(which(selectmask)) != length(model$bins$reads)) {
266
+		reads <- mcols(model$bins)[,select][as.logical(selectmask)]
267
+	} else {
268
+		reads <- mcols(model$bins)[,select]
269
+	}
270
+	states <- model$bins$state[as.logical(selectmask)]
180 271
 	if (length(which(selectmask)) != length(model$bins)) {
181 272
 		if (!is.null(state)) {
182 273
 			weights <- rep(NA, length(levels(model$bins$state)))
... ...
@@ -185,6 +276,8 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU
185 276
 				weights[istate] <- length(which(states==levels(model$bins$state)[istate==names(weights)]))
186 277
 			}
187 278
 			weights <- weights / length(states)
279
+		} else {
280
+			weights <- model$weights
188 281
 		}
189 282
 	} else {
190 283
 		if (!is.null(model$weights)) {
... ...
@@ -242,7 +335,7 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU
242 335
 	if (is.null(state)) {
243 336
 		ggplt <- ggplt + geom_line(aes_string(x='x', y='density', group='state', col='state'), data=distributions)
244 337
 	} else {
245
-		ggplt <- ggplt + geom_line(aes_string(x='x', y='density', group='state', size='state'), data=distributions[distributions$state==state,])
338
+		ggplt <- ggplt + geom_line(aes_string(x='x', y='density', group='state', col='state'), data=distributions[distributions$state==state,])
246 339
 	}
247 340
 	
248 341
 	# Make legend and colors correct
... ...
@@ -268,7 +361,7 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU
268 361
 #' @param model A \code{\link{aneuHMM}} object or \code{\link{binned.data}}.
269 362
 #' @param file A PDF file where the plot will be saved.
270 363
 #' @return A \code{\link[ggplot2:ggplot]{ggplot}} object or \code{NULL} if a file was specified.
271
-plotChromosomes <- function(model, file=NULL) {
364
+plotChromosomes <- function(model, both.strands=FALSE, file=NULL) {
272 365
 
273 366
 	if (class(model)=='GRanges') {
274 367
 		binned.data <- model
... ...
@@ -276,11 +369,11 @@ plotChromosomes <- function(model, file=NULL) {
276 369
 		model$ID <- ''
277 370
 		model$bins <- binned.data
278 371
 		model$qualityInfo <- list(shannon.entropy=qc.entropy(binned.data$reads), spikyness=qc.spikyness(binned.data$reads), complexity=attr(binned.data, 'complexity.preseqR'))
279
-		plot.chromosomes.univariate(model, file=file)
372
+		plot.chromosomes(model, both.strands=both.strands, file=file)
280 373
 	} else if (class(model)==class.univariate.hmm) {
281
-		plot.chromosomes.univariate(model, file=file)
282
-	} else if (class(model)==class.multivariate.hmm) {
283
-		plot.chromosomes.bivariate(model, file=file)
374
+		plot.chromosomes(model, both.strands=both.strands, file=file)
375
+	} else if (class(model)==class.bivariate.hmm) {
376
+		plot.chromosomes(model, both.strands=both.strands, percentages=FALSE, file=file)
284 377
 	}
285 378
 
286 379
 }
... ...
@@ -288,7 +381,7 @@ plotChromosomes <- function(model, file=NULL) {
288 381
 # ------------------------------------------------------------
289 382
 # Plot state categorization for all chromosomes
290 383
 # ------------------------------------------------------------
291
-plot.chromosomes.univariate <- function(model, file=NULL) {
384
+plot.chromosomes <- function(model, both.strands=FALSE, percentages=TRUE, file=NULL) {
292 385
 	
293 386
 	## Convert to GRanges
294 387
 	gr <- model$bins
... ...
@@ -297,12 +390,15 @@ plot.chromosomes.univariate <- function(model, file=NULL) {
297 390
 	## Get some variables
298 391
 	num.chroms <- length(levels(seqnames(gr)))
299 392
 	maxseqlength <- max(seqlengths(gr))
300
-	if (!is.null(model$distributions)) {
301
-		custom.xlim <- model$distributions['disomy','mu'] * 4
302
-	} else {
393
+# 	if (!is.null(model$distributions)) {
394
+# 		custom.xlim <- model$distributions['disomy','mu'] * 4
395
+# 	} else {
303 396
 		tab <- table(gr$reads)
304 397
 		tab <- tab[names(tab)!='0']
305 398
 		custom.xlim <- as.numeric(names(tab)[which.max(tab)]) * 4
399
+# 	}
400
+	if (both.strands) {
401
+		custom.xlim <- custom.xlim / 2
306 402
 	}
307 403
 
308 404
 	## Setup page
... ...
@@ -324,11 +420,16 @@ plot.chromosomes.univariate <- function(model, file=NULL) {
324 420
 	quality.string <- paste0('complexity = ',round(model$qualityInfo$complexity),',  spikyness = ',round(model$qualityInfo$spikyness,2),',  entropy = ',round(model$qualityInfo$shannon.entropy,2))
325 421
 	grid.text(quality.string, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:ncols), gp=gpar(fontsize=fs_x))
326 422
 
423
+	## Get SCE coordinates
424
+	if (both.strands) {
425
+		scecoords <- getSCEcoordinates(model)
426
+	}
427
+
327 428
 	## Go through chromosomes and plot
328 429
 	for (i1 in 1:num.chroms) {
329 430
 		# Get the i,j matrix positions of the regions that contain this subplot
330 431
 		matchidx <- as.data.frame(which(layout == i1+nrows.text*ncols, arr.ind = TRUE))
331
-		if (!is.null(grl[[i1]]$state)) {
432
+		if (!is.null(grl[[i1]]$state) & percentages) {
332 433
 			# Percentage of chromosome in state
333 434
 			tstate <- table(mcols(grl[[i1]])$state)
334 435
 			pstate.all <- tstate / sum(tstate)
... ...
@@ -345,10 +446,21 @@ plot.chromosomes.univariate <- function(model, file=NULL) {
345 446
 
346 447
 		# Plot the read counts
347 448
 		dfplot <- as.data.frame(grl[[i1]])
348
-		# Set values to great for plotting to limit
449
+		# Set values too big for plotting to limit
349 450
 			dfplot$reads[dfplot$reads>=custom.xlim] <- custom.xlim
350 451
 			dfplot.points <- dfplot[dfplot$reads>=custom.xlim,]
351 452
 			dfplot.points$reads <- rep(custom.xlim, nrow(dfplot.points))
453
+
454
+			if (both.strands) {
455
+				dfplot$mreads <- - dfplot$mreads	# plot minus-strand reads as negative values
456
+				dfplot$preads[dfplot$preads>=custom.xlim] <- custom.xlim
457
+				dfplot$mreads[dfplot$mreads<=-custom.xlim] <- -custom.xlim
458
+				dfplot.points.plus <- dfplot[dfplot$preads>=custom.xlim,]
459
+				dfplot.points.plus$reads <- rep(custom.xlim, nrow(dfplot.points.plus))
460
+				dfplot.points.minus <- dfplot[dfplot$mreads<=-custom.xlim,]
461
+				dfplot.points.minus$reads <- rep(-custom.xlim, nrow(dfplot.points.minus))
462
+			}
463
+
352 464
 		empty_theme <- theme(axis.line=element_blank(),
353 465
       axis.text.x=element_blank(),
354 466
       axis.text.y=element_blank(),
... ...
@@ -362,106 +474,46 @@ plot.chromosomes.univariate <- function(model, file=NULL) {
362 474
       panel.grid.minor=element_blank(),
363 475
       plot.background=element_blank())
364 476
 		ggplt <- ggplot(dfplot, aes_string(x='start', y='reads'))	# data
365
-		ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20')	# chromosome backbone as simple rectangle
366 477
 		if (!is.null(grl[[i1]]$state)) {
367
-			ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads', col='state'), size=0.2)	# read count
368
-			ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads', col='state'), size=2, shape=21)	# outliers
478
+			if (both.strands) {
479
+				ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='preads', col='pstate'), size=0.2)	# read count
480
+				ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='mreads', col='mstate'), size=0.2)	# read count
481
+				ggplt <- ggplt + geom_point(data=dfplot.points.plus, mapping=aes_string(x='start', y='reads', col='pstate'), size=5, shape=21)	# outliers
482
+				ggplt <- ggplt + geom_point(data=dfplot.points.minus, mapping=aes_string(x='start', y='reads', col='mstate'), size=5, shape=21)	# outliers
483
+			} else {
484
+				ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads', col='state'), size=0.2)	# read count
485
+				ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads', col='state'), size=2, shape=21)	# outliers
486
+			}
369 487
 			ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F)	# do not drop levels if not present
370 488
 		} else {
371
-			ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads'), size=0.2, col='gray20')	# read count
372
-			ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads'), size=2, shape=21, col='gray20')	# outliers
489
+			if (both.strands) {
490
+				ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='preads'), size=0.2, col='gray20')	# read count
491
+				ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='mreads'), size=0.2, col='gray20')	# read count
492
+				ggplt <- ggplt + geom_point(data=dfplot.points.plus, mapping=aes_string(x='start', y='reads'), size=5, shape=21, col='gray20')	# outliers
493
+				ggplt <- ggplt + geom_point(data=dfplot.points.minus, mapping=aes_string(x='start', y='reads'), size=5, shape=21, col='gray20')	# outliers
494
+			} else {
495
+				ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads'), size=0.2, col='gray20')	# read count
496
+				ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads'), size=2, shape=21, col='gray20')	# outliers
497
+			}
498
+		}
499
+		if (both.strands) {
500
+			ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim, ymax=0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20')	# chromosome backbone as simple rectangle
501
+		} else {
502
+			ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20')	# chromosome backbone as simple rectangle
503
+		}
504
+		if (both.strands) {
505
+			dfsce <- as.data.frame(scecoords[seqnames(scecoords)==names(grl)[i1]])
506
+			if (nrow(dfsce)>0) {
507
+				ggplt <- ggplt + geom_segment(data=dfsce, aes(x=start, xend=start), y=-custom.xlim, yend=-0.5*custom.xlim, arrow=arrow())
508
+			}
373 509
 		}
374 510
 		ggplt <- ggplt + empty_theme	# no axes whatsoever
375 511
 		ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2))	# chromosome names
376
-		ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim)	# set x- and y-limits
377
-		ggplt <- ggplt + coord_flip()
378
-		suppressWarnings(
379
-		print(ggplt, vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col))
380
-		)
381
-		
382
-	}
383
-	if (!is.null(file)) {
384
-		d &