Browse code

bivariate analysis added

chakalakka authored on 29/10/2014 15:34:33
Showing 12 changed files

... ...
@@ -10,7 +10,7 @@ bedGraph2binned,
10 10
 findCNVs,
11 11
 
12 12
 plot.distribution,
13
-plot.BAIT,
13
+plot.chromosomes,
14 14
 plot.genome.overview,
15 15
 get.state.table,
16 16
 get.dendrogram,
... ...
@@ -34,7 +34,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
34 34
 		tab5rows <- read.table(file, nrows=5)
35 35
 		classes.in.bed <- sapply(tab5rows, class)
36 36
 		classes <- rep("NULL",length(classes.in.bed))
37
-		classes[1:3] <- classes.in.bed[1:3]
37
+		classes[c(1:3,6)] <- classes.in.bed[c(1:3,6)]
38 38
 		data <- read.table(file, colClasses=classes)
39 39
 		# Convert to GRanges object
40 40
 		data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]+1), strand=Rle(strand("*"), nrow(data)))	# +1 to match coordinate systems
... ...
@@ -86,11 +86,15 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
86 86
 	chroms2use <- intersect(chroms2use, names(chrom.lengths))
87 87
  
88 88
 	## Determine binsize automatically
89
-	if (!is.null(reads.per.bin)) {
89
+# 	if (!is.null(reads.per.bin)) {
90 90
 		cat('Automatically determining binsizes...')
91 91
 		gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use),
92 92
 																	ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
93
-		autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr),flag=scanBamFlag(isDuplicate=F)))
93
+		if (format=='bam') {
94
+			autodata <- GenomicAlignments::readGAlignmentsFromBam(file, index=index, param=ScanBamParam(what=c("pos"),which=range(gr),flag=scanBamFlag(isDuplicate=F)))
95
+		} else if (format=='bed' | format=='bedGraph') {
96
+			autodata <- data
97
+		}
94 98
 		numreadsperbp <- length(autodata) / sum(as.numeric(chrom.lengths[chroms2use]))
95 99
 		## Pad binsizes and reads.per.bin with each others value
96 100
 		binsizes.add <- round(reads.per.bin / numreadsperbp, -2)
... ...
@@ -98,7 +102,7 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
98 102
 		binsizes <- c(binsizes, binsizes.add)
99 103
 		reads.per.bin <- c(reads.per.bin.add, reads.per.bin)
100 104
 		cat(' done\n')
101
-	}
105
+# 	}
102 106
 
103 107
 	### Do the loop for all binsizes
104 108
 	if (is.null(numbins)) {
... ...
@@ -167,7 +171,9 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
167 171
 			## Count overlaps
168 172
 			cat("counting overlaps...                     \r")
169 173
 			if (format=="bam" | format=="bed") {
170
-				reads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome])
174
+				mreads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome & strand(data)=='-'])
175
+				preads <- GenomicRanges::countOverlaps(i.binned.data, data[seqnames(data)==chromosome & strand(data)=='+'])
176
+				reads <- mreads + preads
171 177
 			} else if (format=="bedGraph") {
172 178
 				# Take the max value from all regions that fall into / overlap a given bin as read count
173 179
 				midx <- as.matrix(findOverlaps(i.binned.data, data[seqnames(data)==chromosome]))
... ...
@@ -182,14 +188,20 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
182 188
 					maxvalues[i1] <- max(signal[midx[(max.idx[i1-1]+1):(max.idx[i1]),2]])
183 189
 				}
184 190
 				reads[read.idx] <- maxvalues
191
+				mreads <- rep(NA, length(reads))
192
+				preads <- rep(NA, length(reads))
185 193
 			}
186 194
 			
187 195
 			## Concatenate
188 196
 			cat("concatenate...                           \r")
189 197
 			if (is.null(numbins)) {
190 198
 				mcols(i.binned.data)$reads <- reads
199
+				mcols(i.binned.data)$mreads <- mreads
200
+				mcols(i.binned.data)$preads <- preads
191 201
 			} else {
192 202
 				mcols(i.binned.data)$reads <- reads / (end - start)
203
+				mcols(i.binned.data)$mreads <- mreads / (end - start)
204
+				mcols(i.binned.data)$preads <- preads / (end - start)
193 205
 			}
194 206
 
195 207
 			if (separate.chroms==TRUE) {
... ...
@@ -197,9 +209,9 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
197 209
 				if (save.as.RData==TRUE) {
198 210
 					## Print to file
199 211
 					if (is.null(numbins)) {
200
-						filename <- paste0(basename(file),"_binsize_",binsize,"_reads.per.bin_",readsperbin,"_",chromosome,".RData")
212
+						filename <- paste0(basename(file),"_binsize_",format(binsize, scientific=F, trim=T),"_reads.per.bin_",readsperbin,"_",chromosome,"_.RData")
201 213
 					} else {
202
-						filename <- paste0(basename(file),"_numbin_",numbin,"_",chromosome,".RData")
214
+						filename <- paste0(basename(file),"_numbin_",format(numbin, scientific=F, trim=T),"_",chromosome,"_.RData")
203 215
 					}
204 216
 					cat("save...                                  \r")
205 217
 					save(binned.data, file=file.path(outputfolder,filename) )
... ...
@@ -217,9 +229,9 @@ align2binned <- function(file, format, index=file, chrom.length.file, outputfold
217 229
 			if (save.as.RData==TRUE) {
218 230
 				# Print to file
219 231
 				if (is.null(numbins)) {
220
-					filename <- paste(basename(file),"_binsize_",binsize,"_reads.per.bin_",readsperbin,".RData", sep="")
232
+					filename <- paste0(basename(file),"_binsize_",format(binsize, scientific=F, trim=T),"_reads.per.bin_",readsperbin,"_.RData")
221 233
 				} else {
222
-					filename <- paste(basename(file),"_numbin_",numbin,".RData", sep="")
234
+					filename <- paste0(basename(file),"_numbin_",format(numbin, scientific=F, trim=T),"_.RData")
223 235
 				}
224 236
 				cat("Saving to file ...")
225 237
 				save(binned.data, file=file.path(outputfolder,filename) )
... ...
@@ -51,12 +51,16 @@ check.integer = function(testvar) {
51 51
 check.univariate.modellist = function(modellist) {
52 52
 	if (!is(modellist,"list")) return(1)
53 53
 	for (model in modellist) {
54
-		if (!is(model,class.aneufinder.hmm)) return(2)
54
+		if (!is(model,class.univariate.hmm)) return(2)
55 55
 	}
56 56
 	return(0)
57 57
 }
58 58
 check.univariate.model = function(model) {
59
-	if (!is(model,class.aneufinder.hmm)) return(1)
59
+	if (!is(model,class.univariate.hmm)) return(1)
60
+	return(0)
61
+}
62
+check.multivariate.model = function(model) {
63
+	if (!is(model,class.multivariate.hmm)) return(1)
60 64
 	return(0)
61 65
 }
62 66
 
... ...
@@ -72,7 +72,7 @@ hmm2GRanges <- function(hmm, reduce=TRUE) {
72 72
 
73 73
 # 	library(GenomicRanges)
74 74
 	### Check user input ###
75
-	if (check.univariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate hmm object (type ?hmm for help)")
75
+	if (check.univariate.model(hmm)!=0 & check.multivariate.model(hmm)!=0) stop("argument 'hmm' expects a univariate or multivariate hmm object (type ?hmm for help)")
76 76
 	if (check.logical(reduce)!=0) stop("argument 'reduce' expects TRUE or FALSE")
77 77
 
78 78
 	### Create GRanges ###
... ...
@@ -94,6 +94,9 @@ hmm2GRanges <- function(hmm, reduce=TRUE) {
94 94
 		for (state in ustates) {
95 95
 			red.gr <- GenomicRanges::reduce(gr[hmm$states==state])
96 96
 			mcols(red.gr)$state <- rep(factor(state, levels=levels),length(red.gr))
97
+			if (class(hmm)==class.multivariate.hmm) {
98
+				mcols(red.gr)$state.separate <- matrix(rep(strsplit(as.character(state), split=' ')[[1]], length(red.gr)), byrow=T, nrow=length(red.gr))
99
+			}
97 100
 			red.gr.list[[length(red.gr.list)+1]] <- red.gr
98 101
 		}
99 102
 		# Merge and sort
... ...
@@ -104,6 +107,9 @@ hmm2GRanges <- function(hmm, reduce=TRUE) {
104 107
 		mcols(gr)$reads <- hmm$reads
105 108
 		mcols(gr)$posteriors <- hmm$posteriors
106 109
 		mcols(gr)$state <- hmm$states
110
+		if (class(hmm)==class.multivariate.hmm) {
111
+			mcols(gr)$state.separate <- as.matrix(hmm$states.separate)
112
+		}
107 113
 		return(gr)
108 114
 	}
109 115
 
... ...
@@ -1,4 +1,17 @@
1
-findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) {
1
+findCNVs <- function(binned.data, ID, method='univariate', eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) {
2
+
3
+	if (method=='univariate') {
4
+		model <- univariate.findCNVs(binned.data, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile)
5
+	} else if (method=='bivariate') {
6
+		model <- bivariate.findCNVs(binned.data, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile)
7
+	}
8
+
9
+	return(model)
10
+
11
+}
12
+
13
+
14
+univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) {
2 15
 
3 16
 	## Intercept user input
4 17
 	IDcheck <- ID  #trigger error if not defined
... ...
@@ -32,12 +45,14 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max
32 45
 	}
33 46
 
34 47
 	# Filter high reads out, makes HMM faster
35
-	read.cutoff <- as.integer(quantile(reads, read.cutoff.quantile))
48
+	read.cutoff <- quantile(reads, read.cutoff.quantile)
49
+	names.read.cutoff <- names(read.cutoff)
50
+	read.cutoff <- as.integer(read.cutoff)
36 51
 	mask <- reads > read.cutoff
37 52
 	reads[mask] <- read.cutoff
38 53
 	numfiltered <- length(which(mask))
39 54
 	if (numfiltered > 0) {
40
-		cat(paste0("Replaced read counts > ",read.cutoff," (",names(read.cutoff)," quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering.\n"))
55
+		cat(paste0("Replaced read counts > ",read.cutoff," (",names.read.cutoff," quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n"))
41 56
 	}
42 57
 	
43 58
 	## Call univariate in a for loop to enable multiple trials
... ...
@@ -81,6 +96,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max
81 96
 			reads = as.integer(reads), # int* O
82 97
 			num.bins = as.integer(numbins), # int* T
83 98
 			num.states = as.integer(numstates), # int* N
99
+			state.labels = as.integer(state.labels), # int* state_labels
84 100
 			size = double(length=numstates), # double* size
85 101
 			prob = double(length=numstates), # double* prob
86 102
 			lambda = double(length=numstates), # double* lambda
... ...
@@ -128,6 +144,7 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max
128 144
 			reads = as.integer(reads), # int* O
129 145
 			num.bins = as.integer(numbins), # int* T
130 146
 			num.states = as.integer(numstates), # int* N
147
+			state.labels = as.integer(state.labels), # int* state_labels
131 148
 			size = double(length=numstates), # double* size
132 149
 			prob = double(length=numstates), # double* prob
133 150
 			lambda = double(length=numstates), # double* lambda
... ...
@@ -160,8 +177,9 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max
160 177
 	hmm$coordinates <- data.frame(as.character(seqnames(binned.data)), start(ranges(binned.data)), end(ranges(binned.data)))
161 178
 	names(hmm$coordinates) <- coordinate.names
162 179
 	hmm$seqlengths <- seqlengths(binned.data)
163
-	class(hmm) <- class.aneufinder.hmm
164
-	hmm$states <- factor(state.labels, levels=state.labels)[hmm$states+1]
180
+	class(hmm) <- class.univariate.hmm
181
+	hmm$states <- state.labels[hmm$states]
182
+	hmm$state.labels <- state.labels
165 183
 	hmm$eps <- eps
166 184
 	hmm$A <- matrix(hmm$A, ncol=hmm$num.states)
167 185
 	rownames(hmm$A) <- state.labels
... ...
@@ -224,3 +242,225 @@ findCNVs <- function(binned.data, ID, eps=0.001, init="random", max.time=-1, max
224 242
 		return(hmm)
225 243
 	}
226 244
 }
245
+
246
+
247
+bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, output.if.not.converged=FALSE, read.cutoff.quantile=0.999) {
248
+
249
+	### Split into strands and run univariate findCNVs
250
+	binned.data.plus <- binned.data
251
+	binned.data.plus$reads <- binned.data$preads
252
+	plus.model <- univariate.findCNVs(binned.data.plus, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile)
253
+	binned.data.minus <- binned.data
254
+	binned.data.minus$reads <- binned.data$mreads
255
+	minus.model <- univariate.findCNVs(binned.data.minus, ID, eps, init, max.time, max.iter, num.trials, eps.try, num.threads, output.if.not.converged, read.cutoff.quantile)
256
+	models <- list(plus=plus.model, minus=minus.model)
257
+
258
+	### Prepare the multivariate HMM
259
+		## Extract reads and other stuff
260
+		coordinates <- models[[1]]$coordinates
261
+		seqlengths <- models[[1]]$seqlengths
262
+		distributions <- lapply(models, '[[', 'distributions')
263
+		weights <- lapply(models, '[[', 'weights')
264
+		num.uni.states <- models[[1]]$num.states
265
+		num.models <- length(models)
266
+		num.bins <- models[[1]]$num.bins
267
+		comb.states <- paste(levels(models[[1]]$states), levels(models[[2]]$states))
268
+		comb.states <- factor(comb.states, levels=comb.states)
269
+		comb.states.per.bin <- factor(do.call(paste, lapply(models, '[[', 'states')), levels=levels(comb.states))
270
+		num.comb.states <- length(comb.states)
271
+		reads <- matrix(c(models$plus$reads, models$minus$reads), ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models)))
272
+		maxreads <- max(reads)
273
+
274
+		## Pre-compute z-values for each number of reads
275
+		cat("Computing pre z-matrix...")
276
+		z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states))
277
+		xreads <- 0:maxreads
278
+		for (istrand in 1:num.models) {
279
+			model <- models[[istrand]]
280
+			for (istate in 1:num.uni.states) {
281
+				if (model$distributions[istate,'type']=='dnbinom') {
282
+					size <- model$distributions[istate,'size']
283
+					prob <- model$distributions[istate,'prob']
284
+					u <- pnbinom(xreads, size, prob)
285
+				} else if (model$distributions[istate,'type']=='delta') {
286
+					u <- rep(1, length(xreads))
287
+				}
288
+				qnorm_u <- qnorm(u)
289
+				mask <- qnorm_u==Inf
290
+				qnorm_u[mask] <- qnorm(1-1e-16)
291
+				z.per.read[ , istrand, istate] <- qnorm_u
292
+			}
293
+		}
294
+		cat(" done\n")
295
+
296
+		## Compute the z matrix
297
+		cat("Transfering values into z-matrix...")
298
+		z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(models), levels(modellist[[1]]$states)))
299
+		for (istrand in 1:num.models) {
300
+			model <- models[[istrand]]
301
+			for (istate in 1:num.uni.states) {
302
+				z.per.bin[ , istrand, istate] = z.per.read[reads[,istrand]+1, istrand, istate]
303
+			}
304
+		}
305
+		remove(z.per.read)
306
+		cat(" done\n")
307
+
308
+		## Calculate correlation matrix
309
+		cat("Computing inverse of correlation matrix...")
310
+		correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(models), strand=names(models), comb.state=comb.states))
311
+		correlationMatrixInverse <- correlationMatrix
312
+		determinant <- rep(0, num.comb.states)
313
+		names(determinant) <- comb.states
314
+		usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states
315
+		names(usestateTF) <- comb.states
316
+		for (state in comb.states) {
317
+			istate <- strsplit(state, ' ')[[1]]
318
+			mask <- which(comb.states.per.bin==state)
319
+			# Subselect z
320
+			z.temp <- matrix(NA, ncol=length(istate), nrow=length(mask))
321
+			for (i1 in 1:length(istate)) {
322
+				z.temp[,i1] <- z.per.bin[mask, i1, istate[i1]]
323
+			}
324
+			temp <- tryCatch({
325
+				correlationMatrix[,,state] <- cor(z.temp)
326
+				determinant[state] <- det( correlationMatrix[,,state] )
327
+				correlationMatrixInverse[,,state] <- solve(correlationMatrix[,,state])
328
+				if (length(z.temp)>0) {
329
+					usestateTF[state] <- TRUE
330
+				} else {
331
+					usestateTF[state] <- FALSE
332
+				}
333
+			}, warning = function(war) {
334
+				usestateTF[state] <<- FALSE
335
+				war
336
+			}, error = function(err) {
337
+				usestateTF[state] <<- FALSE
338
+				err
339
+			})
340
+		}
341
+# 		remove(z.per.bin)
342
+		cat(" done\n")
343
+
344
+		# Use nullsomy state anyways
345
+		usestateTF[1] <- TRUE
346
+		correlationMatrix = correlationMatrix[,,usestateTF]
347
+		correlationMatrixInverse = correlationMatrixInverse[,,usestateTF]
348
+		comb.states = comb.states[usestateTF]
349
+		comb.states <- droplevels(comb.states)
350
+		determinant = determinant[usestateTF]
351
+		num.comb.states <- length(comb.states)
352
+
353
+		## Calculate multivariate densities for each state
354
+		cat("Calculating multivariate densities...")
355
+		densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states))
356
+		for (state in comb.states) {
357
+			i <- which(state==comb.states)
358
+			istate <- strsplit(state, ' ')[[1]]
359
+			product <- 1
360
+			for (istrand in 1:num.models) {
361
+				if (models[[istrand]]$distributions[istate[istrand],'type'] == 'dnbinom') {
362
+					exponent <- -0.5 * apply( ( z.per.bin[ , , i] %*% (correlationMatrixInverse[ , , i] - diag(num.models)) ) * z.per.bin[ , , i], 1, sum)
363
+					size <- models[[istrand]]$distributions[istate[istrand],'size']
364
+					prob <- models[[istrand]]$distributions[istate[istrand],'prob']
365
+					product <- product * dnbinom(reads[,istrand], size, prob)
366
+					densities[,i] <- product * determinant[i]^(-0.5) * exp( exponent )
367
+				}
368
+				else if (models[[istrand]]$distributions[istate[istrand],'type'] == 'delta') {
369
+					densities[,i] <- densities[,i] * ifelse(reads[,istrand]==0, 1, 0)
370
+				}
371
+			}
372
+		}
373
+		# Check if densities are > 1
374
+		if (any(densities>1)) stop("Densities > 1")
375
+		if (any(densities<0)) stop("Densities < 0")
376
+		densities[densities>1] <- 1
377
+		densities[densities<0] <- 0
378
+		# Check if densities are 0 everywhere in some bins
379
+		check <- which(apply(densities, 1, sum) == 0)
380
+		if (length(check)>0) {
381
+			if (check[1]==1) {
382
+				densities[1,] <- rep(1e-10, ncol(densities))
383
+				check <- check[-1]
384
+			}
385
+			for (icheck in check) {
386
+				densities[icheck,] <- densities[icheck-1,]
387
+			}
388
+		}
389
+		cat(" done\n")
390
+		
391
+
392
+	### Run the multivariate HMM
393
+	# Call the C function
394
+	use.initial <- FALSE
395
+	hmm <- .C("R_multivariate_hmm",
396
+		densities = as.double(densities), # double* D
397
+		num.bins = as.integer(num.bins), # int* T
398
+		num.comb.states = as.integer(num.comb.states), # int* N
399
+		num.strands = as.integer(num.models), # int* Nmod
400
+		comb.states = as.integer(comb.states), # int* comb_states
401
+		num.iterations = as.integer(max.iter), # int* maxiter
402
+		time.sec = as.integer(max.time), # double* maxtime
403
+		loglik.delta = as.double(eps), # double* eps
404
+		states = integer(length=num.bins), # int* states
405
+		A = double(length=num.comb.states*num.comb.states), # double* A
406
+		proba = double(length=num.comb.states), # double* proba
407
+		loglik = double(length=1), # double* loglik
408
+		A.initial = double(length=num.comb.states*num.comb.states), # double* initial_A
409
+		proba.initial = double(length=num.comb.states), # double* initial_proba
410
+		use.initial.params = as.logical(use.initial), # bool* use_initial_params
411
+		num.threads = as.integer(num.threads), # int* num_threads
412
+		error = as.integer(0) # error handling
413
+		)
414
+			
415
+	# Add useful entries
416
+	class(hmm) <- class.multivariate.hmm
417
+	hmm$coordinates <- coordinates
418
+	hmm$seqlengths <- seqlengths
419
+	hmm$densities <- densities	# reassign because of matrix layout
420
+	hmm$reads <- reads
421
+	hmm$states <- comb.states[hmm$states]
422
+	# Get states as factors in data.frame
423
+		matrix.states <- matrix(unlist(strsplit(as.character(hmm$states), split=' ')), byrow=T, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models)))
424
+		toFactor <- function(column) { factor(column, levels=levels(models[[1]]$states)) }
425
+		hmm$states.separate <- list()
426
+		for (i1 in 1:num.models) {
427
+			hmm$states.separate[[names(models)[i1]]] <- toFactor(matrix.states[,i1])
428
+		}
429
+		hmm$states.separate <- as.data.frame(hmm$states.separate)
430
+	hmm$comb.states <- comb.states
431
+	hmm$eps <- eps
432
+	hmm$A <- matrix(hmm$A, ncol=num.comb.states)
433
+	colnames(hmm$A) <- comb.states
434
+	rownames(hmm$A) <- comb.states
435
+	names(hmm$proba) <- comb.states
436
+	hmm$ID <- ID
437
+	hmm$distributions.univariate <- distributions
438
+	hmm$weights.univariate <- weights
439
+	hmm$A.initial <- matrix(hmm$A.initial, ncol=num.comb.states)
440
+	colnames(hmm$A.initial) <- comb.states
441
+	rownames(hmm$A.initial) <- comb.states
442
+	names(hmm$proba.initial) <- comb.states
443
+	hmm$correlation.matrix <- correlationMatrix
444
+	hmm$correlation.matrix.inverse <- correlationMatrixInverse
445
+
446
+	# Delete redundant entries
447
+	hmm$use.initial.params <- NULL
448
+
449
+	# Check convergence
450
+	war <- NULL
451
+	if (hmm$loglik.delta > hmm$eps) {
452
+		war <- warning("HMM did not converge!\n")
453
+	}
454
+	if (hmm$error == 1) {
455
+		stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem.")
456
+	} else if (hmm$error == 2) {
457
+		stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.")
458
+	}
459
+
460
+	hmms <- list(bivariate=hmm, univariate.plus=models$plus, univariate.minus=models$minus)
461
+	class(hmms) <- class.hmm.list
462
+	return(hmms)
463
+
464
+}
465
+
466
+
... ...
@@ -1,11 +1,13 @@
1 1
 # =======================================================
2 2
 # Some global variables that can be used in all functions
3 3
 # =======================================================
4
-state.labels <- c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")
4
+state.labels <- factor(c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"))
5 5
 state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dpois','dbinom'))
6 6
 coordinate.names <- c("chrom","start","end")
7 7
 binned.data.names <- c(coordinate.names,"reads")
8
-class.aneufinder.hmm <- "aneufinder.hmm"
8
+class.univariate.hmm <- "aneufinder.univariate.hmm"
9
+class.multivariate.hmm <- "aneufinder.multivariate.hmm"
10
+class.hmm.list <- "aneufinder.hmm.list"
9 11
 state.colors <- c("mapped"="gray68","nullsomy"="gray20","null-mixed"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black")
10 12
 get.state.labels <- function() { return(state.labels) }
11 13
 get.state.colors <- function() { return(state.colors[state.labels]) }
... ...
@@ -48,7 +48,7 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL
48 48
 		states <- model$states[selectmask]
49 49
 		weights <- rep(NA, 3)
50 50
 		for (i1 in 1:length(levels(model$states))) {
51
-			weights[i1] <- length(which(states==state.labels[i1]))
51
+			weights[i1] <- length(which(states==model$state.labels[i1]))
52 52
 		}
53 53
 		weights <- weights / length(states)
54 54
 	} else {
... ...
@@ -66,6 +66,7 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL
66 66
 	ggplt <- ggplot(data.frame(reads)) + geom_histogram(aes(x=reads, y=..density..), binwidth=1, color='black', fill='white') + xlim(0,rightxlim) + theme_bw() + xlab("read count")
67 67
 
68 68
 	### Add fits to the histogram
69
+	c.state.labels <- as.character(model$state.labels)
69 70
 	numstates <- model$num.states
70 71
 	x <- 0:rightxlim
71 72
 	distributions <- list(x)
... ...
@@ -92,12 +93,12 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL
92 93
 		}
93 94
 	}
94 95
 	distributions <- as.data.frame(distributions)
95
-	names(distributions) <- c("x",state.labels)
96
+	names(distributions) <- c("x",c.state.labels)
96 97
 	# Total
97 98
 	distributions$total <- apply(distributions[-1], 1, sum)
98 99
 
99 100
 	# Reshape the data.frame for plotting with ggplot
100
-	distributions <- reshape(distributions, direction="long", varying=1+1:(numstates+1), v.names="density", timevar="state", times=c(state.labels,"total"))
101
+	distributions <- reshape(distributions, direction="long", varying=1+1:(numstates+1), v.names="density", timevar="state", times=c(c.state.labels,"total"))
101 102
 	### Plot the distributions
102 103
 	if (is.null(state)) {
103 104
 		ggplt <- ggplt + geom_line(aes(x=x, y=density, group=state, col=state), data=distributions)
... ...
@@ -108,9 +109,9 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL
108 109
 	# Make legend and colors correct
109 110
 	lmeans <- round(model$distributions[,'mu'], 2)
110 111
 	lvars <- round(model$distributions[,'variance'], 2)
111
-	legend <- paste(state.labels, ", mean=", lmeans, ", var=", lvars, sep='')
112
+	legend <- paste(c.state.labels, ", mean=", lmeans, ", var=", lvars, sep='')
112 113
 	legend <- c(legend, paste0('total, mean(data)=', round(mean(reads),2), ', var(data)=', round(var(reads),2)))
113
-	ggplt <- ggplt + scale_color_manual(breaks=c(state.labels, 'total'), values=state.colors[c(state.labels,'total')], labels=legend)
114
+	ggplt <- ggplt + scale_color_manual(breaks=c(c.state.labels, 'total'), values=state.colors[c(c.state.labels,'total')], labels=legend)
114 115
 	ggplt <- ggplt + theme(legend.position=c(1,1), legend.justification=c(1,1))
115 116
 
116 117
 	return(ggplt)
... ...
@@ -139,9 +140,22 @@ plot.boxplot <- function(model) {
139 140
 
140 141
 
141 142
 # ------------------------------------------------------------
142
-# Plot like BAIT
143
+# Plot state categorization for all chromosomes
143 144
 # ------------------------------------------------------------
144
-plot.BAIT <- function(model, file='aneufinder_BAIT_plots') {
145
+plot.chromosomes <- function(model, file=NULL) {
146
+
147
+	if (class(model)==class.univariate.hmm) {
148
+		plot.chromosomes.univariate(model, file=file)
149
+	} else if (class(model)==class.multivariate.hmm) {
150
+		plot.chromosomes.bivariate(model, file=file)
151
+	}
152
+
153
+}
154
+
155
+# ------------------------------------------------------------
156
+# Plot state categorization for all chromosomes
157
+# ------------------------------------------------------------
158
+plot.chromosomes.univariate <- function(model, file=NULL) {
145 159
 	
146 160
 	## Convert to GRanges
147 161
 	gr <- hmm2GRanges(model, reduce=F)
... ...
@@ -157,12 +171,14 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') {
157 171
 	library(ggplot2)
158 172
 	nrows <- 2	# rows for plotting chromosomes
159 173
 	ncols <- ceiling(num.chroms/nrows)
160
-	png(file=paste0(file, '.png'), width=ncols*6, height=nrows*21, units='cm', res=150)
174
+	if (!is.null(file)) {
175
+		pdf(file=paste0(file, '.pdf'), width=ncols*2.4, height=nrows*8.3)
176
+	}
161 177
 	grid.newpage()
162 178
 	layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T)
163 179
 	pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,21,21))))
164 180
 	# Main title
165
-	grid.text(file, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26))
181
+	grid.text(model$ID, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26))
166 182
 
167 183
 	## Go through chromosomes and plot
168 184
 	for (i1 in 1:num.chroms) {
... ...
@@ -181,8 +197,10 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') {
181 197
 
182 198
 		# Plot the read counts
183 199
 		dfplot <- as.data.frame(grl[[i1]])
184
-		dfplot.points <- dfplot[dfplot$reads>=custom.xlim,]
185
-		dfplot$reads <- stats::runmed(dfplot$reads, 11)
200
+		# Set values to great for plotting to limit
201
+			dfplot$reads[dfplot$reads>=custom.xlim] <- custom.xlim
202
+			dfplot.points <- dfplot[dfplot$reads>=custom.xlim,]
203
+			dfplot.points$reads <- rep(custom.xlim, nrow(dfplot.points))
186 204
 		empty_theme <- theme(axis.line=element_blank(),
187 205
       axis.text.x=element_blank(),
188 206
       axis.text.y=element_blank(),
... ...
@@ -197,19 +215,111 @@ plot.BAIT <- function(model, file='aneufinder_BAIT_plots') {
197 215
       plot.background=element_blank())
198 216
 		ggplt <- ggplot(dfplot, aes(x=start, y=reads))	# data
199 217
 		ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads, col=state), size=0.2)	# read count
200
-		ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim)	# set x- and y-limits
201 218
 		ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20')	# chromosome backbone as simple rectangle
202
-		ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads, col=state))	# outliers
219
+		ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads, col=state), size=5, shape=21)	# outliers
203 220
 		ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F)	# do not drop levels if not present
204 221
 		ggplt <- ggplt + empty_theme	# no axes whatsoever
205 222
 		ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2))	# chromosome names
223
+		ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim)	# set x- and y-limits
206 224
 		ggplt <- ggplt + coord_flip()
207 225
 		suppressWarnings(
208 226
 		print(ggplt, vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col))
209 227
 		)
210 228
 		
211 229
 	}
212
-	d <- dev.off()
230
+	if (!is.null(file)) {
231
+		d <- dev.off()
232
+	}
233
+}
234
+
235
+
236
+
237
+# ------------------------------------------------------------
238
+# Plot state categorization for all chromosomes
239
+# ------------------------------------------------------------
240
+plot.chromosomes.bivariate <- function(model, file=NULL) {
241
+	
242
+	## Convert to GRanges
243
+	gr <- hmm2GRanges(model, reduce=F)
244
+	grl <- split(gr, seqnames(gr))
245
+
246
+	## Get some variables
247
+	num.chroms <- length(levels(seqnames(gr)))
248
+	maxseqlength <- max(seqlengths(gr))
249
+	custom.xlim <- model$distributions.univariate[[1]]['monosomy','mu'] * 10
250
+
251
+	## Setup page
252
+	library(grid)
253
+	library(ggplot2)
254
+	nrows <- 2	# rows for plotting chromosomes
255
+	ncols <- ceiling(num.chroms/nrows)
256
+	if (!is.null(file)) {
257
+		pdf(file=paste0(file, '.pdf'), width=ncols*2.4, height=nrows*8.3)
258
+	}
259
+	grid.newpage()
260
+	layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T)
261
+	pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,21,21))))
262
+	# Main title
263
+	grid.text(model$ID, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:ncols), gp=gpar(fontsize=26))
264
+
265
+	## Go through chromosomes and plot
266
+	for (i1 in 1:num.chroms) {
267
+		# Get the i,j matrix positions of the regions that contain this subplot
268
+		matchidx <- as.data.frame(which(layout == i1+ncols, arr.ind = TRUE))
269
+
270
+		# Percentage of chromosome in state
271
+		tstate <- table(mcols(grl[[i1]])$state)
272
+		pstate.all <- tstate / sum(tstate)
273
+		pstate <- round(pstate.all*100)[-1]	# without 'nullsomy / unmapped' state
274
+		pstring <- apply(pstate, 1, function(x) { paste0(": ", x, "%") })
275
+		pstring <- paste0(names(pstring), pstring)
276
+		pstring <- paste(pstring[which.max(pstate)], collapse="\n")
277
+		pstring2 <- round(pstate.all*100)[1]	# only 'nullsomy / unmapped'
278
+		pstring2 <- paste0(names(pstring2), ": ", pstring2, "%")
279
+
280
+		## Convert to data.frame for plotting and prepare the data.frame
281
+			dfplot <- as.data.frame(grl[[i1]])
282
+			dfplot$reads.minus <- - dfplot$reads.minus
283
+		# Set values to great for plotting to limit
284
+			dfplot$reads.plus[dfplot$reads.plus>=custom.xlim] <- custom.xlim
285
+			dfplot$reads.minus[dfplot$reads.minus<=-custom.xlim] <- -custom.xlim
286
+			dfplot.points.plus <- dfplot[dfplot$reads.plus>=custom.xlim,]
287
+			dfplot.points.plus$reads <- rep(custom.xlim, nrow(dfplot.points.plus))
288
+			dfplot.points.minus <- dfplot[dfplot$reads.minus<=-custom.xlim,]
289
+			dfplot.points.minus$reads <- rep(-custom.xlim, nrow(dfplot.points.minus))
290
+		# No axes and labels
291
+		empty_theme <- theme(axis.line=element_blank(),
292
+      axis.text.x=element_blank(),
293
+      axis.text.y=element_blank(),
294
+      axis.ticks=element_blank(),
295
+			axis.title.x=element_text(size=20),
296
+      axis.title.y=element_blank(),
297
+      legend.position="none",
298
+      panel.background=element_blank(),
299
+      panel.border=element_blank(),
300
+      panel.grid.major=element_blank(),
301
+      panel.grid.minor=element_blank(),
302
+      plot.background=element_blank())
303
+		# Plot
304
+		ggplt <- ggplot(dfplot, aes(x=start, y=reads.plus))	# data
305
+		ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads.plus, col=state.separate.plus), size=0.2)	# read count
306
+		ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads.minus, col=state.separate.minus), size=0.2)	# read count
307
+		ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim, ymax=0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20')	# chromosome backbone as simple rectangle
308
+		ggplt <- ggplt + geom_point(data=dfplot.points.plus, mapping=aes(x=start, y=reads, col=state.separate.plus), size=5, shape=21)	# outliers
309
+		ggplt <- ggplt + geom_point(data=dfplot.points.minus, mapping=aes(x=start, y=reads, col=state.separate.minus), size=5, shape=21)	# outliers
310
+		ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F)	# do not drop levels if not present
311
+		ggplt <- ggplt + empty_theme	# no axes whatsoever
312
+		ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2))	# chromosome names
313
+		ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-custom.xlim,custom.xlim)	# set x- and y-limits
314
+		ggplt <- ggplt + coord_flip()
315
+		suppressWarnings(
316
+		print(ggplt, vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col))
317
+		)
318
+		
319
+	}
320
+	if (!is.null(file)) {
321
+		d <- dev.off()
322
+	}
213 323
 }
214 324
 
215 325
 
... ...
@@ -241,7 +351,7 @@ plot.genome.overview <- function(modellist, file='aneufinder_genome_overview', n
241 351
 	library(ggplot2)
242 352
 	nrows <- length(uni.hmm.grl)	# rows for plotting genomes
243 353
 	ncols <- 1
244
-	png(file=paste0(file, '.png'), width=ncols*60, height=nrows*5, units='cm', res=150)
354
+	pdf(file=paste0(file, '.pdf'), width=ncols*24, height=nrows*2)
245 355
 	grid.newpage()
246 356
 	layout <- matrix(1:((nrows+1)*ncols), ncol=ncols, nrow=nrows+1, byrow=T)
247 357
 	pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout), heights=c(1,rep(10,length(uni.hmm.grl))))))
... ...
@@ -7,7 +7,7 @@
7 7
 // This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R.
8 8
 // ===================================================================================================================================================
9 9
 extern "C" {
10
-void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double* lambda, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, int* iniproc, double* initial_size, double* initial_prob, double* initial_lambda, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff)
10
+void R_univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, double* lambda, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, int* iniproc, double* initial_size, double* initial_prob, double* initial_lambda, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff)
11 11
 {
12 12
 
13 13
 	// Define logging level
... ...
@@ -155,7 +155,7 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double
155 155
 		{
156 156
 			posterior_per_t[iN] = hmm->get_posterior(iN, t);
157 157
 		}
158
-		states[t] = argMax(posterior_per_t, *N);
158
+		states[t] = state_labels[argMax(posterior_per_t, *N)];
159 159
 	}
160 160
 
161 161
 	FILE_LOG(logDEBUG1) << "Return parameters";
... ...
@@ -210,3 +210,135 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, double
210 210
 }
211 211
 } // extern C
212 212
 
213
+
214
+// =====================================================================================================================================================
215
+// This function takes parameters from R, creates a multivariate HMM object, runs the Baum-Welch and returns the result to R.
216
+// =====================================================================================================================================================
217
+extern "C" {
218
+void R_multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error)
219
+{
220
+
221
+	// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}
222
+// 	FILE* pFile = fopen("chromStar.log", "w");
223
+// 	Output2FILE::Stream() = pFile;
224
+//  	FILELog::ReportingLevel() = FILELog::FromString("ITERATION");
225
+//  	FILELog::ReportingLevel() = FILELog::FromString("DEBUG2");
226
+ 	FILELog::ReportingLevel() = FILELog::FromString("ERROR");
227
+
228
+	// Parallelization settings
229
+// 	omp_set_num_threads(*num_threads);
230
+
231
+	// Print some information
232
+	FILE_LOG(logINFO) << "number of states = " << *N;
233
+	Rprintf("number of states = %d\n", *N);
234
+	FILE_LOG(logINFO) << "number of bins = " << *T;
235
+	Rprintf("number of bins = %d\n", *T);
236
+	if (*maxiter < 0)
237
+	{
238
+		FILE_LOG(logINFO) << "maximum number of iterations = none";
239
+		Rprintf("maximum number of iterations = none\n");
240
+	} else {
241
+		FILE_LOG(logINFO) << "maximum number of iterations = " << *maxiter;
242
+		Rprintf("maximum number of iterations = %d\n", *maxiter);
243
+	}
244
+	if (*maxtime < 0)
245
+	{
246
+		FILE_LOG(logINFO) << "maximum running time = none";
247
+		Rprintf("maximum running time = none\n");
248
+	} else {
249
+		FILE_LOG(logINFO) << "maximum running time = " << *maxtime << " sec";
250
+		Rprintf("maximum running time = %d sec\n", *maxtime);
251
+	}
252
+	FILE_LOG(logINFO) << "epsilon = " << *eps;
253
+	Rprintf("epsilon = %g\n", *eps);
254
+	FILE_LOG(logINFO) << "number of modifications = " << *Nmod;
255
+	Rprintf("number of modifications = %d\n", *Nmod);
256
+
257
+	// Flush Rprintf statements to console
258
+	R_FlushConsole();
259
+
260
+	// Recode the densities vector to matrix representation
261
+// 	clock_t clocktime = clock(), dtime;
262
+	double** multiD = allocDoubleMatrix(*N, *T);
263
+	for (int iN=0; iN<*N; iN++)
264
+	{
265
+		for (int t=0; t<*T; t++)
266
+		{
267
+			multiD[iN][t] = D[iN*(*T)+t];
268
+		}
269
+	}
270
+// 	dtime = clock() - clocktime;
271
+// 	FILE_LOG(logDEBUG1) << "recoding densities vector to matrix representation: " << dtime << " clicks";
272
+
273
+	// Create the HMM
274
+	FILE_LOG(logDEBUG1) << "Creating the multivariate HMM";
275
+	ScaleHMM* hmm = new ScaleHMM(*T, *N, *Nmod, multiD);
276
+	// Initialize the transition probabilities and proba
277
+	hmm->initialize_transition_probs(initial_A, *use_initial_params);
278
+	hmm->initialize_proba(initial_proba, *use_initial_params);
279
+	
280
+	// Print logproba and A
281
+// 	for (int iN=0; iN<*N; iN++)
282
+// 	{
283
+// 		FILE_LOG(logDEBUG) << "proba["<<iN<<"] = " <<exp(hmm->logproba[iN]);
284
+// 		for (int jN=0; jN<*N; jN++)
285
+// 		{
286
+// 			FILE_LOG(logDEBUG) << "A["<<iN<<"]["<<jN<<"] = " << hmm->A[iN][jN];
287
+// 		}
288
+// 	}
289
+
290
+	// Estimate the parameters
291
+	FILE_LOG(logDEBUG1) << "Starting Baum-Welch estimation";
292
+	try
293
+	{
294
+		hmm->baumWelch(maxiter, maxtime, eps);
295
+	}
296
+	catch (std::exception& e)
297
+	{
298
+		FILE_LOG(logERROR) << "Error in Baum-Welch: " << e.what();
299
+		Rprintf("Error in Baum-Welch: %s\n", e.what());
300
+		if (e.what()=="nan detected") { *error = 1; }
301
+		else { *error = 2; }
302
+	}
303
+	FILE_LOG(logDEBUG1) << "Finished with Baum-Welch estimation";
304
+	
305
+// 	// Compute the posteriors and save results directly to the R pointer
306
+// 	FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
307
+// 	for (int iN=0; iN<*N; iN++)
308
+// 	{
309
+// 		for (int t=0; t<*T; t++)
310
+// 		{
311
+// 			posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
312
+// 		}
313
+// 	}
314
+
315
+	// Compute the states from posteriors
316
+	FILE_LOG(logDEBUG1) << "Computing states from posteriors";
317
+	double posterior_per_t [*N];
318
+	for (int t=0; t<*T; t++)
319
+	{
320
+		for (int iN=0; iN<*N; iN++)
321
+		{
322
+			posterior_per_t[iN] = hmm->get_posterior(iN, t);
323
+		}
324
+		states[t] = comb_states[argMax(posterior_per_t, *N)];
325
+	}
326
+	
327
+	FILE_LOG(logDEBUG1) << "Return parameters";
328
+	// also return the estimated transition matrix and the initial probs
329
+	for (int i=0; i<*N; i++)
330
+	{
331
+		proba[i] = hmm->get_proba(i);
332
+		for (int j=0; j<*N; j++)
333
+		{
334
+				A[i * (*N) + j] = hmm->get_A(j,i);
335
+		}
336
+	}
337
+	*loglik = hmm->get_logP();
338
+
339
+	FILE_LOG(logDEBUG1) << "Deleting the hmm";
340
+	delete hmm;
341
+	freeDoubleMatrix(multiD, *N);
342
+}
343
+} // extern C
344
+
... ...
@@ -1382,4 +1382,40 @@ double Geometric::get_prob()
1382 1382
 }
1383 1383
 
1384 1384
 
1385
+// ============================================================
1386
+// Multivariate Copula Approximation
1387
+// ============================================================
1388
+
1389
+// Constructor and Destructor ---------------------------------
1390
+MVCopulaApproximation::MVCopulaApproximation(int** multiobservations, int T, std::vector<Density*> marginals, double* cor_matrix_inv, double cor_matrix_determinant)
1391
+{
1392
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
1393
+	this->multi_obs = multiobservations;
1394
+	this->T = T;
1395
+	// these are the marginal distributions (we need their CDF function)
1396
+	this->marginals = marginals;
1397
+	this->Nmod = this->marginals.size();
1398
+	this->cor_matrix_inv = cor_matrix_inv;
1399
+	this->cor_matrix_determinant = cor_matrix_determinant;
1400
+}
1401
+
1402
+MVCopulaApproximation::~MVCopulaApproximation()
1403
+{
1404
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
1405
+	for (int imod; imod<this->Nmod; imod++)
1406
+	{
1407
+		delete this->marginals[imod];
1408
+	}
1409
+}
1410
+
1411
+// Methods ----------------------------------------------------
1412
+
1413
+// Getter and Setter ------------------------------------------
1414
+DensityName MVCopulaApproximation::get_name()
1415
+{
1416
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
1417
+	return(OTHER);
1418
+}
1419
+
1420
+	
1385 1421
 
... ...
@@ -1,10 +1,12 @@
1 1
 #include <Rmath.h> // dnorm(), dnbinom() and digamma() etc.
2
+#include <vector> // storing density functions in MVCopula
2 3
 #include "utility.h" // FILE_LOG(), intMax()
3 4
 
4 5
 #ifndef DENSITIES_H
5 6
 #define DENSITIES_H
6 7
 
7
-enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL, GEOMETRIC, POISSON, BINOMIAL};
8
+enum whichvariate {UNIVARIATE, MULTIVARIATE};
9
+enum DensityName {ZERO_INFLATION, NORMAL, NEGATIVE_BINOMIAL, GEOMETRIC, POISSON, BINOMIAL, OTHER};
8 10
 
9 11
 class Density
10 12
 {
... ...
@@ -230,4 +232,26 @@ class Geometric : public Density
230 232
 };
231 233
 
232 234
 
235
+class MVCopulaApproximation : public Density {
236
+	public:
237
+		// Constructor and Destructor
238
+		MVCopulaApproximation(int** multiobservations, int T, std::vector<Density*> marginals, double* cor_matrix_inv, double cor_matrix_determinant);
239
+		~MVCopulaApproximation();
240
+	
241
+		// Methods
242
+
243
+		// Getters and Setters
244
+		DensityName get_name();
245
+
246
+	private:
247
+		// Member variables
248
+		int Nmod; ///< number of modifications
249
+		int** multi_obs; ///< matrix [Nmod x T] of observations
250
+		int T; ///< length of observation vector
251
+		std::vector<Density*> marginals; ///< vector [Nmod] of marginal distributions
252
+		double* cor_matrix_inv; ///< vector with elements of the inverse of the correlation matrix
253
+		double cor_matrix_determinant; ///< determinant of the correlation matrix
254
+};
255
+
256
+
233 257
 #endif
... ...
@@ -10,6 +10,8 @@
10 10
 ScaleHMM::ScaleHMM(int T, int N)
11 11
 {
12 12
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
13
+	FILE_LOG(logDEBUG2) << "Initializing univariate ScaleHMM";
14
+	this->xvariate = UNIVARIATE;
13 15
 	this->T = T;
14 16
 	this->N = N;
15 17
 	this->A = allocDoubleMatrix(N, N);
... ...
@@ -30,6 +32,29 @@ ScaleHMM::ScaleHMM(int T, int N)
30 32
 
31 33
 }
32 34
 
35
+
36
+ScaleHMM::ScaleHMM(int T, int N, int Nmod, double** densities)
37
+{
38
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
39
+	FILE_LOG(logDEBUG2) << "Initializing multivariate ScaleHMM";
40
+	this->xvariate = MULTIVARIATE;
41
+	this->T = T;
42
+	this->N = N;
43
+	this->A = allocDoubleMatrix(N, N);
44
+	this->scalefactoralpha = (double*) calloc(T, sizeof(double));
45
+	this->scalealpha = allocDoubleMatrix(T, N);
46
+	this->scalebeta = allocDoubleMatrix(T, N);
47
+	this->densities = densities;
48
+	this->proba = (double*) calloc(N, sizeof(double));
49
+	this->gamma = allocDoubleMatrix(N, T);
50
+	this->sumgamma = (double*) calloc(N, sizeof(double));
51
+	this->sumxi = allocDoubleMatrix(N, N);
52
+	this->logP = -INFINITY;
53
+	this->dlogP = INFINITY;
54
+	this->Nmod = Nmod;
55
+
56
+}
57
+
33 58
 ScaleHMM::~ScaleHMM()
34 59
 {
35 60
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
... ...
@@ -37,16 +62,19 @@ ScaleHMM::~ScaleHMM()
37 62
 	free(this->scalefactoralpha);
38 63
 	freeDoubleMatrix(this->scalealpha, this->T);
39 64
 	freeDoubleMatrix(this->scalebeta, this->T);
40
-	freeDoubleMatrix(this->densities, this->N);
41 65
 // 	freeDoubleMatrix(this->tdensities, this->T);
42 66
 	freeDoubleMatrix(this->gamma, this->N);
43 67
 	freeDoubleMatrix(this->sumxi, this->N);
44 68
 	free(this->proba);
45 69
 	free(this->sumgamma);
46
-	for (int iN=0; iN<this->N; iN++)
70
+	if (this->xvariate == UNIVARIATE)
47 71
 	{
48
-		FILE_LOG(logDEBUG1) << "Deleting density functions"; 
49
-		delete this->densityFunctions[iN];
72
+		freeDoubleMatrix(this->densities, this->N);
73
+		for (int iN=0; iN<this->N; iN++)
74
+		{
75
+			FILE_LOG(logDEBUG1) << "Deleting density functions"; 
76
+			delete this->densityFunctions[iN];
77
+		}
50 78
 	}
51 79
 }
52 80
 
... ...
@@ -79,7 +107,7 @@ void ScaleHMM::initialize_transition_probs(double* initial_A, bool use_initial_p
79 107
 				else
80 108
 					this->A[iN][jN] = other;
81 109
 				// Save value to initial A
82
-				initial_A[iN*this->N + jN] = this->A[jN][iN];
110
+				initial_A[jN*this->N + iN] = this->A[iN][jN];
83 111
 			}
84 112
 		}
85 113
 	}
... ...
@@ -123,10 +151,17 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps)
123 151
 	this->baumWelchStartTime_sec = time(NULL);
124 152
 
125 153
 	// Print some initial information
126
-	FILE_LOG(logINFO) << "";
127
-	FILE_LOG(logINFO) << "INITIAL PARAMETERS";
128
-// 	this->print_uni_params();
129
-	this->print_uni_iteration(0);
154
+	if (this->xvariate == UNIVARIATE)
155
+	{
156
+		FILE_LOG(logINFO) << "";
157
+		FILE_LOG(logINFO) << "INITIAL PARAMETERS";
158
+// 		this->print_uni_params();
159
+		this->print_uni_iteration(0);
160
+	}
161
+	else if (this->xvariate == MULTIVARIATE)
162
+	{
163
+		this->print_multi_iteration(0);
164
+	}
130 165
 
131 166
 	R_CheckUserInterrupt();
132 167
 
... ...
@@ -137,9 +172,12 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps)
137 172
 
138 173
 		iteration++;
139 174
 		
140
-		FILE_LOG(logDEBUG1) << "Calling calc_densities() from baumWelch()";
141
-		try { this->calc_densities(); } catch(...) { throw; }
142
-		R_CheckUserInterrupt();
175
+		if (this->xvariate == UNIVARIATE)
176
+		{
177
+			FILE_LOG(logDEBUG1) << "Calling calc_densities() from baumWelch()";
178
+			try { this->calc_densities(); } catch(...) { throw; }
179
+			R_CheckUserInterrupt();
180
+		}
143 181
 
144 182
 		FILE_LOG(logDEBUG1) << "Calling forward() from baumWelch()";
145 183
 		try { this->forward(); } catch(...) { throw; }
... ...
@@ -167,26 +205,36 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps)
167 205
 		this->calc_sumgamma();
168 206
 		R_CheckUserInterrupt();
169 207
 
170
-// 		clock_t clocktime = clock(), dtime;
171
-		// difference in posterior
172
-		FILE_LOG(logDEBUG1) << "Calculating differences in posterior in baumWelch()";
173
-		double postsum = 0.0;
174
-		for (int t=0; t<this->T; t++)
208
+		if (this->xvariate == UNIVARIATE)
175 209
 		{
176
-			for (int iN=0; iN<this->N; iN++)
210
+// 			clock_t clocktime = clock(), dtime;
211
+			// difference in posterior
212
+			FILE_LOG(logDEBUG1) << "Calculating differences in posterior in baumWelch()";
213
+			double postsum = 0.0;
214
+			for (int t=0; t<this->T; t++)
177 215
 			{
178
-				postsum += fabs(this->gamma[iN][t] - gammaold[iN][t]);
179
-				gammaold[iN][t] = this->gamma[iN][t];
216
+				for (int iN=0; iN<this->N; iN++)
217
+				{
218
+					postsum += fabs(this->gamma[iN][t] - gammaold[iN][t]);
219
+					gammaold[iN][t] = this->gamma[iN][t];
220
+				}
180 221
 			}
222
+			this->sumdiff_posterior = postsum;
223
+// 			dtime = clock() - clocktime;
224
+// 			FILE_LOG(logDEBUG) << "differences in posterior: " << dtime << " clicks";
181 225
 		}
182
-		this->sumdiff_posterior = postsum;
183
-// 		dtime = clock() - clocktime;
184
-// 		FILE_LOG(logDEBUG) << "differences in posterior: " << dtime << " clicks";
185 226
 
186 227
 		R_CheckUserInterrupt();
187 228
 
188 229
 		// Print information about current iteration
189
-		this->print_uni_iteration(iteration);
230
+		if (this->xvariate == UNIVARIATE)
231
+		{
232
+			this->print_uni_iteration(iteration);
233
+		}
234
+		else if (this->xvariate == MULTIVARIATE)
235
+		{
236
+			this->print_multi_iteration(iteration);
237
+		}
190 238
 
191 239
 		// Check convergence
192 240
 		if(this->dlogP < *eps) //it has converged
... ...
@@ -254,81 +302,51 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps)
254 302
 			}
255 303
 		}
256 304
 
257
-		clock_t clocktime = clock(), dtime;
258
-
259
-// 		// Update all distributions independantly
260
-// 		for (int iN=0; iN<this->N; iN++)
261
-// 		{
262
-// 			this->densityFunctions[iN]->update(this->gamma[iN]);
263
-// 		}
264
-
265
-		// Update distribution of state 'null-mixed' and 'monosomic', set others as multiples of 'monosomic'
266
-		// This loop assumes that the negative binomial states come last and are consecutive
267
-		int xsomy = 1;
268
-		for (int iN=0; iN<this->N; iN++)
305
+		if (this->xvariate == UNIVARIATE)
269 306
 		{
270
-			if (this->densityFunctions[iN]->get_name() == ZERO_INFLATION) {}
271
-			if (this->densityFunctions[iN]->get_name() == GEOMETRIC)
272
-			{
273
-				this->densityFunctions[iN]->update(this->gamma[iN]);
274
-			}
275
-			if (this->densityFunctions[iN]->get_name() == NEGATIVE_BINOMIAL)
307
+// 			clock_t clocktime = clock(), dtime;
308
+// 
309
+// 			// Update all distributions independantly
310
+// 			for (int iN=0; iN<this->N; iN++)
311
+// 			{
312
+// 				this->densityFunctions[iN]->update(this->gamma[iN]);
313
+// 			}
314
+
315
+			// Update distribution of state 'null-mixed' and 'monosomic', set others as multiples of 'monosomic'
316
+			// This loop assumes that the negative binomial states come last and are consecutive
317
+			int xsomy = 1;
318
+			for (int iN=0; iN<this->N; iN++)
276 319
 			{
277
-				if (xsomy==1)
320
+				if (this->densityFunctions[iN]->get_name() == ZERO_INFLATION) {}
321
+				if (this->densityFunctions[iN]->get_name() == GEOMETRIC)
322
+				{
323
+					this->densityFunctions[iN]->update(this->gamma[iN]);
324
+				}
325
+				if (this->densityFunctions[iN]->get_name() == NEGATIVE_BINOMIAL)
278 326
 				{
279
-					FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance();
280
-					this->densityFunctions[iN]->update_constrained(this->gamma, iN, this->N);
281
-					double mean1 = this->densityFunctions[iN]->get_mean();
282
-					double variance1 = this->densityFunctions[iN]->get_variance();
283
-					FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance();
284
-					// Set others as multiples
285
-					for (int jN=iN+1; jN<this->N; jN++)
327
+					if (xsomy==1)
286 328
 					{
287
-						this->densityFunctions[jN]->set_mean(mean1 * (jN-iN+1));
288
-						this->densityFunctions[jN]->set_variance(variance1 * (jN-iN+1));
289
-						FILE_LOG(logDEBUG1) << "mean(state="<<jN<<") = " << this->densityFunctions[jN]->get_mean() << ", var(state="<<jN<<") = " << this->densityFunctions[jN]->get_variance();
329
+						FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance();
330
+						this->densityFunctions[iN]->update_constrained(this->gamma, iN, this->N);
331
+						double mean1 = this->densityFunctions[iN]->get_mean();
332
+						double variance1 = this->densityFunctions[iN]->get_variance();
333
+						FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance();
334
+						// Set others as multiples
335
+						for (int jN=iN+1; jN<this->N; jN++)
336
+						{
337
+							this->densityFunctions[jN]->set_mean(mean1 * (jN-iN+1));
338
+							this->densityFunctions[jN]->set_variance(variance1 * (jN-iN+1));
339
+							FILE_LOG(logDEBUG1) << "mean(state="<<jN<<") = " << this->densityFunctions[jN]->get_mean() << ", var(state="<<jN<<") = " << this->densityFunctions[jN]->get_variance();
340
+						}
341
+						break;
290 342
 					}
291
-					break;
343
+					xsomy++;
292 344
 				}
293
-				xsomy++;
294 345
 			}
346
+// 			dtime = clock() - clocktime;
347
+// 			FILE_LOG(logDEBUG) << "updating distributions: " << dtime << " clicks";
348
+			R_CheckUserInterrupt();
295 349
 		}
296
-			
297
-// 		// Update distribution of state 'null-mixed' and 'disomic', set others as multiples of 'disomic'/2
298
-// 		// This loop assumes that the negative binomial states come last and are consecutive
299
-// 		int xsomy = 1;
300
-// 		for (int iN=0; iN<this->N; iN++)
301
-// 		{
302
-// 			if (this->densityFunctions[iN]->get_name() == ZERO_INFLATION) {}
303
-// 			if (this->densityFunctions[iN]->get_name() == GEOMETRIC)
304
-// 			{
305
-// 				this->densityFunctions[iN]->update(this->gamma[iN]);
306
-// 			}
307
-// 			if (this->densityFunctions[iN]->get_name() == NEGATIVE_BINOMIAL)
308
-// 			{
309
-// 				if (xsomy==2)
310
-// 				{
311
-// 					FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance();
312
-// 					this->densityFunctions[iN]->update(this->gamma[iN]);
313
-// 					double mean1 = this->densityFunctions[iN]->get_mean() / xsomy;
314
-// 					double variance1 = this->densityFunctions[iN]->get_variance() / xsomy;
315
-// 					FILE_LOG(logDEBUG1) << "mean(state="<<iN<<") = " << this->densityFunctions[iN]->get_mean() << ", var(state="<<iN<<") = " << this->densityFunctions[iN]->get_variance();
316
-// 					// Set others as multiples
317
-// 					for (int jN=2; jN<this->N; jN++)
318
-// 					{
319
-// 						this->densityFunctions[jN]->set_mean(mean1 * (jN-iN+2));
320
-// 						this->densityFunctions[jN]->set_variance(variance1 * (jN-iN+2));
321
-// 						FILE_LOG(logDEBUG1) << "mean(state="<<jN<<") = " << this->densityFunctions[jN]->get_mean() << ", var(state="<<jN<<") = " << this->densityFunctions[jN]->get_variance();
322
-// 					}
323
-// 					break;
324
-// 				}
325
-// 				xsomy++;
326
-// 			}
327
-// 		}
328
-			
329
-		dtime = clock() - clocktime;
330
-	 	FILE_LOG(logDEBUG) << "updating distributions: " << dtime << " clicks";
331
-		R_CheckUserInterrupt();
332 350
 
333 351
 	} /* main loop end */
334 352
     
... ...
@@ -338,7 +356,7 @@ void ScaleHMM::baumWelch(int* maxiter, int* maxtime, double* eps)
338 356
 	FILE_LOG(logINFO) << "FINAL ESTIMATION RESULTS";
339 357
 // 	this->print_uni_params();
340 358
 
341
-	/* free memory */
359
+	// free memory
342 360
 	freeDoubleMatrix(gammaold, this->N);
343 361
 
344 362
 	// Return values
... ...
@@ -816,6 +834,37 @@ void ScaleHMM::print_uni_iteration(int iteration)
816 834
 	R_FlushConsole();
817 835
 }
818 836
 
837
+void ScaleHMM::print_multi_iteration(int iteration)
838
+{
839
+	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
840
+	this->baumWelchTime_real = difftime(time(NULL),this->baumWelchStartTime_sec);
841
+	int bs = 86;
842
+	char buffer [bs];
843
+	if (iteration % 20 == 0)
844
+	{
845
+		snprintf(buffer, bs, "%10s%20s%20s%15s", "Iteration", "log(P)", "dlog(P)", "Time in sec");
846
+		FILE_LOG(logITERATION) << buffer;
847
+		Rprintf("%s\n", buffer);
848
+	}
849
+	if (iteration == 0)
850
+	{
851
+		snprintf(buffer, bs, "%10s%20s%20s%*d", "0", "-inf", "-", 15, this->baumWelchTime_real);
852
+	}
853
+	else if (iteration == 1)
854
+	{
855
+		snprintf(buffer, bs, "%*d%*f%20s%*d", 10, iteration, 20, this->logP, "inf", 15, this->baumWelchTime_real);
856
+	}
857
+	else
858
+	{
859
+		snprintf(buffer, bs, "%*d%*f%*f%*d", 10, iteration, 20, this->logP, 20, this->dlogP, 15, this->baumWelchTime_real);
860
+	}
861
+	FILE_LOG(logITERATION) << buffer;
862
+	Rprintf("%s\n", buffer);
863
+
864
+	// Flush Rprintf statements to R console
865
+	R_FlushConsole();
866
+}
867
+
819 868
 void ScaleHMM::print_uni_params()
820 869
 {
821 870
 	FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__;
... ...
@@ -13,6 +13,7 @@ class ScaleHMM  {
13 13
 	public:
14 14
 		// Constructor and Destructor
15 15
 		ScaleHMM(int T, int N);
16
+		ScaleHMM(int T, int N, int Nmod, double** densities);
16 17
 		~ScaleHMM();
17 18
 
18 19
 		// Member variables
... ...
@@ -38,6 +39,7 @@ class ScaleHMM  {
38 39
 		int T; ///< length of observed sequence
39 40
 		int* obs; ///< vector [T] of observations
40 41
 		int N; ///< number of states
42
+		int Nmod; ///< number of modifications / marks
41 43
 		int cutoff; ///< a cutoff for observations
42 44
 		double* sumgamma; ///< vector[N] of sum of posteriors (gamma values)
43 45
 		double** sumxi; ///< matrix[N x N] of xi values
... ...
@@ -56,6 +58,7 @@ class ScaleHMM  {
56 58
 		int sumdiff_state_last; ///< sum of the difference in the state 1 assignments from one iteration to the next
57 59
 		double sumdiff_posterior; ///< sum of the difference in posterior (gamma) values from one iteration to the next
58 60
 // 		bool use_tdens; ///< switch for using the tdensities in the calculations
61
+		whichvariate xvariate; ///< enum which stores if UNIVARIATE or MULTIVARIATE
59 62
 
60 63
 		// Methods
61 64
 		void forward(); ///< calculate forward variables (alpha)
... ...
@@ -65,6 +68,7 @@ class ScaleHMM  {
65 68
 		void calc_loglikelihood();
66 69
 		void calc_densities();
67 70
 		void print_uni_iteration(int iteration);
71
+		void print_multi_iteration(int iteration);
68 72
 		void print_uni_params();
69 73
 };
70 74