univariate.findCNVs with stepsize

chakalakka authored on 17/05/2017 11:38:38
Showing 13 changed files

... ...
@@ -62,10 +62,9 @@ blacklist <- function(files, assembly, bins, min.mapq=10, pairedEndReads=FALSE)
 #' @inheritParams bam2GRanges
 #' @inheritParams bed2GRanges
 #' @return A \code{\link{GRanges}} object with reads.
-#' @examples 
-#' files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
-#' reads <- mergeStrandseqFiles(files, assembly='hg38')
-#' 
+#' @examples
+#' #files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
+#' #reads <- mergeStrandseqFiles(files, assembly='hg38')
 mergeStrandseqFiles <- function(files, assembly, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000) {
   
   	## Determine format
... ...
@@ -15,7 +15,7 @@
 #'@examples
 #'## Get an example BED file with single-cell-sequencing reads
 #'bedfile <- system.file("extdata", "KK150311_VI_07.bam.bed.gz", package="AneuFinderData")
-#'## Bin the BAM file into bin size 1Mp
+#'## Bin the data into bin size 1Mp
 #'binned <- binReads(bedfile, assembly='mm10', binsize=1e6,
 #'                   chromosomes=c(1:19,'X','Y'))
 #'## Fit the Hidden Markov Model
... ...
@@ -26,7 +26,7 @@
 findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation",paste0(0:10,"-somy")), most.frequent.state="2-somy", method="HMM", algorithm="EM", initial.params=NULL) {

 	## Intercept user input
-  binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
+  binned.data <- loadFromFiles(binned.data, check.class=c('GRanges', 'GRangesList'))[[1]]
 	if (is.null(ID)) {
 		ID <- attr(binned.data, 'ID')
 	}
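
With this change findCNVs() accepts either a single GRanges of binned read counts or a GRangesList holding one binning per offset. A minimal usage sketch (not part of the commit), assuming GenomicRanges is attached; the toy binnings and offset names are illustrative:

library(GenomicRanges)
## Two toy binnings of the same region, the second shifted by half a bin width:
bins0 <- GRanges('chr1', IRanges(start=seq(1, 10e6, by=1e6), width=1e6), counts=rpois(10, 50))
bins1 <- shift(bins0, 5e5)
bins1$counts <- rpois(10, 50)
binned.offsets <- GRangesList('0'=bins0, '500000'=bins1)
# model <- findCNVs(binned.offsets, ID='toy')   # accepted via the widened check.class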
... ...
@@ -63,7 +63,7 @@ findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1
 #'
 #' \code{univariate.findCNVs} classifies the binned read counts into several states which represent copy-number-variation.
 #'
-#' @param binned.data A \link{GRanges} object with binned read counts.
+#' @param binned.data A \code{\link{GRanges}} object with binned read counts. Alternatively a \code{\link{GRangesList}} object with offsetted read counts.
 #' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
 #' @param eps Convergence threshold for the Baum-Welch algorithm.
 #' @param init One of the following initialization procedures:
... ...
@@ -90,10 +90,16 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
 	on.exit(.C("C_univariate_cleanup", PACKAGE = 'AneuFinder'))

 	## Intercept user input
-  binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
+  binned.data <- loadFromFiles(binned.data, check.class=c('GRanges', 'GRangesList'))[[1]]
 	if (is.null(ID)) {
 		ID <- attr(binned.data, 'ID')
 	}
+  if (class(binned.data) == 'GRangesList') {
+    binned.data.list <- binned.data
+    binned.data <- binned.data.list[[1]]
+  } else if (class(binned.data) == 'GRanges') {
+    binned.data.list <- GRangesList('0'=binned.data)
+  }
 	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
 	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
 	if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
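
The intercept above normalizes both input types to a GRangesList ('binned.data.list'), wrapping a plain GRanges as a single offset named '0', so the code below can always iterate over offsets. A tiny sketch of that equivalence with a toy object:

library(GenomicRanges)
gr  <- GRanges('chr1', IRanges(start=c(1, 11), width=10), counts=c(3L, 7L))
grl <- GRangesList('0'=gr)   # what the GRanges branch above constructs
grl[[1]]                     # the same bins back; the offset loop then runs exactly once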
... ...
@@ -125,6 +131,17 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
 	warlist <- list()
 	if (num.trials==1) eps.try <- eps

+	### Make return object
+		result <- list()
+		class(result) <- class.univariate.hmm
+		result$ID <- ID
+		result$bins <- binned.data.list
+	## Quality info
+		result$qualityInfo <- as.list(getQC(binned.data.list))
+	## Convergence info
+		convergenceInfo <- list(eps=eps, loglik=NA, loglik.delta=NA, num.iterations=NA, time.sec=NA, error=NA)
+		result$convergenceInfo <- convergenceInfo
+
 	## Assign variables
 	inistates <- initializeStates(states)
 	state.labels <- inistates$states
... ...
@@ -132,8 +149,6 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
 	multiplicity <- inistates$multiplicity
 	dependent.states.mask <- (state.labels != 'zero-inflation') & (state.labels != '0-somy')
 	numstates <- length(states)
-	numbins <- length(binned.data)
-	iniproc <- which(init==c("standard","random")) # transform to int
 	if (strand=='+') {
 		select <- 'pcounts'
 	} else if (strand=='-') {
... ...
@@ -141,321 +156,391 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
 	} else if (strand=='*') {
 		select <- 'counts'
 	}
-	counts <- mcols(binned.data)[,select]
 	algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM'))
-
-	### Make return object
-		result <- list()
-		class(result) <- class.univariate.hmm
-		result$ID <- ID
-		result$bins <- binned.data
-	## Quality info
-		result$qualityInfo <- as.list(getQC(binned.data))
-	## Convergence info
-		convergenceInfo <- list(eps=eps, loglik=NA, loglik.delta=NA, num.iterations=NA, time.sec=NA, error=NA)
-		result$convergenceInfo <- convergenceInfo
-
-	# Check if there are counts in the data, otherwise HMM will blow up
-	if (any(is.na(counts))) {
-		stop(paste0("ID = ",ID,": NAs found in reads."))
-	}
-	if (!any(counts!=0)) {
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
-		result$warnings <- warlist
-		return(result)
-	} else if (any(counts<0)) {
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
-		result$warnings <- warlist
-		return(result)
-	}
-		
-
-	# Filter high counts out, makes HMM faster
-	count.cutoff <- quantile(counts, count.cutoff.quantile)
-	names.count.cutoff <- names(count.cutoff)
-	count.cutoff <- ceiling(count.cutoff)
-	mask <- counts > count.cutoff
-	counts[mask] <- count.cutoff
-	numfiltered <- length(which(mask))
-	if (numfiltered > 0) {
-		message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to enhance performance."))
-	}
 	
-	## Call univariate in a for loop to enable multiple trials
-	modellist <- list()
-	for (i_try in 1:num.trials) {
-		message(paste0("Trial ",i_try," / ",num.trials))
-
-		## Initial parameters
-		if (init == 'initial.params') {
-			A.initial <- initial.params$transitionProbs
-			proba.initial <- initial.params$startProbs
-			size.initial <- initial.params$distributions[,'size']
-			prob.initial <- initial.params$distributions[,'prob']
-			size.initial[is.na(size.initial)] <- 0
-			prob.initial[is.na(prob.initial)] <- 0
-		} else if (init == 'random') {
-			A.initial <- matrix(stats::runif(numstates^2), ncol=numstates)
-			A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")
-			proba.initial <- stats::runif(numstates)
-			# Distributions for dependent states
-			size.initial <- stats::runif(1, min=0, max=100) * cumsum(dependent.states.mask)
-			prob.initial <- stats::runif(1) * dependent.states.mask
-			# Assign initials for the 0-somy distribution
-			index <- which('0-somy'==state.labels)
-			size.initial[index] <- 1
-			prob.initial[index] <- 0.5
-		} else if (init == 'standard') {
-			A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
-			for (irow in 1:numstates) {
-				for (icol in 1:numstates) {
-					if (irow==icol) { A.initial[irow,icol] <- 0.9 }
-					else { A.initial[irow,icol] <- 0.1/(numstates-1) }
-				}
-			}
-			proba.initial <- rep(1/numstates, numstates)
-			## Set initial mean of most.frequent.state distribution to max of count histogram
-			max.counts <- as.integer(names(which.max(table(counts[counts>0]))))
-			divf <- max(multiplicity[most.frequent.state], 1)
-			mean.initial.monosomy <- max.counts/divf
-			var.initial.monosomy <- mean.initial.monosomy * 2
-# 			mean.initial.monosomy <- mean(counts[counts>0])/divf
-# 			var.initial.monosomy <- var(counts[counts>0])/divf
-			if (is.na(mean.initial.monosomy)) {
-				mean.initial.monosomy <- 1
-			}
-			if (is.na(var.initial.monosomy)) {
-				var.initial.monosomy <- mean.initial.monosomy + 1
-			}
-			if (mean.initial.monosomy >= var.initial.monosomy) {
-				mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
-				var.initial <- (mean.initial.monosomy+1) * cumsum(dependent.states.mask)
-				size.initial <- rep(0,numstates)
-				prob.initial <- rep(0,numstates)
-				mask <- dependent.states.mask
-				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
-				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
-			} else {
-				mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
-				var.initial <- var.initial.monosomy * cumsum(dependent.states.mask)
-				size.initial <- rep(0,numstates)
-				prob.initial <- rep(0,numstates)
-				mask <- dependent.states.mask
-				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
-				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
-			}
-			# Assign initials for the 0-somy distribution
-			index <- which('0-somy'==state.labels)
-			size.initial[index] <- 1
-			prob.initial[index] <- 0.5
-		}
-	
-		hmm <- .C("C_univariate_hmm",
-			counts = as.integer(counts), # int* O
-			num.bins = as.integer(numbins), # int* T
-			num.states = as.integer(numstates), # int* N
-			state.labels = as.integer(state.labels), # int* state_labels
-			size = double(length=numstates), # double* size
-			prob = double(length=numstates), # double* prob
-			num.iterations = as.integer(max.iter), #  int* maxiter
-			time.sec = as.integer(max.time), # double* maxtime
-			loglik.delta = as.double(eps.try), # double* eps
-			states = integer(length=numbins), # int* states
-			A = double(length=numstates*numstates), # double* A
-			proba = double(length=numstates), # double* proba
-			loglik = double(length=1), # double* loglik
-			weights = double(length=numstates), # double* weights
-			distr.type = as.integer(state.distributions), # int* distr_type
-			size.initial = as.vector(size.initial), # double* initial_size
-			prob.initial = as.vector(prob.initial), # double* initial_prob
-			A.initial = as.vector(A.initial), # double* initial_A
-			proba.initial = as.vector(proba.initial), # double* initial_proba
-			use.initial.params = as.logical(1), # bool* use_initial_params
-			num.threads = as.integer(num.threads), # int* num_threads
-			error = as.integer(0), # int* error (error handling)
-			count.cutoff = as.integer(count.cutoff), # int* count.cutoff
-			algorithm = as.integer(algorithm), # int* algorithm
-			PACKAGE = 'AneuFinder'
-		)
-
-		hmm$eps <- eps.try
-		if (num.trials > 1) {
-			if (hmm$loglik.delta > hmm$eps) {
-				warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n"))
-			}
-			# Store model in list
-			hmm$counts <- NULL
-			modellist[[as.character(i_try)]] <- hmm
-			init <- 'random'
-		} else if (num.trials == 1) {
-			if (hmm$loglik.delta > eps) {
-				warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
-			}
-		}
-	}
-
-	if (num.trials > 1) {
-
-		# Mathematically we should select the fit with highest loglikelihood. If we think the fit with the highest loglikelihood is incorrect, we should change the underlying model. However, this is very complex and we choose to select a fit that we think is (more) correct, although it has not the highest support given our (imperfect) model.
-		if (length(modellist)>1) {
-# 		  ## Get some metrics
-# 		  num.segs <- sapply(modellist, function(x) { length(rle(x$states)$values) })
-# 			logliks <- sapply(modellist,'[[','loglik')
-# 			bhattacharyya <- numeric(length=length(modellist))
-# 			sos <- numeric(length=length(modellist))
-# 			diff.most.frequent <- numeric(length=length(modellist))
-# 			ind.1somy <- which(state.labels=='1-somy')
-# 			for (i1 in 1:length(modellist)) {
-# 			  hmm <- modellist[[i1]]
-#   			x <- 0:max(max(counts), 500)
-#   			bhattacharyya[i1] <- -log(sum(sqrt(stats::dnbinom(x, size=hmm$size[ind.1somy], prob=hmm$prob[ind.1somy]) * stats::dnbinom(x, size=hmm$size[ind.1somy+1], prob=hmm$prob[ind.1somy+1]))))
-#   			sos[i1] <- sum( (counts - dnbinom.mean(size=hmm$size, prob=hmm$prob)[hmm$states]) ^ 2 )
-#   			diff.most.frequent[i1] <- 1 - hmm$weights[state.labels==most.frequent.state]
-# 			}
-# 			names(bhattacharyya) <- 1:length(modellist)
-# 			names(sos) <- 1:length(modellist)
-# 			names(diff.most.frequent) <- 1:length(modellist)
-# 			# Diff to most.frequen.state
-# 			df.select <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
-# 			## Get probability of finding value
-# 			num.segs <- ecdf(num.segs)(num.segs)
-# 			logliks <- 1-ecdf(logliks)(logliks)
-# 			bhattacharyya <- 1-ecdf(bhattacharyya)(bhattacharyya)
-# 			sos <- ecdf(sos)(sos)
-# 			diff.most.frequent <- ecdf(diff.most.frequent)(diff.most.frequent)
-# 			df.order <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
-# 			## Get sum for each tuple
-# 			ind <- rowSums(df.order)
-# 			## Select the one that has lowest sum
-# 			index2use <- which.min(ind)
-# 			result$dforder <- df.order
-# 			result$dfselect <- df.select
-# 			result$dfi <- index2use
-		  
-			## Select models where weight of most.frequent.state is at least half of that of actual most frequent state, then select model with highest loglik
-			logliks <- sapply(modellist,'[[','loglik')
-			df.weight <- as.data.frame(lapply(modellist, '[[', 'weights'))
-			names(df.weight) <- 1:length(modellist)
-			rownames(df.weight) <- state.labels
-			models2use <- df.weight[most.frequent.state,] / apply(df.weight, 2, max) > 0.5
-			models2use[is.na(models2use)] <- FALSE
-			if (any(models2use)) {
-				index2use <- names(which.max(logliks[models2use]))
-			} else {
-				index2use <- names(which.max(logliks))
-			}
-		} else {
-			index2use <- 1
-		}
-		hmm <- modellist[[index2use]]
-
-		# Check if size and prob parameter are correct
-		if (any(is.na(hmm$size) | is.nan(hmm$size) | is.infinite(hmm$size) | is.na(hmm$prob) | is.nan(hmm$prob) | is.infinite(hmm$prob))) {
-			hmm$error <- 3
-		} else {
-
-			# Rerun the HMM with different epsilon and initial parameters from trial run
-			message(paste0("Rerunning trial ",index2use," with eps = ",eps))
-			hmm <- .C("C_univariate_hmm",
-				counts = as.integer(counts), # int* O
-				num.bins = as.integer(numbins), # int* T
-				num.states = as.integer(numstates), # int* N
-				state.labels = as.integer(state.labels), # int* state_labels
-				size = double(length=numstates), # double* size
-				prob = double(length=numstates), # double* prob
-				num.iterations = as.integer(max.iter), #  int* maxiter
-				time.sec = as.integer(max.time), # double* maxtime
-				loglik.delta = as.double(eps), # double* eps
-				states = integer(length=numbins), # int* states
-				A = double(length=numstates*numstates), # double* A
-				proba = double(length=numstates), # double* proba
-				loglik = double(length=1), # double* loglik
-				weights = double(length=numstates), # double* weights
-				distr.type = as.integer(state.distributions), # int* distr_type
-				size.initial = as.vector(hmm$size), # double* initial_size
-				prob.initial = as.vector(hmm$prob), # double* initial_prob
-				A.initial = as.vector(hmm$A), # double* initial_A
-				proba.initial = as.vector(hmm$proba), # double* initial_proba
-				use.initial.params = as.logical(1), # bool* use_initial_params
-				num.threads = as.integer(num.threads), # int* num_threads
-				error = as.integer(0), # int* error (error handling)
-				count.cutoff = as.integer(count.cutoff), # int* count.cutoff
-				algorithm = as.integer(algorithm), # int* algorithm
-				PACKAGE = 'AneuFinder'
-			)
-		}
-
-	}
+  ### Arrays for finding maximum posterior for each bin between offsets
+  ## Make bins with offset
+  ptm <- startTimedMessage("Making bins with offsets ...")
+  if (length(binned.data.list) > 1) {
+    stepbins <- disjoin(unlist(binned.data.list, use.names=FALSE))
+  } else {
+    stepbins <- binned.data.list[[1]]
+    mcols(stepbins) <- NULL
+  }
+  amaxPosterior.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset'))) # to store maximum posterior for current and max-of-previous offsets
+  astates.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset'))) # to store states for current and max-of-previous offsets
+  stopTimedMessage(ptm)
+  
+  ### Loop over offsets ###
+  for (istep in 1:length(binned.data.list)) {
+    binned.data <- binned.data.list[[istep]]
+  	numbins <- length(binned.data)
+  	counts <- mcols(binned.data)[,select]
+    if (istep > 1) {
+      ptm.offset <- startTimedMessage("Obtaining states for step = ", istep, " ...")
+      ## Run only one iteration (no updating) if we are already over istep==1
+      initial.params <- result
+      init <- 'initial.params'
+    	algorithm <- factor('baumWelch', levels=c('baumWelch','viterbi','EM'))
+      num.trials <- 1
+    }

+  	# Check if there are counts in the data, otherwise HMM will blow up
+  	if (any(is.na(counts))) {
+  		stop(paste0("ID = ",ID,": NAs found in reads."))
+  	}
+  	if (!any(counts!=0)) {
+  		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
+  		result$warnings <- warlist
+  		return(result)
+  	} else if (any(counts<0)) {
+  		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
+  		result$warnings <- warlist
+  		return(result)
+  	}
+  		
+  	# Filter high counts out, makes HMM faster
+  	count.cutoff <- quantile(counts, count.cutoff.quantile)
+  	names.count.cutoff <- names(count.cutoff)
+  	count.cutoff <- ceiling(count.cutoff)
+  	mask <- counts > count.cutoff
+  	counts[mask] <- count.cutoff
+  	numfiltered <- length(which(mask))
+  	if (numfiltered > 0 & istep == 1) {
+  		message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to enhance performance."))
+  	}
+  	
+  	## Call univariate in a for loop to enable multiple trials
+  	modellist <- list()
+  	for (i_try in 1:num.trials) {
+  		message(paste0("Trial ",i_try," / ",num.trials))
+  
+  		## Initial parameters
+  		if (init == 'initial.params') {
+  			A.initial <- initial.params$transitionProbs
+  			proba.initial <- initial.params$startProbs
+  			size.initial <- initial.params$distributions[,'size']
+  			prob.initial <- initial.params$distributions[,'prob']
+  			size.initial[is.na(size.initial)] <- 0
+  			prob.initial[is.na(prob.initial)] <- 0
+  		} else if (init == 'random') {
+  			A.initial <- matrix(stats::runif(numstates^2), ncol=numstates)
+  			A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")
+  			proba.initial <- stats::runif(numstates)
+  			# Distributions for dependent states
+  			size.initial <- stats::runif(1, min=0, max=100) * cumsum(dependent.states.mask)
+  			prob.initial <- stats::runif(1) * dependent.states.mask
+  			# Assign initials for the 0-somy distribution
+  			index <- which('0-somy'==state.labels)
+  			size.initial[index] <- 1
+  			prob.initial[index] <- 0.5
+  		} else if (init == 'standard') {
+  			A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
+  			for (irow in 1:numstates) {
+  				for (icol in 1:numstates) {
+  					if (irow==icol) { A.initial[irow,icol] <- 0.9 }
+  					else { A.initial[irow,icol] <- 0.1/(numstates-1) }
+  				}
+  			}
+  			proba.initial <- rep(1/numstates, numstates)
+  			## Set initial mean of most.frequent.state distribution to max of count histogram
+  			max.counts <- as.integer(names(which.max(table(counts[counts>0]))))
+  			divf <- max(multiplicity[most.frequent.state], 1)
+  			mean.initial.monosomy <- max.counts/divf
+  			var.initial.monosomy <- mean.initial.monosomy * 2
+  # 			mean.initial.monosomy <- mean(counts[counts>0])/divf
+  # 			var.initial.monosomy <- var(counts[counts>0])/divf
+  			if (is.na(mean.initial.monosomy)) {
+  				mean.initial.monosomy <- 1
+  			}
+  			if (is.na(var.initial.monosomy)) {
+  				var.initial.monosomy <- mean.initial.monosomy + 1
+  			}
+  			if (mean.initial.monosomy >= var.initial.monosomy) {
+  				mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
+  				var.initial <- (mean.initial.monosomy+1) * cumsum(dependent.states.mask)
+  				size.initial <- rep(0,numstates)
+  				prob.initial <- rep(0,numstates)
+  				mask <- dependent.states.mask
+  				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
+  				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
+  			} else {
+  				mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
+  				var.initial <- var.initial.monosomy * cumsum(dependent.states.mask)
+  				size.initial <- rep(0,numstates)
+  				prob.initial <- rep(0,numstates)
+  				mask <- dependent.states.mask
+  				size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
+  				prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
+  			}
+  			# Assign initials for the 0-somy distribution
+  			index <- which('0-somy'==state.labels)
+  			size.initial[index] <- 1
+  			prob.initial[index] <- 0.5
+  		}
+  	
+  		hmm <- .C("C_univariate_hmm",
+  			counts = as.integer(counts), # int* O
+  			num.bins = as.integer(numbins), # int* T
+  			num.states = as.integer(numstates), # int* N
+  			state.labels = as.integer(state.labels), # int* state_labels
+  			size = double(length=numstates), # double* size
+  			prob = double(length=numstates), # double* prob
+  			num.iterations = as.integer(max.iter), #  int* maxiter
+  			time.sec = as.integer(max.time), # double* maxtime
+  			loglik.delta = as.double(eps.try), # double* eps
+  			maxPosterior = double(length=numbins), # double* maxPosterior
+  			states = integer(length=numbins), # int* states
+  			A = double(length=numstates*numstates), # double* A
+  			proba = double(length=numstates), # double* proba
+  			loglik = double(length=1), # double* loglik
+  			weights = double(length=numstates), # double* weights
+  			distr.type = as.integer(state.distributions), # int* distr_type
+  			size.initial = as.vector(size.initial), # double* initial_size
+  			prob.initial = as.vector(prob.initial), # double* initial_prob
+  			A.initial = as.vector(A.initial), # double* initial_A
+  			proba.initial = as.vector(proba.initial), # double* initial_proba
+  			use.initial.params = as.logical(1), # bool* use_initial_params
+  			num.threads = as.integer(num.threads), # int* num_threads
+  			error = as.integer(0), # int* error (error handling)
+  			count.cutoff = as.integer(count.cutoff), # int* count.cutoff
+  			algorithm = as.integer(algorithm), # int* algorithm
+  			PACKAGE = 'AneuFinder'
+  		)
+  
+  		hmm$eps <- eps.try
+  		if (num.trials > 1) {
+  			if (hmm$loglik.delta > hmm$eps & istep == 1) {
+  				warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n"))
+  			}
+  			# Store model in list
+  			hmm$counts <- NULL
+  			modellist[[as.character(i_try)]] <- hmm
+  			init <- 'random'
+  		} else if (num.trials == 1) {
+  			if (hmm$loglik.delta > eps & istep == 1) {
+  				warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
+  			}
+  		}
+  	}
+  
+  	if (num.trials > 1) {
+  
+  		# Mathematically we should select the fit with highest loglikelihood. If we think the fit with the highest loglikelihood is incorrect, we should change the underlying model. However, this is very complex and we choose to select a fit that we think is (more) correct, although it has not the highest support given our (imperfect) model.
+  		if (length(modellist)>1) {
+  # 		  ## Get some metrics
+  # 		  num.segs <- sapply(modellist, function(x) { length(rle(x$states)$values) })
+  # 			logliks <- sapply(modellist,'[[','loglik')
+  # 			bhattacharyya <- numeric(length=length(modellist))
+  # 			sos <- numeric(length=length(modellist))
+  # 			diff.most.frequent <- numeric(length=length(modellist))
+  # 			ind.1somy <- which(state.labels=='1-somy')
+  # 			for (i1 in 1:length(modellist)) {
+  # 			  hmm <- modellist[[i1]]
+  #   			x <- 0:max(max(counts), 500)
+  #   			bhattacharyya[i1] <- -log(sum(sqrt(stats::dnbinom(x, size=hmm$size[ind.1somy], prob=hmm$prob[ind.1somy]) * stats::dnbinom(x, size=hmm$size[ind.1somy+1], prob=hmm$prob[ind.1somy+1]))))
+  #   			sos[i1] <- sum( (counts - dnbinom.mean(size=hmm$size, prob=hmm$prob)[hmm$states]) ^ 2 )
+  #   			diff.most.frequent[i1] <- 1 - hmm$weights[state.labels==most.frequent.state]
+  # 			}
+  # 			names(bhattacharyya) <- 1:length(modellist)
+  # 			names(sos) <- 1:length(modellist)
+  # 			names(diff.most.frequent) <- 1:length(modellist)
+  # 			# Diff to most.frequen.state
+  # 			df.select <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
+  # 			## Get probability of finding value
+  # 			num.segs <- ecdf(num.segs)(num.segs)
+  # 			logliks <- 1-ecdf(logliks)(logliks)
+  # 			bhattacharyya <- 1-ecdf(bhattacharyya)(bhattacharyya)
+  # 			sos <- ecdf(sos)(sos)
+  # 			diff.most.frequent <- ecdf(diff.most.frequent)(diff.most.frequent)
+  # 			df.order <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
+  # 			## Get sum for each tuple
+  # 			ind <- rowSums(df.order)
+  # 			## Select the one that has lowest sum
+  # 			index2use <- which.min(ind)
+  # 			result$dforder <- df.order
+  # 			result$dfselect <- df.select
+  # 			result$dfi <- index2use
+  		  
+  			## Select models where weight of most.frequent.state is at least half of that of actual most frequent state, then select model with highest loglik
+  			logliks <- sapply(modellist,'[[','loglik')
+  			df.weight <- as.data.frame(lapply(modellist, '[[', 'weights'))
+  			names(df.weight) <- 1:length(modellist)
+  			rownames(df.weight) <- state.labels
+  			models2use <- df.weight[most.frequent.state,] / apply(df.weight, 2, max) > 0.5
+  			models2use[is.na(models2use)] <- FALSE
+  			if (any(models2use)) {
+  				index2use <- names(which.max(logliks[models2use]))
+  			} else {
+  				index2use <- names(which.max(logliks))
+  			}
+  		} else {
+  			index2use <- 1
+  		}
+  		hmm <- modellist[[index2use]]
+  
+  		# Check if size and prob parameter are correct
+  		if (any(is.na(hmm$size) | is.nan(hmm$size) | is.infinite(hmm$size) | is.na(hmm$prob) | is.nan(hmm$prob) | is.infinite(hmm$prob))) {
+  			hmm$error <- 3
+  		} else {
+  
+  			# Rerun the HMM with different epsilon and initial parameters from trial run
+  			message(paste0("Rerunning trial ",index2use," with eps = ",eps))
+  			hmm <- .C("C_univariate_hmm",
+  				counts = as.integer(counts), # int* O
+  				num.bins = as.integer(numbins), # int* T
+  				num.states = as.integer(numstates), # int* N
+  				state.labels = as.integer(state.labels), # int* state_labels
+  				size = double(length=numstates), # double* size
+  				prob = double(length=numstates), # double* prob
+  				num.iterations = as.integer(max.iter), #  int* maxiter
+  				time.sec = as.integer(max.time), # double* maxtime
+  				loglik.delta = as.double(eps), # double* eps
+    			maxPosterior = double(length=numbins), # double* maxPosterior
+  				states = integer(length=numbins), # int* states
+  				A = double(length=numstates*numstates), # double* A
+  				proba = double(length=numstates), # double* proba
+  				loglik = double(length=1), # double* loglik
+  				weights = double(length=numstates), # double* weights
+  				distr.type = as.integer(state.distributions), # int* distr_type
+  				size.initial = as.vector(hmm$size), # double* initial_size
+  				prob.initial = as.vector(hmm$prob), # double* initial_prob
+  				A.initial = as.vector(hmm$A), # double* initial_A
+  				proba.initial = as.vector(hmm$proba), # double* initial_proba
+  				use.initial.params = as.logical(1), # bool* use_initial_params
+  				num.threads = as.integer(num.threads), # int* num_threads
+  				error = as.integer(0), # int* error (error handling)
+  				count.cutoff = as.integer(count.cutoff), # int* count.cutoff
+  				algorithm = as.integer(algorithm), # int* algorithm
+  				PACKAGE = 'AneuFinder'
+  			)
+  		}
+  
+  	} # if (num.trials > 1)
+  	
+  	if (istep == 1) {
+    	### Make return object ###
+    	## Check for errors
+  		if (hmm$error == 0) {
+    	## Bin coordinates and states ###
+        result$bins <- binned.data
+    		result$bins$state <- state.labels[hmm$states]
+    		result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
+  		## Parameters
+  			# Weights
+  			result$weights <- hmm$weights
+  			names(result$weights) <- state.labels
+  			# Transition matrices
+  			transitionProbs <- matrix(hmm$A, ncol=hmm$num.states)
+  			rownames(transitionProbs) <- state.labels
+  			colnames(transitionProbs) <- state.labels
+  			result$transitionProbs <- transitionProbs
+  			transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states)
+  			rownames(transitionProbs.initial) <- state.labels
+  			colnames(transitionProbs.initial) <- state.labels
+  			result$transitionProbs.initial <- transitionProbs.initial
+  			# Initial probs
+  			result$startProbs <- hmm$proba
+  			names(result$startProbs) <- state.labels
+  			result$startProbs.initial <- hmm$proba.initial
+  			names(result$startProbs.initial) <- state.labels
+  			# Distributions
+  				distributions <- data.frame()
+  				distributions.initial <- data.frame()
+  				for (idistr in 1:length(hmm$distr.type)) {
+  					distr <- levels(state.distributions)[hmm$distr.type[idistr]]
+  					if (distr == 'dnbinom') {
+  						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
+  						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
+  					} else if (distr == 'dgeom') {
+  						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
+  						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
+  					} else if (distr == 'delta') {
+  						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
+  						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
+  					} else if (distr == 'dbinom') {
+  						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
+  						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
+  					}
+  				}
+  				rownames(distributions) <- state.labels
+  				rownames(distributions.initial) <- state.labels
+  				result$distributions <- distributions
+  				result$distributions.initial <- distributions.initial
+  		## Convergence info
+  			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error)
+  			result$convergenceInfo <- convergenceInfo
+  			
+  		} else if (hmm$error == 1) {
+  			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
+  		} else if (hmm$error == 2) {
+  			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
+  		} else if (hmm$error == 3) {
+  			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": NA/NaN/Inf in 'size' or 'prob' parameter detected. This is probably because your binned data contains too few bins."))
+  		}
+	    if (hmm$error != 0) {
+	      result$warlist <- warlist
+	      return(result)
+	    }
+  	} # if (istep == 1)
+    	
+    if (istep == 1) { ptm <- startTimedMessage("Collecting counts and posteriors ...") }
+    
+    ## Inflate posteriors, states, counts to new offset
+    ind <- findOverlaps(stepbins, binned.data)
+    astates.step[ind@from, 'currentOffset'] <- hmm$states[ind@to]
+    amaxPosterior.step[ind@from, 'currentOffset'] <- hmm$maxPosterior[ind@to]
+    
+    ## Find offset that maximizes the posteriors for each bin
+    ##-- Start stuff to call C code
+    # Work with changing dimensions to avoid copies being made
+    dim_amaxPosterior.step <- dim(amaxPosterior.step)
+    dimnames_amaxPosterior.step <- dimnames(amaxPosterior.step)
+    dim(amaxPosterior.step) <- NULL
+    z <- .C("C_array2D_which_max",
+            array2D = amaxPosterior.step,
+            dim = as.integer(dim_amaxPosterior.step),
+            ind_max = integer(dim_amaxPosterior.step[1]),
+            value_max = double(dim_amaxPosterior.step[1]))
+    dim(amaxPosterior.step) <- dim_amaxPosterior.step
+    dimnames(amaxPosterior.step) <- dimnames_amaxPosterior.step
+    ind <- z$ind_max
+    ##-- End stuff to call C code
+    for (i1 in 1:2) {
+      mask <- ind == i1
+      astates.step[mask, 'previousOffsets'] <- astates.step[mask,i1, drop=FALSE]
+      amaxPosterior.step[mask, 'previousOffsets'] <- amaxPosterior.step[mask,i1, drop=FALSE]
+    }
+    if (istep == 1) { stopTimedMessage(ptm) }
+    
+    if (istep > 1) {
+      stopTimedMessage(ptm.offset)
+    }
+    
+    rm(hmm, ind)
+  } # loop over offsets
+  states.step <- astates.step[, 'previousOffsets']
+  rm(amaxPosterior.step, astates.step); gc()
+        
 	### Make return object ###
-	## Check for errors
-		if (hmm$error == 0) {
-		## Bin coordinates and states ###
-			result$bins$state <- state.labels[hmm$states]
-			result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
-		## Segmentation
-			message("Making segmentation ...", appendLF=FALSE)
-			ptm <- proc.time()
-			suppressMessages(
-				result$segments <- as(collapseBins(as.data.frame(result$bins), column2collapseBy='state', columns2drop='width', columns2average=c('counts','mcounts','pcounts')), 'GRanges')
-			)
-			seqlevels(result$segments) <- seqlevels(result$bins) # correct order from as()
-			seqlengths(result$segments) <- seqlengths(binned.data)[names(seqlengths(result$segments))]
-			time <- proc.time() - ptm
-			message(" ",round(time[3],2),"s")
-		## Parameters
-			# Weights
-			result$weights <- hmm$weights
-			names(result$weights) <- state.labels
-			# Transition matrices
-			transitionProbs <- matrix(hmm$A, ncol=hmm$num.states)
-			rownames(transitionProbs) <- state.labels
-			colnames(transitionProbs) <- state.labels
-			result$transitionProbs <- transitionProbs
-			transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states)
-			rownames(transitionProbs.initial) <- state.labels
-			colnames(transitionProbs.initial) <- state.labels
-			result$transitionProbs.initial <- transitionProbs.initial
-			# Initial probs
-			result$startProbs <- hmm$proba
-			names(result$startProbs) <- state.labels
-			result$startProbs.initial <- hmm$proba.initial
-			names(result$startProbs.initial) <- state.labels
-			# Distributions
-				distributions <- data.frame()
-				distributions.initial <- data.frame()
-				for (idistr in 1:length(hmm$distr.type)) {
-					distr <- levels(state.distributions)[hmm$distr.type[idistr]]
-					if (distr == 'dnbinom') {
-						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
-					} else if (distr == 'dgeom') {
-						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
-					} else if (distr == 'delta') {
-						distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
-					} else if (distr == 'dbinom') {
-						distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
-						distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
-					}
-				}
-				rownames(distributions) <- state.labels
-				rownames(distributions.initial) <- state.labels
-				result$distributions <- distributions
-				result$distributions.initial <- distributions.initial
-		## Convergence info
-			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error)
-			result$convergenceInfo <- convergenceInfo
-		## Quality info
-  		result$qualityInfo <- as.list(getQC(result))
-		} else if (hmm$error == 1) {
-			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
-		} else if (hmm$error == 2) {
-			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
-		} else if (hmm$error == 3) {
-			warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": NA/NaN/Inf in 'size' or 'prob' parameter detected. This is probably because your binned data contains too few bins."))
-		}
+	## Bin coordinates and states ###
+    result$stepbins <- stepbins
+		result$stepbins$state <- state.labels[states.step]
+		result$stepbins$copy.number <- multiplicity[as.character(result$stepbins$state)]
+	## Segmentation
+		message("Making segmentation ...", appendLF=FALSE)
+		ptm <- proc.time()
+		suppressMessages(
+			result$segments <- as(collapseBins(as.data.frame(result$stepbins), column2collapseBy='state', columns2drop='width', columns2average=c('counts','mcounts','pcounts')), 'GRanges')
+		)
+		seqlevels(result$segments) <- seqlevels(result$stepbins) # correct order from as()
+		seqlengths(result$segments) <- seqlengths(binned.data)[names(seqlengths(result$segments))]
+		time <- proc.time() - ptm
+		message(" ",round(time[3],2),"s")
+	## Counts
+		result$counts <- binned.data.list
+	## Quality info
+		result$qualityInfo <- as.list(getQC(result))

 	## Issue warnings
 	result$warnings <- warlist
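
Taken together, the new flow is: fit the HMM on the first offset, decode every further offset with the fitted parameters (one baumWelch pass, num.trials=1), project each decoding onto the disjoint stepbins, and keep per step bin the state from whichever offset achieved the highest posterior. A minimal R sketch of that running maximum over the two bookkeeping columns (toy values; C_array2D_which_max computes the same row-wise which.max and max):

## Toy bookkeeping for 4 step bins:
amax   <- cbind(previousOffsets=c(0.9, 0.4, 0.7, 0.2),
                currentOffset  =c(0.6, 0.8, 0.7, 0.5))
astate <- cbind(previousOffsets=c(2L, 1L, 3L, 2L),
                currentOffset  =c(3L, 2L, 3L, 4L))
ind  <- apply(amax, 1, which.max)            # per-bin winning offset (1-based)
keep <- cbind(seq_len(nrow(amax)), ind)
astate[, 'previousOffsets'] <- astate[keep]  # carry the winning state forward
amax[, 'previousOffsets']   <- amax[keep]    # and its posterior, for the next offset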
... ...
@@ -911,7 +996,11 @@ DNAcopy.findCNVs <- function(binned.data, ID=NULL, CNgrid.start=1.5, count.cutof
         }
     }
   	## Intercept user input
-    binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
+    binned.data <- loadFromFiles(binned.data, check.class=c('GRanges', 'GRangesList'))[[1]]
+    if (class(binned.data) == 'GRangesList') {
+      binned.data.list <- binned.data
+      binned.data <- binned.data.list[[1]]
+    }
   	if (is.null(ID)) {
     		ID <- attr(binned.data, 'ID')
   	}
... ...
@@ -2,9 +2,9 @@
 #'
 #' Wrapper to load \pkg{\link{AneuFinder}} objects from file and check the class of the loaded objects.
 #'
-#' @param files A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.
-#' @param check.class Any combination of \code{c('GRanges', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.
-#' @return A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
+#' @param files A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.
+#' @param check.class Any combination of \code{c('GRanges', 'GRangesList', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.
+#' @return A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
 #' @export
 #' @examples
 #'## Get some files that you want to load
... ...
@@ -14,15 +14,15 @@
 #'hmms <- loadFromFiles(files[1:10])
 #'lapply(hmms, plot, type='profile')
 #'
-loadFromFiles <- function(files, check.class=c('GRanges', 'aneuHMM', 'aneuBiHMM')) {
+loadFromFiles <- function(files, check.class=c('GRanges', 'GRangesList', 'aneuHMM', 'aneuBiHMM')) {

     # ptm <- startTimedMessage("Loading data from files ...")
     if (is.null(files)) {
         # stopTimedMessage(ptm)
         return(files)
     }
-    if (any(! check.class %in% c('GRanges', class.univariate.hmm, class.bivariate.hmm))) {
-        stop("Argument 'check.class' must contain any combination of c('", paste0(c('GRanges', class.univariate.hmm, class.bivariate.hmm), collapse="', '"), "').")
+    if (any(! check.class %in% c('GRanges', 'GRangesList', class.univariate.hmm, class.bivariate.hmm))) {
+        stop("Argument 'check.class' must contain any combination of c('", paste0(c('GRanges', 'GRangesList', class.univariate.hmm, class.bivariate.hmm), collapse="', '"), "').")
     }
     modellist <- list()
     if (is.character(files)) {
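
A short usage sketch for the widened check.class (the file path below is illustrative, not from the commit):

## Loading from disk, restricted to binned data (plain or offset binnings):
# binned <- loadFromFiles('binned/cell-1.RData', check.class=c('GRanges', 'GRangesList'))[[1]]
## Objects pass straight through, wrapped in a list (hence the [[1]] used by the callers above):
gr <- GenomicRanges::GRanges('chr1', IRanges::IRanges(1, 100))
loadFromFiles(gr, check.class='GRanges')[[1]]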
... ...
@@ -114,7 +114,7 @@ getQC <- function(models) {
             return(x)
         }
     }
-  	models <- suppressMessages( loadFromFiles(models, check.class=c('GRanges', class.univariate.hmm, class.bivariate.hmm)) )
+  	models <- suppressMessages( loadFromFiles(models, check.class=c('GRanges', 'GRangesList', class.univariate.hmm, class.bivariate.hmm)) )
   	qframe <- list()
   	for (i1 in 1:length(models)) {
     		model <- models[[i1]]
... ...
@@ -144,6 +144,19 @@ getQC <- function(models) {
                                 				bhattacharyya=NA,
                                 				sos=NA
                                 				)
+    		} else if (class(model) == 'GRangesList') {
+    		    bins <- model[[1]]
+        		qframe[[i1]] <- data.frame( total.read.count=sum(bins$counts),
+                                				avg.binsize=mean(width(bins)),
+                                				avg.read.count=mean(bins$counts),
+                                				spikiness=qc.spikiness(bins$counts),
+                                				entropy=qc.entropy(bins$counts),
+                                				complexity=null2na(attr(model,'qualityInfo')$complexity[1]),
+                                				loglik=NA,
+                                				num.segments=NA,
+                                				bhattacharyya=NA,
+                                				sos=NA
+                                				)
     		}
   	}
   	names(qframe) <- names(models)
... ...
@@ -11,7 +11,7 @@ bivariate.findCNVs(binned.data, ID = NULL, eps = 0.1, init = "standard",
   most.frequent.state = "1-somy", algorithm = "EM", initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively a \code{\link{GRangesList}} object with offsetted read counts.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

... ...
@@ -12,7 +12,7 @@ findCNVs(binned.data, ID = NULL, eps = 0.1, init = "standard",
   initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively a \code{\link{GRangesList}} object with offsetted read counts.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

... ...
@@ -60,7 +60,7 @@ An \code{\link{aneuHMM}} object.
 \examples{
 ## Get an example BED file with single-cell-sequencing reads
 bedfile <- system.file("extdata", "KK150311_VI_07.bam.bed.gz", package="AneuFinderData")
-## Bin the BAM file into bin size 1Mp
+## Bin the data into bin size 1Mp
 binned <- binReads(bedfile, assembly='mm10', binsize=1e6,
                   chromosomes=c(1:19,'X','Y'))
 ## Fit the Hidden Markov Model
... ...
@@ -12,7 +12,7 @@ findCNVs.strandseq(binned.data, ID = NULL, eps = 0.1, init = "standard",
   initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively a \code{\link{GRangesList}} object with offsetted read counts.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

... ...
@@ -4,15 +4,16 @@
 \alias{loadFromFiles}
 \title{Load \pkg{AneuFinder} objects from file}
 \usage{
-loadFromFiles(files, check.class = c("GRanges", "aneuHMM", "aneuBiHMM"))
+loadFromFiles(files, check.class = c("GRanges", "GRangesList", "aneuHMM",
+  "aneuBiHMM"))
 }
 \arguments{
-\item{files}{A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.}
+\item{files}{A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.}

-\item{check.class}{Any combination of \code{c('GRanges', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.}
+\item{check.class}{Any combination of \code{c('GRanges', 'GRangesList', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.}
 }
 \value{
-A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
+A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
 }
 \description{
 Wrapper to load \pkg{\link{AneuFinder}} objects from file and check the class of the loaded objects.
... ...
@@ -30,8 +30,7 @@ A \code{\link{GRanges}} object with reads.
 Merge strand libraries to generate a high-coverage Strand-seq library.
 }
 \examples{
-files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
-reads <- mergeStrandseqFiles(files, assembly='hg38')
-
+#files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
+#reads <- mergeStrandseqFiles(files, assembly='hg38')
 }

... ...
@@ -11,7 +11,7 @@ univariate.findCNVs(binned.data, ID = NULL, eps = 0.1, init = "standard",
   most.frequent.state = "2-somy", algorithm = "EM", initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively a \code{\link{GRangesList}} object with offsetted read counts.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

... ...
@@ -9,7 +9,7 @@ static double** multiD;
 // ===================================================================================================================================================
 // This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the EM and returns the result to R.
 // ===================================================================================================================================================
-void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm)
+void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm)
 {

 	// Define logging level
... ...
@@ -137,16 +137,16 @@ void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, dou
 		else { *error = 2; }
 	}

-// 	// Compute the posteriors and save results directly to the R pointer
-// 	//FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
-// 	#pragma omp parallel for
-// 	for (int iN=0; iN<*N; iN++)
-// 	{
-// 		for (int t=0; t<*T; t++)
-// 		{
-// 			posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
-// 		}
-// 	}
+	// // Compute the posteriors and save results directly to the R pointer
+	// //FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
+	// #pragma omp parallel for
+	// for (int iN=0; iN<*N; iN++)
+	// {
+	// 	for (int t=0; t<*T; t++)
+	// 	{
+	// 		posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
+	// 	}
+	// }

 	// Compute the states from posteriors
 	//FILE_LOG(logDEBUG1) << "Computing states from posteriors";
... ...
@@ -160,6 +160,7 @@ void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, dou
 		}
 		ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
 		states[t] = state_labels[ind_max];
+		maxPosterior[t] = posterior_per_t[ind_max];
 	}

 	//FILE_LOG(logDEBUG1) << "Return parameters";
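
The new maxPosterior buffer records, for each bin, the posterior of the winning state alongside the state itself. In R terms the decoding loop above amounts to (toy posterior matrix):

post <- matrix(c(0.1, 0.6, 0.3,
                 0.7, 0.2, 0.1), nrow=2, byrow=TRUE)  # bins x states, toy values
apply(post, 1, which.max)   # -> states[t], up to the state_labels mapping
apply(post, 1, max)         # -> maxPosterior[t]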
... ...
@@ -365,3 +366,23 @@ void multivariate_cleanup(int* N)
 	FreeDoubleMatrix(multiD, *N);
 }

+
+// ====================================================================
+// C version of apply(array2D, 1, which.max) and apply(array2D, 1, max)
+// ====================================================================
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max)
+{
+  // array2D is actually a vector, but is intended to originate from a 2D array in R
+	std::vector<double> value_per_i0(dim[1]);
+  for (int i0=0; i0<dim[0]; i0++)
+  {
+    for (int i1=0; i1<dim[1]; i1++)
+    {
+			value_per_i0[i1] = array2D[i1 * dim[0] + i0];
+      // Rprintf("i0=%d, i1=%d, value_per_i0[%d] = %g\n", i0, i1, i1, value_per_i0[i1]);
+    }
+		ind_max[i0] = 1 + std::distance(value_per_i0.begin(), std::max_element(value_per_i0.begin(), value_per_i0.end()));
+    value_max[i0] = *std::max_element(value_per_i0.begin(), value_per_i0.end());
+  }
+	
+}
\ No newline at end of file
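
As the comment says, this helper mirrors apply(x, 1, which.max) and apply(x, 1, max); the '1 +' makes ind_max 1-based so R can use it directly for subsetting, and the column-major index 'i1 * dim[0] + i0' matches how R stores a matrix whose dim() the caller strips before the .C call. A toy equivalence check in R:

a <- matrix(c(0.1, 0.8,
              0.5, 0.4), nrow=2, byrow=TRUE)
apply(a, 1, which.max)   # 2 1     -> ind_max
apply(a, 1, max)         # 0.8 0.5 -> value_max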
... ...
@@ -13,7 +13,7 @@
 // #endif

 extern "C"
-void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm);
+void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm);

 extern "C"
 void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm);
... ...
@@ -24,3 +24,5 @@ void univariate_cleanup();
 extern "C"
 void multivariate_cleanup(int* N);

+extern "C"
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max);
... ...
@@ -3,15 +3,17 @@
 #include "R_interface.h"


-R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
+R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
 R_NativePrimitiveArgType arg2[] = {REALSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
 R_NativePrimitiveArgType arg4[] = {INTSXP};
+R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP, REALSXP};

 static const R_CMethodDef CEntries[]  = {
-    {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 24, arg1},
+    {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 25, arg1},
     {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 18, arg2},
     {"C_univariate_cleanup", (DL_FUNC) &univariate_cleanup, 0, NULL},
     {"C_multivariate_cleanup", (DL_FUNC) &multivariate_cleanup, 1, arg4},
+    {"C_array2D_which_max", (DL_FUNC) &array2D_which_max, 4, arg5},
     {NULL, NULL, 0, NULL}
 };
17 19