Browse code

stepsize runs with Aneufinder(strandseq=TRUE)

chakalakka authored on 26/05/2017 11:27:19
Showing 10 changed files

... ...
@@ -2,10 +2,16 @@ CHANGES IN VERSION 1.5.1
2 2
 ------------------------
3 3
 
4 4
 SIGNIFICANT USER-LEVEL CHANGES
5
+
6
+    o GC correction is now done with a loess-fit by default instead of the quadratic fit. This should improve accuracy.
7
+    
8
+    o A stepsize for a sliding window can be selected in addition to the binsize. This improves resolution of detected breakpoints.
9
+    
10
+    o Breakpoints are reported with confidence intervals in folder BROWSERFILES.
5 11
     
6 12
 BUG FIXES
7 13
 
8
-    o Can use BSgenome*NCBI* for GC correction now (worked only for BSgenome*UCSC* before).
14
+    o Can use BSgenome*NCBI* for GC correction (worked only for BSgenome*UCSC* before).
9 15
     
10 16
     
11 17
 CHANGES IN VERSION 1.3.4
... ...
@@ -150,30 +150,3 @@ mergeStrandseqFiles <- function(files, assembly, chromosomes=NULL, pairedEndRead
150 150
     return(reads)
151 151
   
152 152
 }
153
-
154
-
155
-createBlacklistFromReads <- function(reads, binsize=1e5, quantile.cutoff=0.99) {
156
-  
157
-    bins <- binReads(reads, binsizes = binsize)[[1]]
158
-    # bins.split <- split(bins, bins@seqnames)
159
-    # blacklist.list <- GRangesList()
160
-    # for (i1 in 1:length(bins.split)) {
161
-    #     cutoff <- quantile(bins.split[[i1]]$mcounts, quantile.cutoff)
162
-    #     blacklist.list[[i1]] <- bins.split[[i1]][bins.split[[i1]]$mcounts > cutoff]
163
-    # }
164
-    # blacklist <- unlist(blacklist.list, use.names=FALSE)
165
-    cutoff <- quantile(bins$mcounts, quantile.cutoff)
166
-    blacklist <- bins[bins$mcounts >= cutoff]
167
-    # print(as.data.frame(blacklist))
168
-    blacklist <- reduce(blacklist)
169
-    # print(as.data.frame(blacklist))
170
-    frac <- sum(as.numeric(width(blacklist))) / sum(as.numeric(seqlengths(blacklist)))
171
-    message("Fraction of genome in blacklist: ", round(frac,4))
172
-    
173
-    ggplt <- plotProfile(bins, both.strands=TRUE)
174
-    ggplt <- ggplt + geom_hline(data=data.frame(y=-cutoff), mapping=aes_string(yintercept='y'), col='red')
175
-    print(ggplt)
176
-    
177
-    return(blacklist)
178
-
179
-}
... ...
@@ -528,7 +528,7 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
528 528
 		result$bins$state <- state.labels[states.step]
529 529
 		result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
530 530
 	## Counts
531
-		result$binned.data <- binned.data.list
531
+		result$bincounts <- binned.data.list
532 532
 	## Segmentation
533 533
 		message("Making segmentation ...", appendLF=FALSE)
534 534
 		ptm <- proc.time()
... ...
@@ -563,10 +563,16 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
563 563
 bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, count.cutoff.quantile=0.999, states=c("zero-inflation",paste0(0:10,"-somy")), most.frequent.state="1-somy", algorithm='EM', initial.params=NULL, verbosity=1) {
564 564
 
565 565
 	## Intercept user input
566
-  binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
566
+  binned.data <- loadFromFiles(binned.data, check.class=c('GRanges','GRangesList'))[[1]]
567 567
 	if (is.null(ID)) {
568 568
 		ID <- attr(binned.data, 'ID')
569 569
 	}
570
+  if (class(binned.data) == 'GRangesList') {
571
+    binned.data.list <- binned.data
572
+    binned.data <- binned.data.list[[1]]
573
+  } else if (class(binned.data) == 'GRanges') {
574
+    binned.data.list <- GRangesList('0'=binned.data)
575
+  }
570 576
 	if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
571 577
 	if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
572 578
 	if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
... ...
@@ -585,392 +591,471 @@ bivariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", m
585 591
 	if (!is.null(initial.params)) {
586 592
 		init <- 'initial.params'
587 593
 	}
588
-
589
-	warlist <- list()
590
-	if (is.null(eps.try)) eps.try <- eps
591
-
594
+  	
592 595
 	## Variables
593
-	num.bins <- length(binned.data)
594 596
 	algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM'))
595
-
596
-	## Get counts
597
-	select <- 'counts'
598
-	counts <- matrix(c(mcols(binned.data)[,paste0('m',select)], mcols(binned.data)[,paste0('p',select)]), ncol=2, dimnames=list(bin=1:num.bins, strand=c('minus','plus')))
599
-	maxcounts <- max(counts)
600
-
601
-	## Filter high counts out, makes HMM faster
602
-	count.cutoff <- quantile(counts, count.cutoff.quantile)
603
-	names.count.cutoff <- names(count.cutoff)
604
-	count.cutoff <- ceiling(count.cutoff)
605
-	mask <- counts > count.cutoff
606
-	counts[mask] <- count.cutoff
607
-	numfiltered <- length(which(mask))
608
-	if (numfiltered > 0) {
609
-		message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to enhance performance."))
610
-	}
611 597
 	
612 598
 	### Make return object
613
-		result <- list()
614
-		class(result) <- class.bivariate.hmm
615
-		result$ID <- ID
616
-		result$bins <- binned.data
599
+	result <- list()
600
+	class(result) <- class.bivariate.hmm
601
+	result$ID <- ID
602
+	result$bins <- binned.data
617 603
 	## Quality info
618
-		result$qualityInfo <- as.list(getQC(binned.data))
604
+	result$qualityInfo <- as.list(getQC(binned.data))
619 605
 
620
-	# Check if there are counts in the data, otherwise HMM will blow up
621
-	if (!any(counts!=0)) {
622
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
623
-		result$warnings <- warlist
624
-		return(result)
625
-	} else if (any(counts<0)) {
626
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
627
-		result$warnings <- warlist
628
-		return(result)
629
-	}
606
+	warlist <- list()
607
+	if (is.null(eps.try)) eps.try <- eps
630 608
 
631
-	if (init=='initial.params') {
632
-		uni.transitionProbs <- initial.params$univariateParams$transitionProbs
633
-		uni.startProbs <- initial.params$univariateParams$startProbs
634
-		distributions <- initial.params$distributions
635
-		uni.weights <- initial.params$univariateParams$weights
636
-		uni.states <- names(uni.weights)
637
-		num.uni.states <- length(uni.states)
638
-		num.models <- length(distributions)
639
-		num.bins <- length(initial.params$bins)
640
-		comb.states <- factor(names(initial.params$startProbs), levels=names(initial.params$startProbs))
641
-		num.comb.states <- length(comb.states)
642
-		states.list <- list(minus=initial.params$bins$mstate, plus=initial.params$bins$pstate)
643
-		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
644
-		A.initial <- initial.params$transitionProbs
645
-		proba.initial <- initial.params$startProbs
646
-		use.initial <- TRUE
647
-	} else {
648
-		### Stack the strands and run one univariate findCNVs
649
-		message("")
650
-		message(paste(rep('-',getOption('width')), collapse=''))
651
-		binned.data.minus <- binned.data
652
-		strand(binned.data.minus) <- '-'
653
-		binned.data.minus$counts <- counts[,'minus', drop=FALSE]
654
-		binned.data.plus <- binned.data
655
-		strand(binned.data.plus) <- '+'
656
-		binned.data.plus$counts <- counts[,'plus', drop=FALSE]
657
-		binned.data.stacked <- c(binned.data.minus, binned.data.plus)
658
-		mask.attributes <- c('qualityInfo', 'ID', 'min.mapq')
659
-		attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes]
609
+  ### Arrays for finding maximum posterior for each bin between offsets
610
+  ## Make bins with offset
611
+  ptm <- startTimedMessage("Making bins with offsets ...")
612
+  if (length(binned.data.list) > 1) {
613
+    stepbins <- disjoin(unlist(binned.data.list, use.names=FALSE))
614
+  } else {
615
+    stepbins <- binned.data.list[[1]]
616
+    mcols(stepbins) <- NULL
617
+  }
618
+  amaxPosterior.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset'))) # to store maximum posterior for current and max-of-previous offsets
619
+  astates.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset'))) # to store states for current and max-of-previous offsets
620
+  stopTimedMessage(ptm)
621
+  
622
+  ### Loop over offsets ###
623
+  for (istep in 1:length(binned.data.list)) {
624
+    binned.data <- binned.data.list[[istep]]
625
+  	num.bins <- length(binned.data)
626
+    if (istep > 1) {
627
+      ptm.offset <- startTimedMessage("Obtaining states for step = ", istep, " ...")
628
+      ## Run only one iteration (no updating) if we are already over istep==1
629
+      initial.params <- result
630
+      init <- 'initial.params'
631
+    	algorithm <- factor('baumWelch', levels=c('baumWelch','viterbi','EM'))
632
+      num.trials <- 1
633
+      verbosity <- 0
634
+    }
635
+
636
+  	## Get counts
637
+  	select <- 'counts'
638
+  	counts <- matrix(c(mcols(binned.data)[,paste0('m',select)], mcols(binned.data)[,paste0('p',select)]), ncol=2, dimnames=list(bin=1:num.bins, strand=c('minus','plus')))
639
+  	maxcounts <- max(counts)
640
+  
641
+  	## Filter high counts out, makes HMM faster
642
+  	count.cutoff <- quantile(counts, count.cutoff.quantile)
643
+  	names.count.cutoff <- names(count.cutoff)
644
+  	count.cutoff <- ceiling(count.cutoff)
645
+  	mask <- counts > count.cutoff
646
+  	counts[mask] <- count.cutoff
647
+  	numfiltered <- length(which(mask))
648
+  	if (numfiltered > 0 & istep == 1) {
649
+  		message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to enhance performance."))
650
+  	}
651
+  
652
+  	# Check if there are counts in the data, otherwise HMM will blow up
653
+  	if (!any(counts!=0)) {
654
+  		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
655
+  		result$warnings <- warlist
656
+  		return(result)
657
+  	} else if (any(counts<0)) {
658
+  		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
659
+  		result$warnings <- warlist
660
+  		return(result)
661
+  	}
660 662
 
661
-		# if (method == 'HMM') {
663
+  	if (init=='initial.params') {
664
+  		uni.transitionProbs <- initial.params$univariateParams$transitionProbs
665
+  		uni.startProbs <- initial.params$univariateParams$startProbs
666
+  		distributions <- initial.params$distributions
667
+  		uni.weights <- initial.params$univariateParams$weights
668
+  		uni.states <- names(uni.weights)
669
+  		num.uni.states <- length(uni.states)
670
+  		num.models <- length(distributions)
671
+  		comb.states <- factor(names(initial.params$startProbs), levels=names(initial.params$startProbs))
672
+  		num.comb.states <- length(comb.states)
673
+  		states.list <- list(minus=initial.params$bins$mstate, plus=initial.params$bins$pstate)
674
+  		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
675
+  		A.initial <- initial.params$transitionProbs
676
+  		proba.initial <- initial.params$startProbs
677
+  		use.initial <- TRUE
678
+  	} else {
679
+  		### Stack the strands and run one univariate findCNVs
680
+  		message("")
681
+  		message(paste(rep('-',getOption('width')), collapse=''))
682
+  		binned.data.minus <- binned.data
683
+  		strand(binned.data.minus) <- '-'
684
+  		binned.data.minus$counts <- counts[,'minus', drop=FALSE]
685
+  		binned.data.plus <- binned.data
686
+  		strand(binned.data.plus) <- '+'
687
+  		binned.data.plus$counts <- counts[,'plus', drop=FALSE]
688
+  		binned.data.stacked <- c(binned.data.minus, binned.data.plus)
689
+  		mask.attributes <- c('qualityInfo', 'ID', 'min.mapq')
690
+  		attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes]
691
+  
662 692
   		message("Running univariate HMM")
663 693
   		model.stacked <- univariate.findCNVs(binned.data.stacked, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, count.cutoff.quantile=1, states=states, most.frequent.state=most.frequent.state)
664 694
   		if (is.na(model.stacked$convergenceInfo$error)) {
665 695
   		    result$warnings <- model.stacked$warnings
666 696
   		    return(result)
667 697
   		}
668
-# 		} else if (method == 'dnacopy') {
669
-#   		message("Running DNAcopy")
670
-#   		model.stacked <- DNAcopy.findCNVs(binned.data.stacked, ID, CNgrid.start=0.5, count.cutoff.quantile=1)
671
-# 		}
672
-		model.minus <- model.stacked
673
-		model.minus$bins <- model.minus$bins[strand(model.minus$bins)=='-']
674
-		model.minus$segments <- model.minus$segments[strand(model.minus$segments)=='-']
675
-		model.plus <- model.stacked
676
-		model.plus$bins <- model.plus$bins[strand(model.plus$bins)=='+']
677
-		model.plus$segments <- model.plus$segments[strand(model.plus$segments)=='+']
678
-
679
-		models <- list(minus=model.minus, plus=model.plus)
680
-
681
-		## Extract counts and other stuff
682
-		uni.transitionProbs <- lapply(models, '[[', 'transitionProbs')[[1]]
683
-		uni.startProbs <- lapply(models, '[[', 'startProbs')[[1]]
684
-		distributions <- lapply(models, '[[', 'distributions')
685
-		uni.weights <- lapply(models, '[[', 'weights')[[1]]
686
-		uni.states <- names(uni.weights)
687
-		num.uni.states <- length(uni.states)
688
-		num.models <- length(distributions)
689
-		num.bins <- length(models[[1]]$bins)
690
-		comb.states <- vector()
691
-		levels.state <- levels(models[[1]]$bins$state)
692
-		for (i1 in 1:length(levels.state)) {
693
-			for (i2 in 1:length(levels.state)) {
694
-				comb.state <- paste(levels.state[i1], levels.state[i2])
695
-				comb.states[length(comb.states)+1] <- comb.state
696
-			}
697
-		}
698
-		comb.states <- factor(comb.states, levels=comb.states)
699
-		num.comb.states <- length(comb.states)
700
-		states.list <- lapply(models, function(x) { x$bins$state })
701
-		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
702
-		A.initial <- double(length=num.comb.states*num.comb.states)
703
-		proba.initial <- double(length=num.comb.states)
704
-		use.initial <- FALSE
705
-	}
706
-
707
-	### Prepare the multivariate HMM
708
-	message("")
709
-	message(paste(rep('-',getOption('width')), collapse=''))
710
-	message("Preparing bivariate HMM\n")
711
-
712
-	## Pre-compute z-values for each number of counts
713
-	ptm <- startTimedMessage("Computing pre z-matrix...")
714
-	z.per.count <- array(NA, dim=c(maxcounts+1, num.models, num.uni.states), dimnames=list(counts=0:maxcounts, strand=names(distributions), state=uni.states))
715
-	xcounts <- 0:maxcounts
716
-	for (istrand in 1:num.models) {
717
-		for (istate in 1:num.uni.states) {
718
-			if (distributions[[istrand]][istate,'type']=='dnbinom') {
719
-				size <- distributions[[istrand]][istate,'size']
720
-				prob <- distributions[[istrand]][istate,'prob']
721
-				u <- stats::pnbinom(xcounts, size, prob)
722
-			} else if (distributions[[istrand]][istate,'type']=='dbinom') {
723
-				size <- distributions[[istrand]][istate,'size']
724
-				prob <- distributions[[istrand]][istate,'prob']
725
-				u <- stats::pbinom(xcounts, size, prob)
726
-			} else if (distributions[[istrand]][istate,'type']=='delta') {
727
-				u <- rep(1, length(xcounts))
728
-			} else if (distributions[[istrand]][istate,'type']=='dgeom') {
729
-				prob <- distributions[[istrand]][istate,'prob']
730
-				u <- stats::pgeom(xcounts, prob)
731
-			}
732
-			qnorm_u <- stats::qnorm(u)
733
-			mask <- qnorm_u==Inf
734
-			qnorm_u[mask] <- stats::qnorm(1-1e-16)
735
-			z.per.count[ , istrand, istate] <- qnorm_u
736
-		}
737
-	}
738
-	stopTimedMessage(ptm)
739
-
740
-	## Compute the z matrix
741
-	ptm <- startTimedMessage("Transfering values into z-matrix...")
742
-	z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(distributions), state=uni.states))
743
-	for (istrand in 1:num.models) {
744
-		for (istate in 1:num.uni.states) {
745
-			z.per.bin[ , istrand, istate] = z.per.count[counts[,istrand]+1, istrand, istate]
698
+  		model.minus <- model.stacked
699
+  		model.minus$bins <- model.minus$bins[strand(model.minus$bins)=='-']
700
+  		model.minus$segments <- model.minus$segments[strand(model.minus$segments)=='-']
701
+  		model.plus <- model.stacked
702
+  		model.plus$bins <- model.plus$bins[strand(model.plus$bins)=='+']
703
+  		model.plus$segments <- model.plus$segments[strand(model.plus$segments)=='+']
704
+  
705
+  		models <- list(minus=model.minus, plus=model.plus)
706
+  
707
+  		## Extract counts and other stuff
708
+  		uni.transitionProbs <- lapply(models, '[[', 'transitionProbs')[[1]]
709
+  		uni.startProbs <- lapply(models, '[[', 'startProbs')[[1]]
710
+  		distributions <- lapply(models, '[[', 'distributions')
711
+  		uni.weights <- lapply(models, '[[', 'weights')[[1]]
712
+  		uni.states <- names(uni.weights)
713
+  		num.uni.states <- length(uni.states)
714
+  		num.models <- length(distributions)
715
+  		num.bins <- length(models[[1]]$bins)
716
+  		comb.states <- vector()
717
+  		levels.state <- levels(models[[1]]$bins$state)
718
+  		for (i1 in 1:length(levels.state)) {
719
+  			for (i2 in 1:length(levels.state)) {
720
+  				comb.state <- paste(levels.state[i1], levels.state[i2])
721
+  				comb.states[length(comb.states)+1] <- comb.state
722
+  			}
723
+  		}
724
+  		comb.states <- factor(comb.states, levels=comb.states)
725
+  		num.comb.states <- length(comb.states)
726
+  		states.list <- lapply(models, function(x) { x$bins$state })
727
+  		comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states))
728
+  		A.initial <- double(length=num.comb.states*num.comb.states)
729
+  		proba.initial <- double(length=num.comb.states)
730
+  		use.initial <- FALSE
731
+  	}
732
+  
733
+  	### Prepare the multivariate HMM
734
+  	if (verbosity >= 1 & istep == 1) {
735
+    	message("")
736
+    	message(paste(rep('-',getOption('width')), collapse=''))
737
+    	message("Preparing bivariate HMM\n")
738
+  	}
739
+  
740
+  	## Pre-compute z-values for each number of counts
741
+		if (verbosity >=1) {
742
+    	ptm <- startTimedMessage("Computing pre z-matrix...")
746 743
 		}
747
-	}
748
-	remove(z.per.count)
749
-	stopTimedMessage(ptm)
750
-
751
-	## Calculate correlation matrix
752
-	ptm <- startTimedMessage("Computing inverse of correlation matrix...")
753
-	correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(distributions), strand=names(distributions), comb.state=comb.states))
754
-	correlationMatrixInverse <- correlationMatrix
755
-	determinant <- rep(0, num.comb.states)
756
-	names(determinant) <- comb.states
757
-	usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states
758
-	names(usestateTF) <- comb.states
759
-	for (comb.state in comb.states) {
760
-		state <- strsplit(comb.state, ' ')[[1]]
761
-		mask <- which(comb.states.per.bin==comb.state)
762
-		# Subselect z
763
-		z.temp <- matrix(NA, ncol=num.models, nrow=length(mask))
764
-		for (istrand in 1:num.models) {
765
-			z.temp[,istrand] <- z.per.bin[mask, istrand, state[istrand]]
744
+  	z.per.count <- array(NA, dim=c(maxcounts+1, num.models, num.uni.states), dimnames=list(counts=0:maxcounts, strand=names(distributions), state=uni.states))
745
+  	xcounts <- 0:maxcounts
746
+  	for (istrand in 1:num.models) {
747
+  		for (istate in 1:num.uni.states) {
748
+  			if (distributions[[istrand]][istate,'type']=='dnbinom') {
749
+  				size <- distributions[[istrand]][istate,'size']
750
+  				prob <- distributions[[istrand]][istate,'prob']
751
+  				u <- stats::pnbinom(xcounts, size, prob)
752
+  			} else if (distributions[[istrand]][istate,'type']=='dbinom') {
753
+  				size <- distributions[[istrand]][istate,'size']
754
+  				prob <- distributions[[istrand]][istate,'prob']
755
+  				u <- stats::pbinom(xcounts, size, prob)
756
+  			} else if (distributions[[istrand]][istate,'type']=='delta') {
757
+  				u <- rep(1, length(xcounts))
758
+  			} else if (distributions[[istrand]][istate,'type']=='dgeom') {
759
+  				prob <- distributions[[istrand]][istate,'prob']
760
+  				u <- stats::pgeom(xcounts, prob)
761
+  			}
762
+  			qnorm_u <- stats::qnorm(u)
763
+  			mask <- qnorm_u==Inf
764
+  			qnorm_u[mask] <- stats::qnorm(1-1e-16)
765
+  			z.per.count[ , istrand, istate] <- qnorm_u
766
+  		}
767
+  	}
768
+		if (verbosity >=1) {
769
+    	stopTimedMessage(ptm)
766 770
 		}
767
-		temp <- tryCatch({
768
-			if (nrow(z.temp) > 1) {
769
-				correlationMatrix[,,comb.state] <- cor(z.temp)
770
-				determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
771
-				correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
772
-				usestateTF[comb.state] <- TRUE
773
-			} else {
774
-				correlationMatrix[,,comb.state] <- diag(num.models)
775
-				determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
776
-				correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
777
-				usestateTF[comb.state] <- TRUE
778
-			}
779
-			0
780
-		}, warning = function(war) {
781
-			1
782
-		}, error = function(err) {
783
-			1
784
-		})
785
-		if (temp!=0) {
786
-			correlationMatrix[,,comb.state] <- diag(num.models)
787
-			determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
788
-			correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
789
-			usestateTF[comb.state] <- TRUE
771
+  
772
+  	## Compute the z matrix
773
+		if (verbosity >=1) {
774
+    	ptm <- startTimedMessage("Transfering values into z-matrix...")
790 775
 		}
791
-	}
792
-	stopTimedMessage(ptm)
793
-
794
-	# Use only states with valid correlationMatrixInverse (all states are valid in this version)
795
-	correlationMatrix = correlationMatrix[,,usestateTF]
796
-	correlationMatrixInverse = correlationMatrixInverse[,,usestateTF]
797
-	comb.states = comb.states[usestateTF]
798
-	comb.states <- droplevels(comb.states)
799
-	determinant = determinant[usestateTF]
800
-	num.comb.states <- length(comb.states)
801
-
802
-	### Calculate multivariate densities for each state ###
803
-	ptm <- startTimedMessage("Calculating multivariate densities...")
804
-	densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states))
805
-	for (comb.state in comb.states) {
806
-		istate <- which(comb.state==comb.states)
807
-		state <- strsplit(comb.state, ' ')[[1]]
808
-		z.temp <- matrix(NA, ncol=num.models, nrow=num.bins)
809
-		product <- 1
810
-		for (istrand in 1:num.models) {
811
-			z.temp[,istrand] <- z.per.bin[, istrand, state[istrand]]
812
-			if (distributions[[istrand]][state[istrand],'type'] == 'dnbinom') {
813
-				size <- distributions[[istrand]][state[istrand],'size']
814
-				prob <- distributions[[istrand]][state[istrand],'prob']
815
-				product <- product * stats::dnbinom(counts[,istrand], size, prob)
816
-			} else if (distributions[[istrand]][state[istrand],'type'] == 'dgeom') {
817
-				prob <- distributions[[istrand]][state[istrand],'prob']
818
-				product <- product * stats::dgeom(counts[,istrand], prob)
819
-			} else if (distributions[[istrand]][state[istrand],'type'] == 'delta') {
820
-				product <- product * ifelse(counts[,istrand]==0, 1, 0)
821
-			}
776
+  	z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(distributions), state=uni.states))
777
+  	for (istrand in 1:num.models) {
778
+  		for (istate in 1:num.uni.states) {
779
+  			z.per.bin[ , istrand, istate] = z.per.count[counts[,istrand]+1, istrand, istate]
780
+  		}
781
+  	}
782
+  	remove(z.per.count)
783
+		if (verbosity >=1) {
784
+    	stopTimedMessage(ptm)
822 785
 		}
823
-		exponent <- -0.5 * apply( ( z.temp %*% (correlationMatrixInverse[ , , istate] - diag(num.models)) ) * z.temp, 1, sum)
824
-		exponent[is.nan(exponent)] <- 0
825
-		densities[,istate] <- product * determinant[istate]^(-0.5) * exp( exponent )
826
-	}
827
-	# Check if densities are > 1
828
-# 	if (any(densities>1)) stop("Densities > 1")
829
-# 	if (any(densities<0)) stop("Densities < 0")
830
-	densities[densities>1] <- 1
831
-	densities[densities<0] <- 0
832
-	# Check if densities are 0 everywhere in some bins
833
-	check <- which(apply(densities, 1, sum) == 0)
834
-	if (length(check)>0) {
835
-		if (check[1]==1) {
836
-			densities[1,] <- rep(1e-10, ncol(densities))
837
-			check <- check[-1]
786
+  
787
+  	## Calculate correlation matrix
788
+		if (verbosity >=1) {
789
+    	ptm <- startTimedMessage("Computing inverse of correlation matrix...")
838 790
 		}
839
-		for (icheck in check) {
840
-			densities[icheck,] <- densities[icheck-1,]
791
+  	correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(distributions), strand=names(distributions), comb.state=comb.states))
792
+  	correlationMatrixInverse <- correlationMatrix
793
+  	determinant <- rep(0, num.comb.states)
794
+  	names(determinant) <- comb.states
795
+  	usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states
796
+  	names(usestateTF) <- comb.states
797
+  	for (comb.state in comb.states) {
798
+  		state <- strsplit(comb.state, ' ')[[1]]
799
+  		mask <- which(comb.states.per.bin==comb.state)
800
+  		# Subselect z
801
+  		z.temp <- matrix(NA, ncol=num.models, nrow=length(mask))
802
+  		for (istrand in 1:num.models) {
803
+  			z.temp[,istrand] <- z.per.bin[mask, istrand, state[istrand]]
804
+  		}
805
+  		temp <- tryCatch({
806
+  			if (nrow(z.temp) > 1) {
807
+  				correlationMatrix[,,comb.state] <- cor(z.temp)
808
+  				determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
809
+  				correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
810
+  				usestateTF[comb.state] <- TRUE
811
+  			} else {
812
+  				correlationMatrix[,,comb.state] <- diag(num.models)
813
+  				determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
814
+  				correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
815
+  				usestateTF[comb.state] <- TRUE
816
+  			}
817
+  			0
818
+  		}, warning = function(war) {
819
+  			1
820
+  		}, error = function(err) {
821
+  			1
822
+  		})
823
+  		if (temp!=0) {
824
+  			correlationMatrix[,,comb.state] <- diag(num.models)
825
+  			determinant[comb.state] <- det( correlationMatrix[,,comb.state] )
826
+  			correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state])
827
+  			usestateTF[comb.state] <- TRUE
828
+  		}
829
+  	}
830
+		if (verbosity >=1) {
831
+    	stopTimedMessage(ptm)
841 832
 		}
842
-	}
843
-	stopTimedMessage(ptm)
844
-		
845
-	### Define cleanup behaviour ###
846
-	on.exit(.C("C_multivariate_cleanup", as.integer(num.comb.states), PACKAGE = 'AneuFinder'))
847
-
848
-	### Run the multivariate HMM
849
-	# Call the C function
850
-	hmm <- .C("C_multivariate_hmm",
851
-		densities = as.double(densities), # double* D
852
-		num.bins = as.integer(num.bins), # int* T
853
-		num.comb.states = as.integer(num.comb.states), # int* N
854
-		num.strands = as.integer(num.models), # int* Nmod
855
-		comb.states = as.integer(comb.states), # int* comb_states
856
-		num.iterations = as.integer(max.iter), # int* maxiter
857
-		time.sec = as.integer(max.time), # double* maxtime
858
-		loglik.delta = as.double(eps), # double* eps
859
-		states = integer(length=num.bins), # int* states
860
-		A = double(length=num.comb.states*num.comb.states), # double* A
861
-		proba = double(length=num.comb.states), # double* proba
862
-		loglik = double(length=1), # double* loglik
863
-		A.initial = as.vector(A.initial), # double* initial_A
864
-		proba.initial = as.vector(proba.initial), # double* initial_proba
865
-		use.initial.params = as.logical(use.initial), # bool* use_initial_params
866
-		num.threads = as.integer(num.threads), # int* num_threads
867
-		error = as.integer(0), # error handling
868
-		algorithm = as.integer(algorithm), # int* algorithm
869
-		verbosity = as.integer(verbosity), # int* verbosity
870
-		PACKAGE = 'AneuFinder'
871
-		)
872
-			
873
-	### Check convergence ###
874
-	war <- NULL
875
-	if (hmm$loglik.delta > eps) {
876
-		war <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
877
-	}
878
-	if (hmm$error == 1) {
879
-		warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
880
-	} else if (hmm$error == 2) {
881
-		warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
882
-	}
883
-
884
-	### Make return object ###
885
-	if (hmm$error == 0) {
886
-		result <- list()
887
-		class(result) <- class.bivariate.hmm
888
-		result$ID <- ID
889
-	## Bin coordinates and states
890
-		result$bins <- binned.data
891
-		result$bins$state <- comb.states[hmm$states]
892
-		# Get states as factors in data.frame
893
-		matrix.states <- matrix(unlist(strsplit(as.character(result$bins$state), split=' ')), byrow=TRUE, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(distributions)))
894
-		names <- c('mstate','pstate')
895
-		for (i1 in 1:num.models) {
896
-			mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=uni.states)
833
+  
834
+  	# Use only states with valid correlationMatrixInverse (all states are valid in this version)
835
+  	correlationMatrix = correlationMatrix[,,usestateTF]
836
+  	correlationMatrixInverse = correlationMatrixInverse[,,usestateTF]
837
+  	comb.states = comb.states[usestateTF]
838
+  	comb.states <- droplevels(comb.states)
839
+  	determinant = determinant[usestateTF]
840
+  	num.comb.states <- length(comb.states)
841
+  
842
+  	### Calculate multivariate densities for each state ###
843
+		if (verbosity >=1) {
844
+    	ptm <- startTimedMessage("Calculating multivariate densities...")
897 845
 		}
898
-	## Segmentation
899
-		ptm <- startTimedMessage("Making segmentation ...")
900
-		suppressMessages(
901
-			result$segments <- as(collapseBins(as.data.frame(result$bins), column2collapseBy='state', columns2drop='width', columns2average=c('counts','mcounts','pcounts')), 'GRanges')
902
-		)
903
-		seqlevels(result$segments) <- seqlevels(result$bins) # correct order from as()
904
-		seqlengths(result$segments) <- seqlengths(result$bins)[names(seqlengths(result$segments))]
905
-		stopTimedMessage(ptm)
906
-	## CNV state for both strands combined
907
-		getnumbers <- function(x) {
908
-    		x <- sub("zero-inflation", "0-somy", x)
909
-    		x <- as.numeric(sub("-somy", "", x))
846
+  	densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states))
847
+  	for (comb.state in comb.states) {
848
+  		istate <- which(comb.state==comb.states)
849
+  		state <- strsplit(comb.state, ' ')[[1]]
850
+  		z.temp <- matrix(NA, ncol=num.models, nrow=num.bins)
851
+  		product <- 1
852
+  		for (istrand in 1:num.models) {
853
+  			z.temp[,istrand] <- z.per.bin[, istrand, state[istrand]]
854
+  			if (distributions[[istrand]][state[istrand],'type'] == 'dnbinom') {
855
+  				size <- distributions[[istrand]][state[istrand],'size']
856
+  				prob <- distributions[[istrand]][state[istrand],'prob']
857
+  				product <- product * stats::dnbinom(counts[,istrand], size, prob)
858
+  			} else if (distributions[[istrand]][state[istrand],'type'] == 'dgeom') {
859
+  				prob <- distributions[[istrand]][state[istrand],'prob']
860
+  				product <- product * stats::dgeom(counts[,istrand], prob)
861
+  			} else if (distributions[[istrand]][state[istrand],'type'] == 'delta') {
862
+  				product <- product * ifelse(counts[,istrand]==0, 1, 0)
863
+  			}
864
+  		}
865
+  		exponent <- -0.5 * apply( ( z.temp %*% (correlationMatrixInverse[ , , istate] - diag(num.models)) ) * z.temp, 1, sum)
866
+  		exponent[is.nan(exponent)] <- 0
867
+  		densities[,istate] <- product * determinant[istate]^(-0.5) * exp( exponent )
868
+  	}
869
+  	# Check if densities are > 1
870
+  # 	if (any(densities>1)) stop("Densities > 1")
871
+  # 	if (any(densities<0)) stop("Densities < 0")
872
+  	densities[densities>1] <- 1
873
+  	densities[densities<0] <- 0
874
+  	# Check if densities are 0 everywhere in some bins
875
+  	check <- which(apply(densities, 1, sum) == 0)
876
+  	if (length(check)>0) {
877
+  		if (check[1]==1) {
878
+  			densities[1,] <- rep(1e-10, ncol(densities))
879
+  			check <- check[-1]
880
+  		}
881
+  		for (icheck in check) {
882
+  			densities[icheck,] <- densities[icheck-1,]
883
+  		}
884
+  	}
885
+		if (verbosity >=1) {
886
+    	stopTimedMessage(ptm)
910 887
 		}
911
-  	inistates <-  suppressWarnings( initializeStates(unique(c(states, paste0(sort(unique(getnumbers(result$bins$mstate) + getnumbers(result$bins$pstate))),"-somy")))) )
912
-  	multiplicity <- inistates$multiplicity
913
-  	state.labels <- inistates$states
914
-		# Bins
915
-		state <- multiplicity[as.character(result$bins$mstate)] + multiplicity[as.character(result$bins$pstate)]
916
-		# state[state>max(multiplicity)] <- max(multiplicity)
917
-		multiplicity.inverse <- names(multiplicity)
918
-		names(multiplicity.inverse) <- multiplicity
919
-		state <- multiplicity.inverse[as.character(state)]
920
-		state[(result$bins$mstate=='0-somy' | result$bins$pstate=='0-somy') & state=='zero-inflation'] <- '0-somy'
921
-    result$bins$state <- factor(state, levels=names(multiplicity))
922
-    result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
923
-    result$bins$mcopy.number <- multiplicity[as.character(result$bins$mstate)]
924
-    result$bins$pcopy.number <- multiplicity[as.character(result$bins$pstate)]
925
-		# Segments
926
-		str <- strsplit(as.character(result$segments$state),' ')
927
-		result$segments$mstate <- factor(unlist(lapply(str, '[[', 1)), levels=state.labels)
928
-		result$segments$pstate <- factor(unlist(lapply(str, '[[', 2)), levels=state.labels)
929
-		state <- multiplicity[result$segments$mstate] + multiplicity[result$segments$pstate]
930
-		state[state>max(multiplicity)] <- max(multiplicity)
931
-		multiplicity.inverse <- names(multiplicity)
932
-		names(multiplicity.inverse) <- multiplicity
933
-		state <- multiplicity.inverse[as.character(state)]
934
-		state[(result$segments$mstate=='0-somy' | result$segments$pstate=='0-somy') & state=='zero-inflation'] <- '0-somy'
935
-    result$segments$state <- factor(state, levels=names(multiplicity))
936
-    result$segments$copy.number <- multiplicity[as.character(result$segments$state)]
937
-    result$segments$mcopy.number <- multiplicity[as.character(result$segments$mstate)]
938
-    result$segments$pcopy.number <- multiplicity[as.character(result$segments$pstate)]
939
-		## Parameters
940
-			# Weights
941
-			tstates <- table(result$bins$state)
942
-			result$weights <- tstates/sum(tstates)
943
-			# Transition matrices
944
-			result$transitionProbs <- matrix(hmm$A, ncol=num.comb.states)
945
-			colnames(result$transitionProbs) <- comb.states
946
-			rownames(result$transitionProbs) <- comb.states
947
-			result$transitionProbs.initial <- matrix(hmm$A.initial, ncol=num.comb.states)
948
-			colnames(result$transitionProbs.initial) <- comb.states
949
-			rownames(result$transitionProbs.initial) <- comb.states
950
-			# Initial probs
951
-			result$startProbs <- hmm$proba
952
-			names(result$startProbs) <- comb.states
953
-			result$startProbs.initial <- hmm$proba.initial
954
-			names(result$startProbs.initial) <- comb.states
955
-			# Distributions
956
-			result$distributions <- distributions
957
-		## Convergence info
958
-			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec)
959
-			result$convergenceInfo <- convergenceInfo
960
-		## Quality info
961
-  		result$qualityInfo <- as.list(getQC(result))
962
-		## Univariate infos
963
-			univariateParams <- list(transitionProbs=uni.transitionProbs, startProbs=uni.startProbs, distributions=distributions[[1]], weights=uni.weights)
964
-			result$univariateParams <- univariateParams
965
-	} else if (hmm$error == 1) {
966
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
967
-	} else if (hmm$error == 2) {
968
-		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
969
-	}
970
-
971
-	## Issue warnings
972
-	result$warnings <- warlist
888
+  		
889
+  	### Define cleanup behaviour ###
890
+  	on.exit(.C("C_multivariate_cleanup", as.integer(num.comb.states), PACKAGE = 'AneuFinder'))
891
+  
892
+  	### Run the multivariate HMM
893
+  	# Call the C function
894
+  	hmm <- .C("C_multivariate_hmm",
895
+  		densities = as.double(densities), # double* D
896
+  		num.bins = as.integer(num.bins), # int* T
897
+  		num.comb.states = as.integer(num.comb.states), # int* N
898
+  		num.strands = as.integer(num.models), # int* Nmod
899
+  		comb.states = as.integer(comb.states), # int* comb_states
900
+  		num.iterations = as.integer(max.iter), # int* maxiter
901
+  		time.sec = as.integer(max.time), # double* maxtime
902
+  		loglik.delta = as.double(eps), # double* eps
903
+			maxPosterior = double(length=num.bins), # double* maxPosterior
904
+  		states = integer(length=num.bins), # int* states
905
+  		A = double(length=num.comb.states*num.comb.states), # double* A
906
+  		proba = double(length=num.comb.states), # double* proba
907
+  		loglik = double(length=1), # double* loglik
908
+  		A.initial = as.vector(A.initial), # double* initial_A
909
+  		proba.initial = as.vector(proba.initial), # double* initial_proba
910
+  		use.initial.params = as.logical(use.initial), # bool* use_initial_params
911
+  		num.threads = as.integer(num.threads), # int* num_threads
912
+  		error = as.integer(0), # error handling
913
+  		algorithm = as.integer(algorithm), # int* algorithm
914
+  		verbosity = as.integer(verbosity), # int* verbosity
915
+  		PACKAGE = 'AneuFinder'
916
+  		)
917
+  			
918
+  	### Check convergence ###
919
+  	war <- NULL
920
+  	if (hmm$loglik.delta > eps) {
921
+  		war <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
922
+  	}
923
+  	if (hmm$error == 1) {
924
+  		warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
925
+  	} else if (hmm$error == 2) {
926
+  		warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
927
+  	}
973 928
 
929
+  	if (istep == 1) {
930
+    	### Make return object ###
931
+    	if (hmm$error == 0) {
932
+    		result <- list()
933
+    		class(result) <- class.bivariate.hmm
934
+    		result$ID <- ID
935
+      	## Bin coordinates and states
936
+    		result$bins <- binned.data
937
+    		result$bins$state <- comb.states[hmm$states]
938
+    		# Get states as factors in data.frame
939
+    		matrix.states <- matrix(unlist(strsplit(as.character(result$bins$state), split=' ')), byrow=TRUE, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(distributions)))
940
+    		names <- c('mstate','pstate')
941
+    		for (i1 in 1:num.models) {
942
+    			mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=uni.states)
943
+    		}
944
+      	## Segmentation
945
+    		ptm <- startTimedMessage("Making segmentation ...")
946
+    		suppressMessages(
947
+    			result$segments <- as(collapseBins(as.data.frame(result$bins), column2collapseBy='state', columns2drop='width', columns2average=c('counts','mcounts','pcounts')), 'GRanges')
948
+    		)
949
+    		seqlevels(result$segments) <- seqlevels(result$bins) # correct order from as()
950
+    		seqlengths(result$segments) <- seqlengths(result$bins)[names(seqlengths(result$segments))]
951
+    		stopTimedMessage(ptm)
952
+      	## CNV state for both strands combined
953
+    		getnumbers <- function(x) {
954
+        		x <- sub("zero-inflation", "0-somy", x)
955
+        		x <- as.numeric(sub("-somy", "", x))
956
+    		}
957
+      	inistates <-  suppressWarnings( initializeStates(unique(c(states, paste0(sort(unique(getnumbers(result$bins$mstate) + getnumbers(result$bins$pstate))),"-somy")))) )
958
+      	multiplicity <- inistates$multiplicity
959
+      	state.labels <- inistates$states
960
+    		# Bins
961
+    		state <- multiplicity[as.character(result$bins$mstate)] + multiplicity[as.character(result$bins$pstate)]
962
+    		# state[state>max(multiplicity)] <- max(multiplicity)
963
+    		multiplicity.inverse <- names(multiplicity)
964
+    		names(multiplicity.inverse) <- multiplicity
965
+    		state <- multiplicity.inverse[as.character(state)]
966
+    		state[(result$bins$mstate=='0-somy' | result$bins$pstate=='0-somy') & state=='zero-inflation'] <- '0-somy'
967
+        result$bins$state <- factor(state, levels=names(multiplicity))
968
+        result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
969
+        result$bins$mcopy.number <- multiplicity[as.character(result$bins$mstate)]
970
+        result$bins$pcopy.number <- multiplicity[as.character(result$bins$pstate)]
971
+    		# Segments
972
+    		str <- strsplit(as.character(result$segments$state),' ')
973
+    		result$segments$mstate <- factor(unlist(lapply(str, '[[', 1)), levels=state.labels)
974
+    		result$segments$pstate <- factor(unlist(lapply(str, '[[', 2)), levels=state.labels)
975
+    		state <- multiplicity[result$segments$mstate] + multiplicity[result$segments$pstate]
976
+    		state[state>max(multiplicity)] <- max(multiplicity)
977
+    		multiplicity.inverse <- names(multiplicity)
978
+    		names(multiplicity.inverse) <- multiplicity
979
+    		state <- multiplicity.inverse[as.character(state)]
980
+    		state[(result$segments$mstate=='0-somy' | result$segments$pstate=='0-somy') & state=='zero-inflation'] <- '0-somy'
981
+        result$segments$state <- factor(state, levels=names(multiplicity))
982
+        result$segments$copy.number <- multiplicity[as.character(result$segments$state)]
983
+        result$segments$mcopy.number <- multiplicity[as.character(result$segments$mstate)]
984
+        result$segments$pcopy.number <- multiplicity[as.character(result$segments$pstate)]
985
+    		## Parameters
986
+    			# Weights
987
+    			tstates <- table(result$bins$state)
988
+    			result$weights <- tstates/sum(tstates)
989
+    			# Transition matrices
990
+    			result$transitionProbs <- matrix(hmm$A, ncol=num.comb.states)
991
+    			colnames(result$transitionProbs) <- comb.states
992
+    			rownames(result$transitionProbs) <- comb.states
993
+    			result$transitionProbs.initial <- matrix(hmm$A.initial, ncol=num.comb.states)
994
+    			colnames(result$transitionProbs.initial) <- comb.states
995
+    			rownames(result$transitionProbs.initial) <- comb.states
996
+    			# Initial probs
997
+    			result$startProbs <- hmm$proba
998
+    			names(result$startProbs) <- comb.states
999
+    			result$startProbs.initial <- hmm$proba.initial
1000
+    			names(result$startProbs.initial) <- comb.states
1001
+    			# Distributions
1002
+    			result$distributions <- distributions
1003
+    		## Convergence info
1004
+    			convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec)
1005
+    			result$convergenceInfo <- convergenceInfo
1006
+    		## Quality info
1007
+      		result$qualityInfo <- as.list(getQC(result))
1008
+    		## Univariate infos
1009
+    			univariateParams <- list(transitionProbs=uni.transitionProbs, startProbs=uni.startProbs, distributions=distributions[[1]], weights=uni.weights)
1010
+    			result$univariateParams <- univariateParams
1011
+    	} else if (hmm$error == 1) {
1012
+    		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
1013
+    	} else if (hmm$error == 2) {
1014
+    		warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
1015
+    	}
1016
+  	  if (hmm$error != 0) {
1017
+      	## Issue warnings
1018
+      	result$warnings <- warlist
1019
+      	return(result)
1020
+  	  }
1021
+  	} # if (istep==1)
1022
+    	
1023
+    if (istep == 1) { ptm <- startTimedMessage("Collecting counts and posteriors ...") }
1024
+    
1025
+    ## Inflate posteriors, states, counts to new offset
1026
+    ind <- findOverlaps(stepbins, binned.data)
1027
+    astates.step[ind@from, 'currentOffset'] <- hmm$states[ind@to]
1028
+    amaxPosterior.step[ind@from, 'currentOffset'] <- hmm$maxPosterior[ind@to]
1029
+    
1030
+    ## Find offset that maximizes the posteriors for each bin
1031
+    ##-- Start stuff to call C code
1032
+    # Work with changing dimensions to avoid copies being made
1033
+    dim_amaxPosterior.step <- dim(amaxPosterior.step)
1034
+    dimnames_amaxPosterior.step <- dimnames(amaxPosterior.step)
1035
+    dim(amaxPosterior.step) <- NULL
1036
+    z <- .C("C_array2D_which_max",
1037
+            array2D = amaxPosterior.step,
1038
+            dim = as.integer(dim_amaxPosterior.step),
1039
+            ind_max = integer(dim_amaxPosterior.step[1]),
1040
+            value_max = double(dim_amaxPosterior.step[1]))
1041
+    dim(amaxPosterior.step) <- dim_amaxPosterior.step
1042
+    dimnames(amaxPosterior.step) <- dimnames_amaxPosterior.step
1043
+    ind <- z$ind_max
1044
+    ##-- End stuff to call C code
1045
+    for (i1 in 1:2) {
1046
+      mask <- ind == i1
1047
+      astates.step[mask, 'previousOffsets'] <- astates.step[mask,i1, drop=FALSE]
1048
+      amaxPosterior.step[mask, 'previousOffsets'] <- amaxPosterior.step[mask,i1, drop=FALSE]
1049
+    }
1050
+    if (istep == 1) { stopTimedMessage(ptm) }
1051
+    
1052
+    if (istep > 1) {
1053
+      stopTimedMessage(ptm.offset)
1054
+    }
1055
+    
1056
+    rm(hmm, ind)
1057
+  } # loop over offsets
1058
+  
974 1059
 	## Return results
975 1060
 	return(result)
976 1061
 }
... ...
@@ -1183,7 +1268,11 @@ DNAcopy.findCNVs <- function(binned.data, ID=NULL, CNgrid.start=1.5, count.cutof
1183 1268
 biDNAcopy.findCNVs <- function(binned.data, ID=NULL, CNgrid.start=0.5, count.cutoff.quantile=0.999) {
1184 1269
 
1185 1270
   	## Intercept user input
1186
-    binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
1271
+    binned.data <- loadFromFiles(binned.data, check.class=c('GRanges','GRangesList'))[[1]]
1272
+    if (class(binned.data) == 'GRangesList') {
1273
+      binned.data.list <- binned.data
1274
+      binned.data <- binned.data.list[[1]]
1275
+    }
1187 1276
   	if (is.null(ID)) {
1188 1277
     		ID <- attr(binned.data, 'ID')
1189 1278
   	}
... ...
@@ -28,10 +28,7 @@
28 28
 findCNVs.strandseq <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=5, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c('zero-inflation',paste0(0:10,'-somy')), most.frequent.state="1-somy", method='HMM', algorithm="EM", initial.params=NULL) {
29 29
 
30 30
 	## Intercept user input
31
-	if (class(binned.data) != 'GRanges') {
32
-		binned.data <- get(load(binned.data))
33
-		if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'counts' or a file that contains such an object")
34
-	}
31
+  binned.data <- loadFromFiles(binned.data, check.class=c('GRanges','GRangesList'))[[1]]
35 32
 	if (is.null(ID)) {
36 33
 		ID <- attr(binned.data, 'ID')
37 34
 	}
... ...
@@ -91,6 +91,31 @@ plot.GRanges <- function(x, type='profile', ...) {
91 91
 	}
92 92
 }
93 93
 
94
+#' Plotting function for binned read counts (list)
95
+#'
96
+#' Make plots for binned read counts (list) from \code{\link{binned.data}}.
97
+#'
98
+#' @param x A \code{\link{GRangesList}} object with binned read counts.
99
+#' @param type Type of the plot, one of \code{c('profile', 'histogram', 'karyogram')}. You can also specify the type with an integer number.
100
+#' \describe{
101
+#'   \item{\code{karyogram}}{A karyogram-like chromosome overview with read counts.}
102
+#'   \item{\code{histogram}}{A histogram of read counts.}
103
+#'   \item{\code{profile}}{An profile with read counts.}
104
+#' }
105
+#' @param ... Additional arguments for the different plot types.
106
+#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object.
107
+#' @method plot GRanges
108
+#' @export
109
+plot.GRangesList <- function(x, type='profile', ...) {
110
+	if (type == 'karyogram' | type==3) {
111
+		plotKaryogram(x, ...)
112
+	} else if (type == 'histogram' | type==2) {
113
+		plotHistogram(x, ...)
114
+	} else if (type == 'profile' | type==1) {
115
+		plotProfile(x, ...)
116
+	}
117
+}
118
+
94 119
 #' Plotting function for \code{\link{aneuHMM}} objects
95 120
 #'
96 121
 #' Make different types of plots for \code{\link{aneuHMM}} objects.
... ...
@@ -215,7 +240,7 @@ plotBivariateHistograms <- function(bihmm) {
215 240
 	uni.hmm$distributions <- bihmm$distributions[[strand]]
216 241
 	uni.hmm$qualityInfo <- bihmm$qualityInfo
217 242
 	class(uni.hmm) <- class.univariate.hmm
218
-	ggplts <- plotHistogram(uni.hmm, strand='*')
243
+	ggplts <- plotHistogram(uni.hmm)
219 244
 
220 245
 	return(ggplts)
221 246
 
... ...
@@ -233,19 +258,22 @@ plotBivariateHistograms <- function(bihmm) {
233 258
 #' @importFrom stats dgeom dnbinom dpois reshape
234 259
 plotHistogram <- function(model) {
235 260
 
236
-    model <- suppressMessages( loadFromFiles(model, check.class=c('GRanges', class.univariate.hmm))[[1]] )
261
+    model <- suppressMessages( loadFromFiles(model, check.class=c('GRanges', 'GRangesList', class.univariate.hmm))[[1]] )
237 262
     if (class(model) == 'GRanges') {
238 263
         bins <- model
239
-        countbins <- model
264
+        bincounts <- model
265
+    } else if (class(model) == 'GRangesList') {
266
+        bins <- model[[1]]
267
+        bincounts <- model[[1]]
240 268
     } else if (class(model) == class.univariate.hmm) {
241 269
         bins <- model$bins
242 270
         if (!is.null(model$bins$counts)) {
243
-            countbins <- model$bins
244
-        } else if (!is.null(model$binned.data[[1]]$counts)) {
245
-            countbins <- model$binned.data[[1]]
271
+            bincounts <- model$bins
272
+        } else if (!is.null(model$bincounts[[1]]$counts)) {
273
+            bincounts <- model$bincounts[[1]]
246 274
         }
247 275
     }
248
-    counts <- countbins$counts
276
+    counts <- bincounts$counts
249 277
 
250 278
   	states <- bins$state
251 279
 		if (!is.null(model$weights)) {
... ...
@@ -361,8 +389,8 @@ plot.karyogram <- function(model, both.strands=FALSE, plot.SCE=FALSE, file=NULL)
361 389
 	## Convert to GRanges
362 390
   if (!is.null(model$bins$counts)) {
363 391
     bins <- model$bins
364
-  } else if (!is.null(model$binned.data[[1]]$counts)) {
365
-    bins <- model$binned.data[[1]]
392
+  } else if (!is.null(model$bincounts[[1]]$counts)) {
393
+    bins <- model$bincounts[[1]]
366 394
     bins$state <- model$bins$state[findOverlaps(bins, model$bins, select='first')]
367 395
   }
368 396
 	bins.split <- split(bins, seqnames(bins))
... ...
@@ -856,6 +884,14 @@ plotProfile <- function(model, both.strands=FALSE, plot.SCE=TRUE, file=NULL, nor
856 884
 		model$bins <- binned.data
857 885
 		model$qualityInfo <- list(entropy=qc.entropy(binned.data$counts), spikiness=qc.spikiness(binned.data$counts), complexity=attr(binned.data, 'complexity'))
858 886
 		plot.profile(model, both.strands=both.strands, plot.SCE=FALSE, file=file)
887
+	} else if (class(model)=='GRangesList') {
888
+		binned.data <- model[[1]]
889
+		model <- list()
890
+		class(model) <- class.univariate.hmm
891
+		model$ID <- ''
892
+		model$bins <- binned.data
893
+		model$qualityInfo <- list(entropy=qc.entropy(binned.data$counts), spikiness=qc.spikiness(binned.data$counts), complexity=attr(binned.data, 'complexity'))
894
+		plot.profile(model, both.strands=both.strands, plot.SCE=FALSE, file=file)
859 895
 	} else if (class(model)==class.univariate.hmm) {
860 896
 		plot.profile(model, both.strands=FALSE, plot.SCE=FALSE, file=file, normalize.counts = normalize.counts)
861 897
 	} else if (class(model)==class.bivariate.hmm) {
... ...
@@ -872,8 +908,8 @@ plot.profile <- function(model, both.strands=FALSE, plot.SCE=TRUE, file=NULL, no
872 908
 	## Convert to GRanges
873 909
   if (!is.null(model$bins$counts)) {
874 910
     bins <- model$bins
875
-  } else if (!is.null(model$binned.data[[1]]$counts)) {
876
-  	bins <- model$binned.data[[1]]
911
+  } else if (!is.null(model$bincounts[[1]]$counts)) {
912
+  	bins <- model$bincounts[[1]]
877 913
   }
878 914
 	## Get breakpoint coordinates
879 915
 	if (is.null(model$breakpoints) & plot.SCE) {
... ...
@@ -56,8 +56,8 @@ qc.bhattacharyya <- function(hmm) {
56 56
   }
57 57
   if (!is.null(hmm$bins$counts)) {
58 58
     counts <- hmm$bins$counts
59
-  } else if (!is.null(hmm$binned.data[[1]]$counts)) {
60
-    counts <- hmm$binned.data[[1]]$counts
59
+  } else if (!is.null(hmm$bincounts[[1]]$counts)) {
60
+    counts <- hmm$bincounts[[1]]$counts
61 61
   }
62 62
 	x <- 0:max(max(counts),500)
63 63
 	if (!is.na(distr['1-somy','size']) & !is.na(distr['2-somy','size'])) {
... ...
@@ -156,8 +156,8 @@ getQC <- function(models) {
156 156
     		    bins <- model$bins
157 157
             if (!is.null(model$bins$counts)) {
158 158
               counts <- model$bins$counts
159
-            } else if (!is.null(model$binned.data[[1]]$counts)) {
160
-              counts <- model$binned.data[[1]]$counts
159
+            } else if (!is.null(model$bincounts[[1]]$counts)) {
160
+              counts <- model$bincounts[[1]]$counts
161 161
             }
162 162
         		qframe[[i1]] <- data.frame( total.read.count=sum(counts),
163 163
                                 				avg.binsize=mean(width(bins)),
164 164
new file mode 100644
... ...
@@ -0,0 +1,27 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/plotting.R
3
+\name{plot.GRangesList}
4
+\alias{plot.GRangesList}
5
+\title{Plotting function for binned read counts (list)}
6
+\usage{
7
+\method{plot}{GRanges}(x, type = "profile", ...)
8
+}
9
+\arguments{
10
+\item{x}{A \code{\link{GRangesList}} object with binned read counts.}
11
+
12
+\item{type}{Type of the plot, one of \code{c('profile', 'histogram', 'karyogram')}. You can also specify the type with an integer number.
13
+\describe{
14
+  \item{\code{karyogram}}{A karyogram-like chromosome overview with read counts.}
15
+  \item{\code{histogram}}{A histogram of read counts.}
16
+  \item{\code{profile}}{An profile with read counts.}
17
+}}
18
+
19
+\item{...}{Additional arguments for the different plot types.}
20
+}
21
+\value{
22
+A \code{\link[ggplot2:ggplot]{ggplot}} object.
23
+}
24
+\description{
25
+Make plots for binned read counts (list) from \code{\link{binned.data}}.
26
+}
27
+
... ...
@@ -214,7 +214,7 @@ void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, dou
214 214
 // =====================================================================================================================================================
215 215
 // This function takes parameters from R, creates a multivariate HMM object, runs the EM and returns the result to R.
216 216
 // =====================================================================================================================================================
217
-void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm, int* verbosity)
217
+void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm, int* verbosity)
218 218
 {
219 219
 
220 220
 	// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}
... ...
@@ -330,6 +330,7 @@ void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, in
330 330
 		}
331 331
 		ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
332 332
 		states[t] = comb_states[ind_max];
333
+		maxPosterior[t] = posterior_per_t[ind_max];
333 334
 	}
334 335
 	
335 336
 	//FILE_LOG(logDEBUG1) << "Return parameters";
... ...
@@ -16,7 +16,7 @@ extern "C"
16 16
 void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm, int* verbosity);
17 17
 
18 18
 extern "C"
19
-void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm, int* verbosity);
19
+void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm, int* verbosity);
20 20
 
21 21
 extern "C"
22 22
 void univariate_cleanup();
... ...
@@ -4,13 +4,13 @@
4 4
 
5 5
 
6 6
 R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP};
7
-R_NativePrimitiveArgType arg2[] = {REALSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
7
+R_NativePrimitiveArgType arg2[] = {REALSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
8 8
 R_NativePrimitiveArgType arg4[] = {INTSXP};
9 9
 R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP, REALSXP};
10 10
 
11 11
 static const R_CMethodDef CEntries[]  = {
12 12
     {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 26, arg1},
13
-    {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 19, arg2},
13
+    {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 20, arg2},
14 14
     {"C_univariate_cleanup", (DL_FUNC) &univariate_cleanup, 0, NULL},
15 15
     {"C_multivariate_cleanup", (DL_FUNC) &multivariate_cleanup, 1, arg4},
16 16
     {"C_array2D_which_max", (DL_FUNC) &array2D_which_max, 4, arg5},