Browse code

added pre-fitting capabilities to univariate

chakalakka authored on 10/02/2016 18:32:59
Showing 2 changed files

... ...
@@ -2,10 +2,12 @@
2 2
 #'
3 3
 #' Fit a HMM to a ChIP-seq sample to determine the modification state of genomic regions, e.g. call peaks in the sample.
4 4
 #'
5
-#' The Hidden Markov Model which is used to classify the bins uses 3 states: state 'zero-inflation' with a delta function as emission densitiy (only zero read counts), 'unmodified' and 'modified' with Negative Binomials as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper TODO:insert paper for a detailed description of the method.
5
+#' This function is similar to \code{\link{callPeaksUnivariate}} but allows to pre-fit on a single chromosome instead of the whole genome. This gives a significant performance increase and can help to converge into a better fit in case of unsteady quality for some chromosomes.
6 6
 #'
7 7
 #' @param binned.data A \link{GRanges} object with binned read counts.
8 8
 #' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
9
+#' @param prefit.on.chr A chromosome that is used to pre-fit the Hidden Markov Model.
10
+#' @param If \code{TRUE}, the second fitting step is only done with one iteration.
9 11
 #' @param eps Convergence threshold for the Baum-Welch algorithm.
10 12
 #' @param init One of the following initialization procedures:
11 13
 #' \describe{
... ...
@@ -23,14 +25,73 @@
23 25
 #' @param read.cutoff.absolute Read counts above this value will be set to the read count specified by this value. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.quantile} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering.
24 26
 #' @param max.mean If \code{mean(reads)>max.mean}, bins with low read counts will be set to 0. This is a workaround to obtain good fits in the case of large bin sizes.
25 27
 #' @param FDR False discovery rate. code{NULL} means that the state with maximum posterior probability will be chosen, irrespective of its absolute probability (default=code{NULL}).
28
+#' @param control If set to \code{TRUE}, the binned data will be treated as control experiment. That means only state 'zero-inflation' and 'unmodified' will be used in the HMM.
26 29
 #' @param keep.posteriors If set to \code{TRUE} (default=\code{FALSE}), posteriors will be available in the output. This is useful to change the FDR later, but increases the necessary disk space to store the result.
30
+#' @param keep.densities If set to \code{TRUE} (default=\code{FALSE}), densities will be available in the output. This should only be needed debugging.
31
+#' @param verbosity Verbosity level for the fitting procedure. 0 - No output, 1 - Iterations are printed.
32
+#' @author Aaron Taudt, Maria Coome Tatche
33
+#' @seealso \code{\link{chromstaR_univariateHMM}}, \code{\link{callPeaksMultivariate}}
34
+#' @examples
35
+#'## Get an example BED-file with ChIP-seq reads for H3K36me3 in brain tissue
36
+#'bedfile <- system.file("extdata/brain/BI.Brain_Angular_Gyrus.H3K36me3.112.chr22.bed.gz",
37
+#'                       package="chromstaR")
38
+#'## Bin the BED file into bin size 1000bp
39
+#'binned.data <- bed2binned(bedfile, assembly='hg19', binsize=1000, save.as.RData=FALSE)
40
+#'## Fit the univariate Hidden Markov Model
41
+#'hmm <- callPeaksUnivariate(binned.data, ID='example_H3K36me3', max.time=60)
42
+#'## Check if the fit is ok
43
+#'plot(hmm, type='histogram')
44
+#' @export
45
+callPeaksUnivariate2 <- function(binned.data, ID, prefit.on.chr=NULL, short=TRUE, eps=0.01, init="standard", max.time=NULL, max.iter=NULL, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff=TRUE, read.cutoff.quantile=1, read.cutoff.absolute=500, max.mean=Inf, FDR=0.5, control=FALSE, keep.posteriors=FALSE, keep.densities=FALSE, verbosity=1) {
46
+
47
+	pre.binned.data <- binned.data[seqnames(binned.data)==prefit.on.chr]
48
+	pre.model <- callPeaksUnivariate(pre.binned.data, ID=ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff=read.cutoff, read.cutoff.quantile=read.cutoff.quantile, read.cutoff.absolute=read.cutoff.absolute, max.mean=max.mean, FDR=FDR, control=control, keep.posteriors=FALSE, keep.densities=FALSE, verbosity=verbosity)
49
+
50
+	if (short) {
51
+		max.iter=1
52
+	}
53
+	model <- pre.model
54
+	model$bins <- binned.data
55
+	model <- callPeaksUnivariate(model, ID=ID, eps=eps, max.time=max.time, max.iter=max.iter, num.threads=num.threads, read.cutoff=read.cutoff, read.cutoff.quantile=read.cutoff.quantile, read.cutoff.absolute=read.cutoff.absolute, max.mean=max.mean, FDR=FDR, control=control, keep.posteriors=keep.posteriors, keep.densities=keep.densities, verbosity=verbosity)
56
+
57
+	return(model)
58
+
59
+}
60
+
61
+
62
+#' Fit a Hidden Markov Model to a ChIP-seq sample.
63
+#'
64
+#' Fit a HMM to a ChIP-seq sample to determine the modification state of genomic regions, e.g. call peaks in the sample.
65
+#'
66
+#' The Hidden Markov Model which is used to classify the bins uses 3 states: state 'zero-inflation' with a delta function as emission densitiy (only zero read counts), 'unmodified' and 'modified' with Negative Binomials as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper TODO:insert paper for a detailed description of the method.
67
+#'
68
+#' @param binned.data A \link{GRanges} object with binned read counts.
69
+#' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
70
+#' @param eps Convergence threshold for the Baum-Welch algorithm.
71
+#' @param init One of the following initialization procedures:
72
+#' \describe{
73
+#' \item{\code{standard}}{The negative binomial of state 'unmodified' will be initialized with \code{mean=mean(reads)}, \code{var=var(reads)} and the negative binomial of state 'modified' with \code{mean=mean(reads)+1}, \code{var=var(reads)}. This procedure usually gives the fastest convergence.}
74
+#' \item{\code{random}}{Mean and variance of the negative binomials will be initialized with random values (in certain boundaries, see source code). Try this if the \code{'standard'} procedure fails to produce a good fit.}
75
+#' \item{\code{empiric}}{Yet another way to initialize the Baum-Welch. Try this if the other two methods fail to produce a good fit.}
76
+#' }
77
+#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. The default \code{NULL} is no limit.
78
+#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. The default \code{NULL} is no limit.
79
+#' @param num.trials The number of trials to run the HMM. Each time, the HMM is seeded with different random initial values. The HMM with the best likelihood is given as output.
80
+#' @param eps.try If code num.trials is set to greater than 1, \code{eps.try} is used for the trial runs. If unset, \code{eps} is used.
81
+#' @param num.threads Number of threads to use. Setting this to >1 may give increased performance.
82
+#' @param read.cutoff The default (\code{TRUE}) enables filtering of high read counts. Set \code{read.cutoff=FALSE} to disable this filtering.
83
+#' @param read.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.absolute} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering.
84
+#' @param read.cutoff.absolute Read counts above this value will be set to the read count specified by this value. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. If option \code{read.cutoff.quantile} is also specified, the minimum of the resulting cutoff values will be used. Set \code{read.cutoff=FALSE} to disable this filtering.
85
+#' @param max.mean If \code{mean(reads)>max.mean}, bins with low read counts will be set to 0. This is a workaround to obtain good fits in the case of large bin sizes.
86
+#' @param FDR False discovery rate. code{NULL} means that the state with maximum posterior probability will be chosen, irrespective of its absolute probability (default=code{NULL}).
27 87
 #' @param control If set to \code{TRUE}, the binned data will be treated as control experiment. That means only state 'zero-inflation' and 'unmodified' will be used in the HMM.
88
+#' @param keep.posteriors If set to \code{TRUE} (default=\code{FALSE}), posteriors will be available in the output. This is useful to change the FDR later, but increases the necessary disk space to store the result.
89
+#' @param keep.densities If set to \code{TRUE} (default=\code{FALSE}), densities will be available in the output. This should only be needed debugging.
28 90
 #' @param checkpoint.after.iter Write a checkpoint file every n iterations. The default \code{NULL} means no checkpointing for iterations.
29 91
 #' @param checkpoint.after.time Write a checkpoint file every t seconds. The default \code{NULL} means no checkpointing for time.
30 92
 #' @param checkpoint.file The name of the checkpoint file that will be written.
31 93
 #' @param checkpoint.overwrite If set to \code{TRUE}, only one checkpoint file will be written. If set to \code{FALSE}, a new checkpoint file will be written at each checkpoint with the total number of iterations appended.
32 94
 #' @param checkpoint.use.existing If set to \code{TRUE}, the Baum-Welch fitting procedure will be continued from the HMM in the \code{checkpoint.file}.
33
-#' @param keep.densities If set to \code{TRUE} (default=\code{FALSE}), densities will be available in the output. This should only be needed debugging.
34 95
 #' @param verbosity Verbosity level for the fitting procedure. 0 - No output, 1 - Iterations are printed.
35 96
 #' @author Aaron Taudt, Maria Coome Tatche
36 97
 #' @seealso \code{\link{chromstaR_univariateHMM}}, \code{\link{callPeaksMultivariate}}
... ...
@@ -45,7 +106,7 @@
45 106
 #'## Check if the fit is ok
46 107
 #'plot(hmm, type='histogram')
47 108
 #' @export
48
-callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max.time=NULL, max.iter=NULL, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff=TRUE, read.cutoff.quantile=1, read.cutoff.absolute=500, max.mean=Inf, FDR=0.5, keep.posteriors=FALSE, control=FALSE, checkpoint.after.iter=NULL, checkpoint.after.time=NULL, checkpoint.file=paste0('chromstaR_checkpoint_',ID,'.cpt'), checkpoint.overwrite=TRUE, checkpoint.use.existing=FALSE, keep.densities=FALSE, verbosity=1) {
109
+callPeaksUnivariate <- function(binned.data, ID, eps=0.01, init="standard", max.time=NULL, max.iter=NULL, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff=TRUE, read.cutoff.quantile=1, read.cutoff.absolute=500, max.mean=Inf, FDR=0.5, control=FALSE, keep.posteriors=FALSE, keep.densities=FALSE, checkpoint.after.iter=NULL, checkpoint.after.time=NULL, checkpoint.file=paste0('chromstaR_checkpoint_',ID,'.cpt'), checkpoint.overwrite=TRUE, checkpoint.use.existing=FALSE, verbosity=1) {
49 110
 
50 111
 	### Define cleanup behaviour ###
51 112
 	on.exit(.C("R_univariate_cleanup"))
... ...
@@ -70,17 +70,24 @@ void R_univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* m
70 70
 	hmm->initialize_proba(initial_proba, *use_initial_params);
71 71
     
72 72
 	// Calculate mean and variance of data
73
-	double mean = 0, variance = 0;
73
+	double Tadjust = 0, mean = 0, variance = 0;
74 74
 	for(int t=0; t<*T; t++)
75 75
 	{
76
-		mean+= O[t];
76
+		if (O[t]>0)
77
+		{
78
+			mean += O[t];
79
+			Tadjust += 1;
80
+		}
77 81
 	}
78
-	mean = mean / *T;
82
+	mean = mean / Tadjust;
79 83
 	for(int t=0; t<*T; t++)
80 84
 	{
81
-		variance+= pow(O[t] - mean, 2);
85
+		if (O[t]>0)
86
+		{
87
+			variance += pow(O[t] - mean, 2);
88
+		}
82 89
 	}
83
-	variance = variance / *T;
90
+	variance = variance / Tadjust;
84 91
 	//FILE_LOG(logINFO) << "data mean = " << mean << ", data variance = " << variance;		
85 92
 	if (*verbosity>=1) Rprintf("HMM: data mean = %g, data variance = %g\n", mean, variance);		
86 93