... | ... |
@@ -1,6 +1,7 @@ |
1 | 1 |
# Generated by roxygen2 (4.1.0): do not edit by hand |
2 | 2 |
|
3 | 3 |
S3method(plot,GRanges) |
4 |
+S3method(plot,aneuBiHMM) |
|
4 | 5 |
S3method(plot,aneuHMM) |
5 | 6 |
S3method(plot,character) |
6 | 7 |
export(bam2binned) |
... | ... |
@@ -8,7 +9,9 @@ export(bed2binned) |
8 | 9 |
export(bedGraph2binned) |
9 | 10 |
export(exportCNVs) |
10 | 11 |
export(exportReadCounts) |
12 |
+export(filterSegments) |
|
11 | 13 |
export(findCNVs) |
14 |
+export(findSCEs) |
|
12 | 15 |
export(getChromLengths) |
13 | 16 |
export(heatmapAneuploidies) |
14 | 17 |
export(heatmapGenomeWide) |
... | ... |
@@ -137,18 +137,8 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l |
137 | 137 |
classes[c(1:3,6)] <- classes.in.bed[c(1:3,6)] |
138 | 138 |
data <- read.table(file, colClasses=classes) |
139 | 139 |
# Convert to GRanges object |
140 |
- data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]+1), strand=Rle(strand("*"), nrow(data))) # +1 to match coordinate systems |
|
141 |
- tC <- tryCatch({ |
|
142 |
- seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
|
143 |
- }, warning = function(war) { |
|
144 |
- suppressWarnings(seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))])) |
|
145 |
- offending.chroms <- unique(names(which(end(data) > seqlengths(data)[as.character(seqnames(data))]))) |
|
146 |
- if (war$message=="'ranges' contains values outside of sequence bounds. See ?trim to subset ranges.") { |
|
147 |
- warning(paste0("File \"",file,"\" contains reads outside of sequence bounds. The offending chromosomes were \"",paste(offending.chroms, collapse=', '),"\". Please consider using a different reference assembly.")) |
|
148 |
- } else { |
|
149 |
- print(war) |
|
150 |
- } |
|
151 |
- }) |
|
140 |
+ data <- GenomicRanges::GRanges(seqnames=Rle(data[,1]), ranges=IRanges(start=data[,2]+1, end=data[,3]), strand=Rle(strand("*"), nrow(data))) # start+1 to go from [0,x) -> [1,x] |
|
141 |
+ seqlengths(data) <- as.integer(chrom.lengths[names(seqlengths(data))]) |
|
152 | 142 |
chroms.in.data <- seqlevels(data) |
153 | 143 |
## BAM (1-based) |
154 | 144 |
} else if (format == "bam") { |
... | ... |
@@ -159,8 +149,11 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l |
159 | 149 |
chromosomes <- chroms.in.data |
160 | 150 |
} |
161 | 151 |
chroms2use <- intersect(chromosomes, chroms.in.data) |
162 |
- gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), |
|
163 |
- ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
|
152 |
+ if (length(chroms2use)==0) { |
|
153 |
+ chrstring <- paste0(chromosomes, collapse=', ') |
|
154 |
+ stop('The specified chromosomes ', chrstring, ' do not exist in the data.') |
|
155 |
+ } |
|
156 |
+ gr <- GenomicRanges::GRanges(seqnames=Rle(chroms2use), ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use])) |
|
164 | 157 |
if (calc.complexity || !remove.duplicate.reads) { |
165 | 158 |
if (pairedEndReads) { |
166 | 159 |
data <- GenomicAlignments::readGAlignmentPairsFromBam(file, index=index, param=ScanBamParam(which=range(gr))) |
... | ... |
@@ -202,12 +195,12 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l |
202 | 195 |
diff <- setdiff(chromosomes, chroms.in.data) |
203 | 196 |
if (length(diff)>0) { |
204 | 197 |
diffs <- paste0(diff, collapse=', ') |
205 |
- warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data')) |
|
198 |
+ warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.')) |
|
206 | 199 |
} |
207 | 200 |
diff <- setdiff(chromosomes, names(chrom.lengths)) |
208 | 201 |
if (length(diff)>0) { |
209 | 202 |
diffs <- paste0(diff, collapse=', ') |
210 |
- warning(paste0('Not using chromosomes ', diffs, ' because no lengths could be found')) |
|
203 |
+ warning(paste0('Not using chromosomes ', diffs, ' because no lengths could be found.')) |
|
211 | 204 |
} |
212 | 205 |
chroms2use <- intersect(chromosomes, chroms.in.data) |
213 | 206 |
chroms2use <- intersect(chroms2use, names(chrom.lengths)) |
... | ... |
@@ -282,11 +275,13 @@ align2binned <- function(file, format, index=file, pairedEndReads=FALSE, chrom.l |
282 | 275 |
|
283 | 276 |
## Complexity estimation with preseqR |
284 | 277 |
complexity.preseqR.fit <- preseqR::preseqR.rfa.curve(multiplicity[['1']]) |
285 |
- complexity.preseqR <- as.numeric(preseqR::preseqR.rfa.estimate(complexity.preseqR.fit$continued.fraction, 1e100) + total.reads.unique['1']) |
|
286 |
- if (!is.null(complexity.preseqR.fit)) { |
|
287 |
- complexity.preseqR.ggplt <- ggplot(as.data.frame(complexity.preseqR.fit$estimates)) + geom_line(aes_string(x='sample.size', y='yield.estimate')) + geom_point(data=df, aes_string(x='x', y='y'), size=5) + ggtitle('preseqR complexity estimation') + theme_bw() + xlab('total number of reads') + ylab('unique reads') |
|
288 |
- } else { |
|
278 |
+ if (is.null(complexity.preseqR.fit)) { |
|
279 |
+ complexity.preseqR <- NA |
|
280 |
+ complexity.preseqR.ggplt <- NA |
|
289 | 281 |
warning("Complexity estimation with preseqR failed.") |
282 |
+ } else { |
|
283 |
+ complexity.preseqR <- as.numeric(preseqR::preseqR.rfa.estimate(complexity.preseqR.fit$continued.fraction, 1e100) + total.reads.unique['1']) |
|
284 |
+ complexity.preseqR.ggplt <- ggplot(as.data.frame(complexity.preseqR.fit$estimates)) + geom_line(aes_string(x='sample.size', y='yield.estimate')) + geom_point(data=df, aes_string(x='x', y='y'), size=5) + ggtitle('preseqR complexity estimation') + theme_bw() + xlab('total number of reads') + ylab('unique reads') |
|
290 | 285 |
} |
291 | 286 |
|
292 | 287 |
## Complexity estimation with Michaelis-Menten |
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
#' Find copy number variations |
2 | 2 |
#' |
3 |
-#' \code{findCNVs} classifies the input binned read counts into several states which represent copy-number-variation. |
|
3 |
+#' \code{findCNVs} classifies the binned read counts into several states which represent copy-number-variation. |
|
4 | 4 |
#' |
5 | 5 |
#' \code{findCNVs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission density (only zero read counts), 'monosomy', 'disomy', 'trisomy', 'tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper |
6 | 6 |
#' @author Aaron Taudt |
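Since the emission densities are negative binomials, a short reminder of the dnbinom parameterization may help; the helpers dnbinom.size()/dnbinom.prob() used for initialization are not shown in this diff, so the moment inversion below is an assumption of what they compute.

    mu <- 100; v <- 400                     # toy mean and variance of one state
    prob <- mu / v                          # 0.25
    size <- mu^2 / (v - mu)                 # 33.33
    c(mean = size * (1 - prob) / prob,      # recovers 100
      var  = size * (1 - prob) / prob^2)    # recovers 400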
... | ... |
@@ -42,7 +42,7 @@ findCNVs <- function(binned.data, ID, method='univariate', eps=0.001, init="stan |
42 | 42 |
|
43 | 43 |
#' Find copy number variations (univariate) |
44 | 44 |
#' |
45 |
-#' \code{findCNVs} classifies the input binned read counts into several states which represent copy-number-variation. |
|
45 |
+#' \code{findCNVs} classifies the binned read counts into several states which represent copy-number-variation. |
|
46 | 46 |
#' |
47 | 47 |
#' @param binned.data A \link{GRanges} object with binned read counts. |
48 | 48 |
#' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example. |
... | ... |
@@ -149,10 +149,9 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
149 | 149 |
A.initial <- matrix(runif(numstates^2), ncol=numstates) |
150 | 150 |
A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/") |
151 | 151 |
proba.initial <- runif(numstates) |
152 |
- size.initial <- runif(1, min=0, max=100) * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F)) |
|
153 |
- prob.initial <- runif(1) * ifelse(state.distributions=='dnbinom', T, F) |
|
154 |
- prob.initial[state.distributions=='dgeom'] <- runif(1) |
|
155 |
- lambda.initial <- runif(1, min=0, max=100) * cumsum(ifelse(state.distributions=='dpois', T, F)) |
|
152 |
+ # Distributions for dependent states |
|
153 |
+ size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask) |
|
154 |
+ prob.initial <- runif(1) * dependent.states.mask |
|
156 | 155 |
} else if (init == 'standard') { |
157 | 156 |
A.initial <- matrix(NA, ncol=numstates, nrow=numstates) |
158 | 157 |
for (irow in 1:numstates) { |
... | ... |
@@ -162,18 +161,13 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
162 | 161 |
} |
163 | 162 |
} |
164 | 163 |
proba.initial <- rep(1/numstates, numstates) |
165 |
- mean.initial <- mean(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F)) |
|
166 |
- var.initial <- var(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dnbinom' | state.distributions=='dbinom', T, F)) |
|
164 |
+ mean.initial <- mean(reads[reads>0])/2 * cumsum(dependent.states.mask) |
|
165 |
+ var.initial <- var(reads[reads>0])/2 * cumsum(dependent.states.mask) |
|
167 | 166 |
size.initial <- rep(0,numstates) |
168 | 167 |
prob.initial <- rep(0,numstates) |
169 |
- mask <- state.distributions=='dnbinom' |
|
168 |
+ mask <- dependent.states.mask |
|
170 | 169 |
size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask]) |
171 | 170 |
prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask]) |
172 |
- mask <- state.distributions=='dbinom' |
|
173 |
- size.initial[mask] <- dbinom.size(mean.initial[mask], var.initial[mask]) |
|
174 |
- prob.initial[mask] <- dbinom.prob(mean.initial[mask], var.initial[mask]) |
|
175 |
- prob.initial[state.distributions=='dgeom'] <- 0.9 |
|
176 |
- lambda.initial <- mean(reads[reads>0])/2 * cumsum(ifelse(state.distributions=='dpois', T, F)) |
|
177 | 171 |
} |
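A sketch of what cumsum() over the dependent-states mask does to the initial means; dependent.states.mask is defined in global.R and not shown here, so the vector below is an assumption matching the documented state order (nullsomy through multisomy).

    dependent.states.mask <- c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE)   # assumed order
    mu <- 150                                # stands in for mean(reads[reads>0])
    mu/2 * cumsum(dependent.states.mask)     # 0  75 150 225 300 375
    ## 'monosomy' starts at mu/2, 'disomy' at mu, 'trisomy' at 3*mu/2, ...
    ## so 'disomy' is initialized near mean(reads), as stated in the documentation.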
178 | 172 |
|
179 | 173 |
hmm <- .C("R_univariate_hmm", |
... | ... |
@@ -183,7 +177,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
183 | 177 |
state.labels = as.integer(state.labels), # int* state_labels |
184 | 178 |
size = double(length=numstates), # double* size |
185 | 179 |
prob = double(length=numstates), # double* prob |
186 |
- lambda = double(length=numstates), # double* lambda |
|
187 | 180 |
num.iterations = as.integer(max.iter), # int* maxiter |
188 | 181 |
time.sec = as.integer(max.time), # double* maxtime |
189 | 182 |
loglik.delta = as.double(eps.try), # double* eps |
... | ... |
@@ -195,7 +188,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
195 | 188 |
distr.type = as.integer(state.distributions), # int* distr_type |
196 | 189 |
size.initial = as.vector(size.initial), # double* initial_size |
197 | 190 |
prob.initial = as.vector(prob.initial), # double* initial_prob |
198 |
- lambda.initial = as.vector(lambda.initial), # double* initial_lambda |
|
199 | 191 |
A.initial = as.vector(A.initial), # double* initial_A |
200 | 192 |
proba.initial = as.vector(proba.initial), # double* initial_proba |
201 | 193 |
use.initial.params = as.logical(1), # bool* use_initial_params |
... | ... |
@@ -242,7 +234,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
242 | 234 |
state.labels = as.integer(state.labels), # int* state_labels |
243 | 235 |
size = double(length=numstates), # double* size |
244 | 236 |
prob = double(length=numstates), # double* prob |
245 |
- lambda = double(length=numstates), # double* lambda |
|
246 | 237 |
num.iterations = as.integer(max.iter), # int* maxiter |
247 | 238 |
time.sec = as.integer(max.time), # double* maxtime |
248 | 239 |
loglik.delta = as.double(eps), # double* eps |
... | ... |
@@ -254,7 +245,6 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
254 | 245 |
distr.type = as.integer(state.distributions), # int* distr_type |
255 | 246 |
size.initial = as.vector(hmm$size), # double* initial_size |
256 | 247 |
prob.initial = as.vector(hmm$prob), # double* initial_prob |
257 |
- lambda.initial = as.vector(hmm$lambda), # double* initial_lambda |
|
258 | 248 |
A.initial = as.vector(hmm$A), # double* initial_A |
259 | 249 |
proba.initial = as.vector(hmm$proba), # double* initial_proba |
260 | 250 |
use.initial.params = as.logical(1), # bool* use_initial_params |
... | ... |
@@ -310,20 +300,17 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
310 | 300 |
for (idistr in 1:length(hmm$distr.type)) { |
311 | 301 |
distr <- levels(state.distributions)[hmm$distr.type[idistr]] |
312 | 302 |
if (distr == 'dnbinom') { |
313 |
- distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
314 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
303 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
304 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
315 | 305 |
} else if (distr == 'dgeom') { |
316 |
- distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], lambda=NA, mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr]))) |
|
317 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], lambda=NA, mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr]))) |
|
306 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr]))) |
|
307 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr]))) |
|
318 | 308 |
} else if (distr == 'delta') { |
319 |
- distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
320 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
321 |
- } else if (distr == 'dpois') { |
|
322 |
- distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda[idistr], mu=hmm$lambda[idistr], variance=hmm$lambda[idistr])) |
|
323 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda.initial[idistr], mu=hmm$lambda.initial[idistr], variance=hmm$lambda.initial[idistr])) |
|
309 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0)) |
|
310 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0)) |
|
324 | 311 |
} else if (distr == 'dbinom') { |
325 |
- distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
326 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
312 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
313 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
327 | 314 |
} |
328 | 315 |
} |
329 | 316 |
rownames(distributions) <- state.labels |
... | ... |
@@ -426,7 +413,7 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max. |
426 | 413 |
|
427 | 414 |
## Pre-compute z-values for each number of reads |
428 | 415 |
message("Computing pre z-matrix...", appendLF=F) |
429 |
- z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states)) |
|
416 |
+ z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states), dimnames=list(reads=0:maxreads, model=c('+','-'), state=state.labels)) |
|
430 | 417 |
xreads <- 0:maxreads |
431 | 418 |
for (istrand in 1:num.models) { |
432 | 419 |
model <- models[[istrand]] |
... | ... |
@@ -437,6 +424,9 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max. |
437 | 424 |
u <- pnbinom(xreads, size, prob) |
438 | 425 |
} else if (model$distributions[istate,'type']=='delta') { |
439 | 426 |
u <- rep(1, length(xreads)) |
427 |
+ } else if (model$distributions[istate,'type']=='dgeom') { |
|
428 |
+ prob <- model$distributions[istate,'prob'] |
|
429 |
+ u <- pgeom(xreads, prob) |
|
440 | 430 |
} |
441 | 431 |
qnorm_u <- qnorm(u) |
442 | 432 |
mask <- qnorm_u==Inf |
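The loop above applies a probability-integral transform: read counts are passed through the marginal CDF of the fitted emission density and then through qnorm() to obtain standard-normal z-values. A toy example of the same steps (parameter values are made up):

    x <- 0:10
    u <- pnbinom(x, size = 2, prob = 0.3)    # marginal CDF of one emission density
    z <- qnorm(u)
    z[z == Inf] <- qnorm(1 - 1e-16)          # guard against infinite quantiles (cf. the mask above)
    round(z, 2)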
... | ... |
@@ -478,7 +468,7 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max. |
478 | 468 |
correlationMatrix[,,state] <- cor(z.temp) |
479 | 469 |
determinant[state] <- det( correlationMatrix[,,state] ) |
480 | 470 |
correlationMatrixInverse[,,state] <- solve(correlationMatrix[,,state]) |
481 |
- if (length(z.temp)>0) { |
|
471 |
+ if (!any(is.na(correlationMatrixInverse[,,state]))) { |
|
482 | 472 |
usestateTF[state] <- TRUE |
483 | 473 |
} else { |
484 | 474 |
usestateTF[state] <- FALSE |
485 | 475 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,792 @@ |
1 |
+#' Find sister chromatid exchanges |
|
2 |
+#' |
|
3 |
+#' \code{findSCEs} classifies the binned read counts into several states which represent the number of chromatids on each strand. |
|
4 |
+#' |
|
5 |
+#' \code{findSCEs} uses a 6-state Hidden Markov Model to classify the binned read counts: state 'nullsomy' with a delta function as emission density (only zero read counts), 'monosomy', 'disomy', 'trisomy', 'tetrasomy' and 'multisomy' with negative binomials (see \code{\link{dnbinom}}) as emission densities. A Baum-Welch algorithm is employed to estimate the parameters of the distributions. See our paper for a detailed description of the method. TODO: insert paper |
|
6 |
+#' @author Aaron Taudt |
|
7 |
+#' @inheritParams univariate.findSCEs |
|
8 |
+#' @return An \code{\link{aneuBiHMM}} object. |
|
9 |
+#' @examples |
|
10 |
+#'## Get an example BAM file with single-cell-sequencing reads |
|
11 |
+#'bamfile <- system.file("extdata/BB140820_I_002.bam", package="aneufinder") |
|
12 |
+#'## Bin the BAM file into bin size 200000bp |
|
13 |
+#'binned.data <- bam2binned(bamfile, binsize=200000, chromosomes=c(1:22,'X','Y'), GC.correction=FALSE, |
|
14 |
+#' save.as.RData=FALSE) |
|
15 |
+#'## Fit the Hidden Markov Model |
|
16 |
+#'model <- findSCEs(binned.data, ID=basename(bamfile), eps=0.1, max.time=60) |
|
17 |
+#'## Check the fit |
|
18 |
+#'plot(model, type='histogram') |
|
19 |
+#' @export |
|
20 |
+findSCEs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=10, eps.try=10*eps, num.threads=1, read.cutoff.quantile=0.999, GC.correction=TRUE, strand='*', allow.odd.states=FALSE) { |
|
21 |
+ |
|
22 |
+ call <- match.call() |
|
23 |
+ underline <- paste0(rep('=',sum(nchar(call[[1]]))+3), collapse='') |
|
24 |
+ message("\n",call[[1]],"():") |
|
25 |
+ message(underline) |
|
26 |
+ ptm <- proc.time() |
|
27 |
+ message("Find CNVs for ID = ",ID, ":") |
|
28 |
+ |
|
29 |
+ model <- bivariate.findSCEs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction, allow.odd.states=allow.odd.states) |
|
30 |
+ |
|
31 |
+ attr(model, 'call') <- call |
|
32 |
+ time <- proc.time() - ptm |
|
33 |
+ message("Time spent in ", call[[1]],"(): ",round(time[3],2),"s") |
|
34 |
+ return(model) |
|
35 |
+ |
|
36 |
+} |
|
37 |
+ |
|
38 |
+ |
|
39 |
+#' Find sister chromatid exchanges (univariate) |
|
40 |
+#' |
|
41 |
+#' \code{findSCEs} classifies the binned read counts into several states which represent the number of chromatids on each strand. |
|
42 |
+#' |
|
43 |
+#' @param binned.data A \link{GRanges} object with binned read counts. |
|
44 |
+#' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example. |
|
45 |
+#' @param eps Convergence threshold for the Baum-Welch algorithm. |
|
46 |
+#' @param init One of the following initialization procedures: |
|
47 |
+#' \describe{ |
|
48 |
+#' \item{\code{standard}}{The negative binomial of state 'disomy' will be initialized with \code{mean=mean(reads)}, \code{var=var(reads)}. This procedure usually gives good convergence.} |
|
49 |
+#' \item{\code{random}}{Mean and variance of the negative binomial of state 'disomy' will be initialized with random values (in certain boundaries, see source code). Try this if the \code{standard} procedure fails to produce a good fit.} |
|
50 |
+#' } |
|
51 |
+#' @param max.time The maximum running time in seconds for the Baum-Welch algorithm. If this time is reached, the Baum-Welch will terminate after the current iteration finishes. The default -1 is no limit. |
|
52 |
+#' @param max.iter The maximum number of iterations for the Baum-Welch algorithm. The default -1 is no limit. |
|
53 |
+#' @param num.trials The number of trials to find a fit where state 'disomy' is most frequent. Each time, the HMM is seeded with different random initial values. |
|
54 |
+#' @param eps.try If \code{num.trials} is set to greater than 1, \code{eps.try} is used for the trial runs. If unset, \code{eps} is used. |
|
55 |
+#' @param num.threads Number of threads to use. Setting this to >1 may give increased performance. |
|
56 |
+#' @param read.cutoff.quantile A quantile between 0 and 1. Should be near 1. Read counts above this quantile will be set to the read count specified by this quantile. Filtering very high read counts increases the performance of the Baum-Welch fitting procedure. However, if your data contains very few peaks they might be filtered out. Set \code{read.cutoff.quantile=1} in this case. |
|
57 |
+#' @param GC.correction Either \code{TRUE} or \code{FALSE}. If \code{GC.correction=TRUE}, the GC corrected reads have to be present in the input \code{binned.data}, otherwise a warning is thrown and no GC correction is done. |
|
58 |
+#' @param strand Run the HMM only for the specified strand. One of \code{c('+', '-', '*')}. |
|
59 |
+#' @return An \code{\link{aneuHMM}} object. |
|
60 |
+univariate.findSCEs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff.quantile=0.999, GC.correction=TRUE, strand='*') { |
|
61 |
+ |
|
62 |
+ ### Define cleanup behaviour ### |
|
63 |
+ on.exit(.C("R_univariate_cleanup")) |
|
64 |
+ |
|
65 |
+ ## Intercept user input |
|
66 |
+ IDcheck <- ID #trigger error if not defined |
|
67 |
+ if (class(binned.data) != 'GRanges') { |
|
68 |
+ binned.data <- get(load(binned.data)) |
|
69 |
+ if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'reads' or a file that contains such an object") |
|
70 |
+ } |
|
71 |
+ if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric") |
|
72 |
+ if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer") |
|
73 |
+ if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer") |
|
74 |
+ if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer") |
|
75 |
+ if (!is.null(eps.try)) { |
|
76 |
+ if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric") |
|
77 |
+ } |
|
78 |
+ if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer") |
|
79 |
+ if (check.logical(GC.correction)!=0) stop("argument 'GC.correction' expects a logical (TRUE or FALSE)") |
|
80 |
+ if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'") |
|
81 |
+ |
|
82 |
+ warlist <- list() |
|
83 |
+ if (is.null(eps.try)) eps.try <- eps |
|
84 |
+ |
|
85 |
+ ## Assign variables |
|
86 |
+# state.labels.SCE # assigned in global.R |
|
87 |
+ numstates <- length(state.labels.SCE) |
|
88 |
+ numbins <- length(binned.data) |
|
89 |
+ iniproc <- which(init==c("standard","random")) # transform to int |
|
90 |
+ if (strand=='+') { |
|
91 |
+ select <- 'preads' |
|
92 |
+ } else if (strand=='-') { |
|
93 |
+ select <- 'mreads' |
|
94 |
+ } else if (strand=='*') { |
|
95 |
+ select <- 'reads' |
|
96 |
+ } |
|
97 |
+ if (GC.correction) { |
|
98 |
+ if (!(paste0(select,'.gc') %in% names(mcols(binned.data)))) { |
|
99 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction.")) |
|
100 |
+ } else if (any(is.na(mcols(binned.data)[,paste0(select,'.gc')]))) { |
|
101 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because there are NAs. GC-correction may not be reliable. Continuing without GC-correction.")) |
|
102 |
+ } else if (any(mcols(binned.data)[,paste0(select,'.gc')]<0)) { |
|
103 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because there are negative values. GC-correction is not reliable. Continuing without GC-correction.")) |
|
104 |
+ } else { |
|
105 |
+ select <- paste0(select,'.gc') |
|
106 |
+ } |
|
107 |
+ } |
|
108 |
+ reads <- mcols(binned.data)[,select] |
|
109 |
+ |
|
110 |
+ ## Make return object |
|
111 |
+ result <- list() |
|
112 |
+ class(result) <- class.univariate.hmm |
|
113 |
+ result$ID <- ID |
|
114 |
+ result$bins <- binned.data |
|
115 |
+ |
|
116 |
+ # Check if there are reads in the data, otherwise HMM will blow up |
|
117 |
+ if (!any(reads!=0)) { |
|
118 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All reads in data are zero. No HMM done.")) |
|
119 |
+ result$warnings <- warlist |
|
120 |
+ return(result) |
|
121 |
+ } else if (any(reads<0)) { |
|
122 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some reads in data are negative. No HMM done.")) |
|
123 |
+ result$warnings <- warlist |
|
124 |
+ return(result) |
|
125 |
+ } |
|
126 |
+ |
|
127 |
+ |
|
128 |
+ # Filter high reads out, makes HMM faster |
|
129 |
+ read.cutoff <- quantile(reads, read.cutoff.quantile) |
|
130 |
+ names.read.cutoff <- names(read.cutoff) |
|
131 |
+ read.cutoff <- ceiling(read.cutoff) |
|
132 |
+ mask <- reads > read.cutoff |
|
133 |
+ reads[mask] <- read.cutoff |
|
134 |
+ numfiltered <- length(which(mask)) |
|
135 |
+ if (numfiltered > 0) { |
|
136 |
+ message(paste0("Replaced read counts > ",read.cutoff," (",names.read.cutoff," quantile) by ",read.cutoff," in ",numfiltered," bins. Set option 'read.cutoff.quantile=1' to disable this filtering. This filtering was done to increase the speed of the HMM and should not affect the results.\n")) |
|
137 |
+ } |
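A toy example of the capping controlled by 'read.cutoff.quantile' (made-up counts): values above the chosen quantile are replaced by it, no bins are removed.

    set.seed(1)
    reads  <- c(rpois(999, 10), 5000)          # one extreme outlier bin
    cutoff <- ceiling(quantile(reads, 0.999))
    reads[reads > cutoff] <- cutoff
    max(reads)   # equals the cutoff instead of 5000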
|
138 |
+ |
|
139 |
+ ## Call univariate in a for loop to enable multiple trials |
|
140 |
+ modellist <- list() |
|
141 |
+ for (i_try in 1:num.trials) { |
|
142 |
+ message(paste0("Trial ",i_try," / ",num.trials)) |
|
143 |
+ |
|
144 |
+ ## Initial parameters |
|
145 |
+ if (init == 'random') { |
|
146 |
+ A.initial <- matrix(runif(numstates^2), ncol=numstates) |
|
147 |
+ A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/") |
|
148 |
+ proba.initial <- runif(numstates) |
|
149 |
+ # Distributions for dependent states |
|
150 |
+ size.initial <- runif(1, min=0, max=100) * cumsum(dependent.states.mask.SCE) |
|
151 |
+ prob.initial <- runif(1) * dependent.states.mask.SCE |
|
152 |
+ } else if (init == 'standard') { |
|
153 |
+ A.initial <- matrix(NA, ncol=numstates, nrow=numstates) |
|
154 |
+ for (irow in 1:numstates) { |
|
155 |
+ for (icol in 1:numstates) { |
|
156 |
+ if (irow==icol) { A.initial[irow,icol] <- 0.9 } |
|
157 |
+ else { A.initial[irow,icol] <- 0.1/(numstates-1) } |
|
158 |
+ } |
|
159 |
+ } |
|
160 |
+ proba.initial <- rep(1/numstates, numstates) |
|
161 |
+ mean.initial <- mean(reads[reads>0]) * cumsum(dependent.states.mask.SCE) |
|
162 |
+ var.initial <- var(reads[reads>0]) * cumsum(dependent.states.mask.SCE) |
|
163 |
+ size.initial <- rep(0,numstates) |
|
164 |
+ prob.initial <- rep(0,numstates) |
|
165 |
+ mask <- dependent.states.mask.SCE |
|
166 |
+ size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask]) |
|
167 |
+ prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask]) |
|
168 |
+ } |
|
169 |
+ # Assign initials for the nullsomy distribution |
|
170 |
+ size.initial[2] <- 1 |
|
171 |
+ prob.initial[2] <- 0.5 |
|
172 |
+ |
|
173 |
+ hmm <- .C("R_univariate_hmm", |
|
174 |
+ reads = as.integer(reads), # int* O |
|
175 |
+ num.bins = as.integer(numbins), # int* T |
|
176 |
+ num.states = as.integer(numstates), # int* N |
|
177 |
+ state.labels.SCE = as.integer(state.labels.SCE), # int* state.labels.SCE |
|
178 |
+ size = double(length=numstates), # double* size |
|
179 |
+ prob = double(length=numstates), # double* prob |
|
180 |
+ num.iterations = as.integer(max.iter), # int* maxiter |
|
181 |
+ time.sec = as.integer(max.time), # double* maxtime |
|
182 |
+ loglik.delta = as.double(eps.try), # double* eps |
|
183 |
+ states = integer(length=numbins), # int* states |
|
184 |
+ A = double(length=numstates*numstates), # double* A |
|
185 |
+ proba = double(length=numstates), # double* proba |
|
186 |
+ loglik = double(length=1), # double* loglik |
|
187 |
+ weights = double(length=numstates), # double* weights |
|
188 |
+ distr.type = as.integer(state.distributions.SCE), # int* distr_type |
|
189 |
+ size.initial = as.vector(size.initial), # double* initial_size |
|
190 |
+ prob.initial = as.vector(prob.initial), # double* initial_prob |
|
191 |
+ A.initial = as.vector(A.initial), # double* initial_A |
|
192 |
+ proba.initial = as.vector(proba.initial), # double* initial_proba |
|
193 |
+ use.initial.params = as.logical(1), # bool* use_initial_params |
|
194 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
195 |
+ error = as.integer(0), # int* error (error handling) |
|
196 |
+ read.cutoff = as.integer(read.cutoff) # int* read_cutoff |
|
197 |
+ ) |
|
198 |
+ |
|
199 |
+ hmm$eps <- eps.try |
|
200 |
+ if (num.trials > 1) { |
|
201 |
+ if (hmm$loglik.delta > hmm$eps) { |
|
202 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n")) |
|
203 |
+ } |
|
204 |
+ # Store model in list |
|
205 |
+ hmm$reads <- NULL |
|
206 |
+ modellist[[i_try]] <- hmm |
|
207 |
+ # Check if monosomic is more frequent than trisomic |
|
208 |
+ imono <- which(state.labels.SCE=='monosomy') |
|
209 |
+ itri <- which(state.labels.SCE=='trisomy') |
|
210 |
+ if (hmm$weights[imono]>hmm$weights[itri]) { |
|
211 |
+ break |
|
212 |
+ } |
|
213 |
+ init <- 'random' |
|
214 |
+ } else if (num.trials == 1) { |
|
215 |
+ if (hmm$loglik.delta > eps) { |
|
216 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n")) |
|
217 |
+ } |
|
218 |
+ } |
|
219 |
+ } |
|
220 |
+ |
|
221 |
+ if (num.trials > 1) { |
|
222 |
+ |
|
223 |
+ # Select fit with highest weight in state disomy |
|
224 |
+ # Mathematically we should select the fit with the highest loglikelihood. If we think the fit with the highest loglikelihood is incorrect, we should change the underlying model. However, this is very complex and we choose to select a fit that we think is (more) correct, although it does not have the highest support given our (imperfect) model. |
|
225 |
+ istate <- which(state.labels.SCE=='disomy') |
|
226 |
+ indexmax <- which.max(unlist(lapply(lapply(modellist,'[[','weights'), '[', istate))) |
|
227 |
+ hmm <- modellist[[indexmax]] |
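A toy illustration of the selection just above: among the trial fits, the one with the largest weight for state 'disomy' wins (weight vectors below are made up).

    modellist <- list(list(weights = c(0.10, 0.30, 0.60)),
                      list(weights = c(0.05, 0.70, 0.25)))
    istate <- 2                                   # position of 'disomy' in this toy
    which.max(unlist(lapply(lapply(modellist, '[[', 'weights'), '[', istate)))   # -> 2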
|
228 |
+ |
|
229 |
+ # Rerun the HMM with different epsilon and initial parameters from trial run |
|
230 |
+ message(paste0("Rerunning trial ",indexmax," with eps = ",eps)) |
|
231 |
+ hmm <- .C("R_univariate_hmm", |
|
232 |
+ reads = as.integer(reads), # int* O |
|
233 |
+ num.bins = as.integer(numbins), # int* T |
|
234 |
+ num.states = as.integer(numstates), # int* N |
|
235 |
+ state.labels.SCE = as.integer(state.labels.SCE), # int* state.labels.SCE |
|
236 |
+ size = double(length=numstates), # double* size |
|
237 |
+ prob = double(length=numstates), # double* prob |
|
238 |
+ num.iterations = as.integer(max.iter), # int* maxiter |
|
239 |
+ time.sec = as.integer(max.time), # double* maxtime |
|
240 |
+ loglik.delta = as.double(eps), # double* eps |
|
241 |
+ states = integer(length=numbins), # int* states |
|
242 |
+ A = double(length=numstates*numstates), # double* A |
|
243 |
+ proba = double(length=numstates), # double* proba |
|
244 |
+ loglik = double(length=1), # double* loglik |
|
245 |
+ weights = double(length=numstates), # double* weights |
|
246 |
+ distr.type = as.integer(state.distributions.SCE), # int* distr_type |
|
247 |
+ size.initial = as.vector(hmm$size), # double* initial_size |
|
248 |
+ prob.initial = as.vector(hmm$prob), # double* initial_prob |
|
249 |
+ A.initial = as.vector(hmm$A), # double* initial_A |
|
250 |
+ proba.initial = as.vector(hmm$proba), # double* initial_proba |
|
251 |
+ use.initial.params = as.logical(1), # bool* use_initial_params |
|
252 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
253 |
+ error = as.integer(0), # int* error (error handling) |
|
254 |
+ read.cutoff = as.integer(read.cutoff) # int* read_cutoff |
|
255 |
+ ) |
|
256 |
+ } |
|
257 |
+ |
|
258 |
+ ### Make return object ### |
|
259 |
+ ## Check for errors |
|
260 |
+ if (hmm$error == 0) { |
|
261 |
+ ## Bin coordinates and states ### |
|
262 |
+ result$bins$state <- state.labels.SCE[hmm$states] |
|
263 |
+ ## Segmentation |
|
264 |
+ message("Making segmentation ...", appendLF=F) |
|
265 |
+ ptm <- proc.time() |
|
266 |
+ gr <- result$bins |
|
267 |
+ red.gr.list <- GRangesList() |
|
268 |
+ for (state in state.labels.SCE) { |
|
269 |
+ red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
270 |
+ mcols(red.gr)$state <- rep(factor(state, levels=levels(state.labels.SCE)),length(red.gr)) |
|
271 |
+ if (length(red.gr) > 0) { |
|
272 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
273 |
+ } |
|
274 |
+ } |
|
275 |
+ red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
276 |
+ result$segments <- red.gr |
|
277 |
+ seqlengths(result$segments) <- seqlengths(binned.data) |
|
278 |
+ time <- proc.time() - ptm |
|
279 |
+ message(" ",round(time[3],2),"s") |
|
280 |
+ ## Parameters |
|
281 |
+ # Weights |
|
282 |
+ result$weights <- hmm$weights |
|
283 |
+ names(result$weights) <- state.labels.SCE |
|
284 |
+ # Transition matrices |
|
285 |
+ transitionProbs <- matrix(hmm$A, ncol=hmm$num.states) |
|
286 |
+ rownames(transitionProbs) <- state.labels.SCE |
|
287 |
+ colnames(transitionProbs) <- state.labels.SCE |
|
288 |
+ result$transitionProbs <- transitionProbs |
|
289 |
+ transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states) |
|
290 |
+ rownames(transitionProbs.initial) <- state.labels.SCE |
|
291 |
+ colnames(transitionProbs.initial) <- state.labels.SCE |
|
292 |
+ result$transitionProbs.initial <- transitionProbs.initial |
|
293 |
+ # Initial probs |
|
294 |
+ result$startProbs <- hmm$proba |
|
295 |
+ names(result$startProbs) <- paste0("P(",state.labels.SCE,")") |
|
296 |
+ result$startProbs.initial <- hmm$proba.initial |
|
297 |
+ names(result$startProbs.initial) <- paste0("P(",state.labels.SCE,")") |
|
298 |
+ # Distributions |
|
299 |
+ distributions <- data.frame() |
|
300 |
+ distributions.initial <- data.frame() |
|
301 |
+ for (idistr in 1:length(hmm$distr.type)) { |
|
302 |
+ distr <- levels(state.distributions.SCE)[hmm$distr.type[idistr]] |
|
303 |
+ if (distr == 'dnbinom') { |
|
304 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
305 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
306 |
+ } else if (distr == 'dgeom') { |
|
307 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr]))) |
|
308 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr]))) |
|
309 |
+ } else if (distr == 'delta') { |
|
310 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0)) |
|
311 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0)) |
|
312 |
+ } else if (distr == 'dbinom') { |
|
313 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
314 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
315 |
+ } |
|
316 |
+ } |
|
317 |
+ rownames(distributions) <- state.labels.SCE |
|
318 |
+ rownames(distributions.initial) <- state.labels.SCE |
|
319 |
+ result$distributions <- distributions |
|
320 |
+ result$distributions.initial <- distributions.initial |
|
321 |
+ ## Convergence info |
|
322 |
+ convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error) |
|
323 |
+ result$convergenceInfo <- convergenceInfo |
|
324 |
+ ## GC correction |
|
325 |
+ result$GC.correction <- grepl('gc', select) |
|
326 |
+ ## Quality info |
|
327 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(reads), spikyness=qc.spikyness(reads), complexity=attr(result$bins, 'complexity.preseqR')) |
|
328 |
+ result$qualityInfo <- qualityInfo |
|
329 |
+ } else if (hmm$error == 1) { |
|
330 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'read.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
|
331 |
+ } else if (hmm$error == 2) { |
|
332 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library")) |
|
333 |
+ } |
|
334 |
+ |
|
335 |
+ ## Issue warnings |
|
336 |
+ result$warnings <- warlist |
|
337 |
+ |
|
338 |
+ ## Return results |
|
339 |
+ return(result) |
|
340 |
+} |
|
341 |
+ |
|
342 |
+ |
|
343 |
+bivariate.findSCEs <- function(binned.data, ID, eps=0.001, init="standard", max.time=-1, max.iter=-1, num.trials=1, eps.try=NULL, num.threads=1, read.cutoff.quantile=0.999, GC.correction=TRUE, allow.odd.states=FALSE) { |
|
344 |
+ |
|
345 |
+ ## Debugging |
|
346 |
+# ID = 'test' |
|
347 |
+# eps = 1 |
|
348 |
+# init='standard' |
|
349 |
+# max.time=-1 |
|
350 |
+# max.iter=-1 |
|
351 |
+# num.trials=1 |
|
352 |
+# eps.try=NULL |
|
353 |
+# num.threads=1 |
|
354 |
+# read.cutoff.quantile=0.999 |
|
355 |
+# GC.correction=FALSE |
|
356 |
+ |
|
357 |
+ ## Intercept user input |
|
358 |
+ IDcheck <- ID #trigger error if not defined |
|
359 |
+ if (class(binned.data) != 'GRanges') { |
|
360 |
+ binned.data <- get(load(binned.data)) |
|
361 |
+ if (class(binned.data) != 'GRanges') stop("argument 'binned.data' expects a GRanges with meta-column 'reads' or a file that contains such an object") |
|
362 |
+ } |
|
363 |
+ if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric") |
|
364 |
+ if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer") |
|
365 |
+ if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer") |
|
366 |
+ if (check.positive.integer(num.trials)!=0) stop("argument 'num.trials' expects a positive integer") |
|
367 |
+ if (!is.null(eps.try)) { |
|
368 |
+ if (check.positive(eps.try)!=0) stop("argument 'eps.try' expects a positive numeric") |
|
369 |
+ } |
|
370 |
+ if (check.positive.integer(num.threads)!=0) stop("argument 'num.threads' expects a positive integer") |
|
371 |
+ if (check.logical(GC.correction)!=0) stop("argument 'GC.correction' expects a logical (TRUE or FALSE)") |
|
372 |
+ |
|
373 |
+ war <- NULL |
|
374 |
+ if (is.null(eps.try)) eps.try <- eps |
|
375 |
+ |
|
376 |
+ ### Stack the strands and run one univariate findSCEs |
|
377 |
+ message("") |
|
378 |
+ message(paste(rep('-',getOption('width')), collapse='')) |
|
379 |
+ message("Running univariate") |
|
380 |
+ binned.data.minus <- binned.data |
|
381 |
+ strand(binned.data.minus) <- '-' |
|
382 |
+ binned.data.minus$reads <- binned.data.minus$mreads |
|
383 |
+ binned.data.minus$reads.gc <- binned.data.minus$mreads.gc |
|
384 |
+ binned.data.plus <- binned.data |
|
385 |
+ strand(binned.data.plus) <- '+' |
|
386 |
+ binned.data.plus$reads <- binned.data.plus$preads |
|
387 |
+ binned.data.plus$reads.gc <- binned.data.plus$preads.gc |
|
388 |
+ binned.data.stacked <- c(binned.data.minus, binned.data.plus) |
|
389 |
+ mask.attributes <- c(grep('complexity', names(attributes(binned.data)), value=T), 'spikyness', 'shannon.entropy') |
|
390 |
+ attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes] |
|
391 |
+ |
|
392 |
+ model.stacked <- univariate.findSCEs(binned.data.stacked, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction) |
|
393 |
+ model.minus <- model.stacked |
|
394 |
+ model.minus$bins <- model.minus$bins[strand(model.minus$bins)=='-'] |
|
395 |
+ model.minus$segments <- model.minus$segments[strand(model.minus$segments)=='-'] |
|
396 |
+ model.plus <- model.stacked |
|
397 |
+ model.plus$bins <- model.plus$bins[strand(model.plus$bins)=='+'] |
|
398 |
+ model.plus$segments <- model.plus$segments[strand(model.plus$segments)=='+'] |
|
399 |
+ |
|
400 |
+ models <- list(minus=model.minus, plus=model.plus) |
|
401 |
+ |
|
402 |
+ |
|
403 |
+# ### Split into strands and run univariate findSCEs |
|
404 |
+# message("") |
|
405 |
+# message(paste(rep('-',getOption('width')), collapse='')) |
|
406 |
+# message("Running '-'-strand") |
|
407 |
+# minus.model <- univariate.findSCEs(binned.data, ID, eps=eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction, strand='-') |
|
408 |
+# message("") |
|
409 |
+# message(paste(rep('-',getOption('width')), collapse='')) |
|
410 |
+# message("Running '+'-strand") |
|
411 |
+# plus.model <- univariate.findSCEs(binned.data, ID, eps=10*eps, init=init, max.time=max.time, max.iter=max.iter, num.trials=num.trials, eps.try=eps.try, num.threads=num.threads, read.cutoff.quantile=read.cutoff.quantile, GC.correction=GC.correction, strand='+') |
|
412 |
+# models <- list(plus=plus.model, minus=minus.model) |
|
413 |
+ |
|
414 |
+ ### Prepare the multivariate HMM |
|
415 |
+ message("") |
|
416 |
+ message(paste(rep('-',getOption('width')), collapse='')) |
|
417 |
+ message("Preparing bivariate HMM\n") |
|
418 |
+ |
|
419 |
+ ## Extract reads and other stuff |
|
420 |
+ distributions <- lapply(models, '[[', 'distributions') |
|
421 |
+ weights <- lapply(models, '[[', 'weights') |
|
422 |
+ num.uni.states <- length(models[[1]]$weights) |
|
423 |
+ num.models <- length(models) |
|
424 |
+ num.bins <- length(models[[1]]$bins) |
|
425 |
+ comb.states <- vector() |
|
426 |
+ levels.state <- levels(models[[1]]$bins$state) |
|
427 |
+ for (i1 in 1:length(levels.state)) { |
|
428 |
+ for (i2 in 1:length(levels.state)) { |
|
429 |
+ comb.state <- paste(levels.state[i1], levels.state[i2]) |
|
430 |
+ if (allow.odd.states) { |
|
431 |
+ comb.states[length(comb.states)+1] <- comb.state |
|
432 |
+ } else { |
|
433 |
+ state.multiplicity <- multiplicity.SCE[levels.state[i1]] + multiplicity.SCE[levels.state[i2]] |
|
434 |
+ # Only even states and null-mono states (for sex chromosomes) |
|
435 |
+ if (state.multiplicity %% 2 == 0 | state.multiplicity == 1) { |
|
436 |
+ comb.states[length(comb.states)+1] <- comb.state |
|
437 |
+ } |
|
438 |
+ } |
|
439 |
+ } |
|
440 |
+ } |
|
441 |
+ comb.states <- factor(comb.states, levels=comb.states) |
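A sketch of the combined-state construction above with allow.odd.states=FALSE; multiplicity.SCE comes from global.R and is not shown, so the map below is an assumption:

    multiplicity.SCE <- c(nullsomy = 0, monosomy = 1, disomy = 2, trisomy = 3)
    total <- outer(multiplicity.SCE, multiplicity.SCE, "+")
    keep  <- total %% 2 == 0 | total == 1          # even totals plus the null-mono pairs
    comb  <- outer(names(multiplicity.SCE), names(multiplicity.SCE), paste)
    comb[keep]   # "monosomy monosomy" (total 2) is kept, "monosomy disomy" (total 3) is dropped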
|
442 |
+ states.list <- list() |
|
443 |
+ for (model in models) { states.list[[length(states.list)+1]] <- model$bins$state } |
|
444 |
+ comb.states.per.bin <- factor(do.call(paste, states.list), levels=levels(comb.states)) |
|
445 |
+ num.comb.states <- length(comb.states) |
|
446 |
+ select <- 'reads' |
|
447 |
+ if (GC.correction) { |
|
448 |
+ if (paste0(select,'.gc') %in% names(mcols(binned.data))) { |
|
449 |
+ select <- paste0(select,'.gc') |
|
450 |
+ } else { |
|
451 |
+ warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction.")) |
|
452 |
+ } |
|
453 |
+ } |
|
454 |
+ reads <- matrix(c(mcols(models$minus$bins)[,paste0('m',select)], mcols(models$plus$bins)[,paste0('p',select)]), ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models))) |
|
455 |
+ maxreads <- max(reads) |
|
456 |
+ |
|
457 |
+ ## Pre-compute z-values for each number of reads |
|
458 |
+ message("Computing pre z-matrix...", appendLF=F) |
|
459 |
+ z.per.read <- array(NA, dim=c(maxreads+1, num.models, num.uni.states), dimnames=list(reads=0:maxreads, strand=names(models), state=levels(models[[1]]$bins$state))) |
|
460 |
+ xreads <- 0:maxreads |
|
461 |
+ for (istrand in 1:num.models) { |
|
462 |
+ model <- models[[istrand]] |
|
463 |
+ for (istate in 1:num.uni.states) { |
|
464 |
+ if (model$distributions[istate,'type']=='dnbinom') { |
|
465 |
+ size <- model$distributions[istate,'size'] |
|
466 |
+ prob <- model$distributions[istate,'prob'] |
|
467 |
+ u <- pnbinom(xreads, size, prob) |
|
468 |
+ } else if (model$distributions[istate,'type']=='delta') { |
|
469 |
+ u <- rep(1, length(xreads)) |
|
470 |
+ } else if (model$distributions[istate,'type']=='dgeom') { |
|
471 |
+ prob <- model$distributions[istate,'prob'] |
|
472 |
+ u <- pgeom(xreads, prob) |
|
473 |
+ } |
|
474 |
+ qnorm_u <- qnorm(u) |
|
475 |
+ mask <- qnorm_u==Inf |
|
476 |
+ qnorm_u[mask] <- qnorm(1-1e-16) |
|
477 |
+ z.per.read[ , istrand, istate] <- qnorm_u |
|
478 |
+ } |
|
479 |
+ } |
|
480 |
+ message(" done") |
|
481 |
+ |
|
482 |
+ ## Compute the z matrix |
|
483 |
+ message("Transfering values into z-matrix...", appendLF=F) |
|
484 |
+ z.per.bin = array(NA, dim=c(num.bins, num.models, num.uni.states), dimnames=list(bin=1:num.bins, strand=names(models), state=levels(models[[1]]$bins$state))) |
|
485 |
+ for (istrand in 1:num.models) { |
|
486 |
+ model <- models[[istrand]] |
|
487 |
+ for (istate in 1:num.uni.states) { |
|
488 |
+ z.per.bin[ , istrand, istate] = z.per.read[reads[,istrand]+1, istrand, istate] |
|
489 |
+ } |
|
490 |
+ } |
|
491 |
+ remove(z.per.read) |
|
492 |
+ message(" done") |
|
493 |
+ |
|
494 |
+ ## Calculate correlation matrix |
|
495 |
+ message("Computing inverse of correlation matrix...", appendLF=F) |
|
496 |
+ correlationMatrix <- array(0, dim=c(num.models,num.models,num.comb.states), dimnames=list(strand=names(models), strand=names(models), comb.state=comb.states)) |
|
497 |
+ correlationMatrixInverse <- correlationMatrix |
|
498 |
+ determinant <- rep(0, num.comb.states) |
|
499 |
+ names(determinant) <- comb.states |
|
500 |
+ usestateTF <- rep(NA, num.comb.states) # TRUE, FALSE vector for usable states |
|
501 |
+ names(usestateTF) <- comb.states |
|
502 |
+ for (comb.state in comb.states) { |
|
503 |
+ state <- strsplit(comb.state, ' ')[[1]] |
|
504 |
+ mask <- which(comb.states.per.bin==comb.state) |
|
505 |
+ # Subselect z |
|
506 |
+ z.temp <- matrix(NA, ncol=num.models, nrow=length(mask)) |
|
507 |
+ for (istrand in 1:num.models) { |
|
508 |
+ z.temp[,istrand] <- z.per.bin[mask, istrand, state[istrand]] |
|
509 |
+ } |
|
510 |
+ temp <- tryCatch({ |
|
511 |
+ if (nrow(z.temp) > 1) { |
|
512 |
+ correlationMatrix[,,comb.state] <- cor(z.temp) |
|
513 |
+ determinant[comb.state] <- det( correlationMatrix[,,comb.state] ) |
|
514 |
+ correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state]) |
|
515 |
+ usestateTF[comb.state] <- TRUE |
|
516 |
+ } else { |
|
517 |
+ correlationMatrix[,,comb.state] <- diag(num.models) |
|
518 |
+ determinant[comb.state] <- det( correlationMatrix[,,comb.state] ) |
|
519 |
+ correlationMatrixInverse[,,comb.state] <- solve(correlationMatrix[,,comb.state]) |
|
520 |
+ usestateTF[comb.state] <- TRUE |
|
521 |
+ } |
|
522 |
+ }, warning = function(war) { |
|
523 |
+ correlationMatrix[,,comb.state] <<- diag(num.models) |
|
524 |
+ determinant[comb.state] <<- det( correlationMatrix[,,comb.state] ) |
|
525 |
+ correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state]) |
|
526 |
+ usestateTF[comb.state] <<- TRUE |
|
527 |
+ war |
|
528 |
+ }, error = function(err) { |
|
529 |
+ correlationMatrix[,,comb.state] <<- diag(num.models) |
|
530 |
+ determinant[comb.state] <<- det( correlationMatrix[,,comb.state] ) |
|
531 |
+ correlationMatrixInverse[,,comb.state] <<- solve(correlationMatrix[,,comb.state]) |
|
532 |
+ usestateTF[comb.state] <<- TRUE |
|
533 |
+ err |
|
534 |
+ }) |
|
535 |
+ } |
|
536 |
+ message(" done") |
|
537 |
+ |
|
538 |
+ # Use only states with valid correlationMatrixInverse |
|
539 |
+ correlationMatrix = correlationMatrix[,,usestateTF] |
|
540 |
+ correlationMatrixInverse = correlationMatrixInverse[,,usestateTF] |
|
541 |
+ comb.states = comb.states[usestateTF] |
|
542 |
+ comb.states <- droplevels(comb.states) |
|
543 |
+ determinant = determinant[usestateTF] |
|
544 |
+ num.comb.states <- length(comb.states) |
|
545 |
+ |
|
546 |
+ ## Calculate multivariate densities for each state |
|
547 |
+ message("Calculating multivariate densities...", appendLF=F) |
|
548 |
+ densities <- matrix(1, ncol=num.comb.states, nrow=num.bins, dimnames=list(bin=1:num.bins, comb.state=comb.states)) |
|
549 |
+ for (comb.state in comb.states) { |
|
550 |
+ istate <- which(comb.state==comb.states) |
|
551 |
+ state <- strsplit(comb.state, ' ')[[1]] |
|
552 |
+ z.temp <- matrix(NA, ncol=num.models, nrow=num.bins) |
|
553 |
+ product <- 1 |
|
554 |
+ for (istrand in 1:num.models) { |
|
555 |
+ z.temp[,istrand] <- z.per.bin[, istrand, state[istrand]] |
|
556 |
+ if (models[[istrand]]$distributions[state[istrand],'type'] == 'dnbinom') { |
|
557 |
+ size <- models[[istrand]]$distributions[state[istrand],'size'] |
|
558 |
+ prob <- models[[istrand]]$distributions[state[istrand],'prob'] |
|
559 |
+ product <- product * dnbinom(reads[,istrand], size, prob) |
|
560 |
+ } else if (models[[istrand]]$distributions[state[istrand],'type'] == 'dgeom') { |
|
561 |
+ prob <- models[[istrand]]$distributions[state[istrand],'prob'] |
|
562 |
+ product <- product * dgeom(reads[,istrand], prob) |
|
563 |
+ } else if (models[[istrand]]$distributions[state[istrand],'type'] == 'delta') { |
|
564 |
+ product <- product * ifelse(reads[,istrand]==0, 1, 0) |
|
565 |
+ } |
|
566 |
+ } |
|
567 |
+ exponent <- -0.5 * apply( ( z.temp %*% (correlationMatrixInverse[ , , istate] - diag(num.models)) ) * z.temp, 1, sum) |
|
568 |
+ densities[,istate] <- product * determinant[istate]^(-0.5) * exp( exponent ) |
|
569 |
+ } |
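The loop above evaluates, per bin and combined state, the product of the marginal emission densities times the Gaussian-copula correction |S|^(-1/2) * exp(-0.5 * z' (S^-1 - I) z). A toy two-strand example with a made-up correlation matrix:

    S <- matrix(c(1, 0.4, 0.4, 1), 2)    # toy correlation between '-' and '+' strands
    z <- c(0.5, -1.2)                    # z-values of one bin under one combined state
    det(S)^(-0.5) * exp(-0.5 * drop(t(z) %*% (solve(S) - diag(2)) %*% z))
    ## equals 1 when S is the identity, i.e. uncorrelated strands leave the product unchanged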
|
570 |
+ # Check if densities are > 1 |
|
571 |
+# if (any(densities>1)) stop("Densities > 1") |
|
572 |
+# if (any(densities<0)) stop("Densities < 0") |
|
573 |
+ densities[densities>1] <- 1 |
|
574 |
+ densities[densities<0] <- 0 |
|
575 |
+ # Check if densities are 0 everywhere in some bins |
|
576 |
+ check <- which(apply(densities, 1, sum) == 0) |
|
577 |
+ if (length(check)>0) { |
|
578 |
+ if (check[1]==1) { |
|
579 |
+ densities[1,] <- rep(1e-10, ncol(densities)) |
|
580 |
+ check <- check[-1] |
|
581 |
+ } |
|
582 |
+ for (icheck in check) { |
|
583 |
+ densities[icheck,] <- densities[icheck-1,] |
|
584 |
+ } |
|
585 |
+ } |
|
586 |
+ message(" done\n", appendLF=F) |
|
587 |
+ |
|
588 |
+ ### Define cleanup behaviour ### |
|
589 |
+ on.exit(.C("R_multivariate_cleanup", as.integer(num.comb.states))) |
|
590 |
+ |
|
591 |
+ ### Run the multivariate HMM |
|
592 |
+ # Call the C function |
|
593 |
+ use.initial <- FALSE |
|
594 |
+ hmm <- .C("R_multivariate_hmm", |
|
595 |
+ densities = as.double(densities), # double* D |
|
596 |
+ num.bins = as.integer(num.bins), # int* T |
|
597 |
+ num.comb.states = as.integer(num.comb.states), # int* N |
|
598 |
+ num.strands = as.integer(num.models), # int* Nmod |
|
599 |
+ comb.states = as.integer(comb.states), # int* comb_states |
|
600 |
+ num.iterations = as.integer(max.iter), # int* maxiter |
|
601 |
+ time.sec = as.integer(max.time), # double* maxtime |
|
602 |
+ loglik.delta = as.double(eps), # double* eps |
|
603 |
+ states = integer(length=num.bins), # int* states |
|
604 |
+ A = double(length=num.comb.states*num.comb.states), # double* A |
|
605 |
+ proba = double(length=num.comb.states), # double* proba |
|
606 |
+ loglik = double(length=1), # double* loglik |
|
607 |
+ A.initial = double(length=num.comb.states*num.comb.states), # double* initial_A |
|
608 |
+ proba.initial = double(length=num.comb.states), # double* initial_proba |
|
609 |
+ use.initial.params = as.logical(use.initial), # bool* use_initial_params |
|
610 |
+ num.threads = as.integer(num.threads), # int* num_threads |
|
611 |
+ error = as.integer(0) # error handling |
|
612 |
+ ) |
|
613 |
+ |
|
614 |
+ ### Check convergence ### |
|
615 |
+ war <- NULL |
|
616 |
+ if (hmm$loglik.delta > eps) { |
|
617 |
+ war <- warning(paste0("ID = ",ID,": HMM did not converge!\n")) |
|
618 |
+ } |
|
619 |
+ if (hmm$error == 1) { |
|
620 |
+ warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'read.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
|
621 |
+ } else if (hmm$error == 2) { |
|
622 |
+ warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library")) |
|
623 |
+ } |
|
624 |
+ |
|
625 |
+ ### Make return object ### |
|
626 |
+ result <- list() |
|
627 |
+ result$ID <- ID |
|
628 |
+ ## Bin coordinates and states |
|
629 |
+ result$bins <- binned.data |
|
630 |
+ result$bins$state <- comb.states[hmm$states] |
|
631 |
+ # Get states as factors in data.frame |
|
632 |
+ matrix.states <- matrix(unlist(strsplit(as.character(result$bins$state), split=' ')), byrow=T, ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models))) |
|
633 |
+ names <- c('mstate','pstate') |
|
634 |
+ for (i1 in 1:num.models) { |
|
635 |
+ mcols(result$bins)[names[i1]] <- factor(matrix.states[,i1], levels=levels(models[[i1]]$bins$state)) |
|
636 |
+ } |
|
637 |
+ ## Segmentation |
|
638 |
+ message("Making segmentation ...", appendLF=F) |
|
639 |
+ ptm <- proc.time() |
|
640 |
+ gr <- result$bins |
|
641 |
+ red.gr.list <- GRangesList() |
|
642 |
+ for (state in comb.states) { |
|
643 |
+ red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
644 |
+ mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr)) |
|
645 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
646 |
+ } |
|
647 |
+ red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
648 |
+ result$segments <- red.gr |
|
649 |
+ seqlengths(result$segments) <- seqlengths(result$bins) |
|
650 |
+ time <- proc.time() - ptm |
|
651 |
+ message(" ",round(time[3],2),"s") |
|
652 |
+ ## Parameters |
|
653 |
+ # Weights |
|
654 |
+ tstates <- table(result$bins$state) |
|
655 |
+ result$weights <- tstates/sum(tstates) |
|
656 |
+ result$weights.univariate <- weights |
|
657 |
+ # Transition matrices |
|
658 |
+ result$transitionProbs <- matrix(hmm$A, ncol=num.comb.states) |
|
659 |
+ colnames(result$transitionProbs) <- comb.states |
|
660 |
+ rownames(result$transitionProbs) <- comb.states |
|
661 |
+ result$transitionProbs.initial <- matrix(hmm$A.initial, ncol=num.comb.states) |
|
662 |
+ colnames(result$transitionProbs.initial) <- comb.states |
|
663 |
+ rownames(result$transitionProbs.initial) <- comb.states |
|
664 |
+ # Initial probs |
|
665 |
+ result$startProbs <- hmm$proba |
|
666 |
+ names(result$startProbs) <- paste0("P(",comb.states,")") |
|
667 |
+ result$startProbs.initial <- hmm$proba.initial |
|
668 |
+ names(result$startProbs.initial) <- paste0("P(",comb.states,")") |
|
669 |
+ # Distributions |
|
670 |
+ result$distributions <- distributions |
|
671 |
+ names(result$distributions) <- names(models) |
|
672 |
+ ## Convergence info |
|
673 |
+ convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec) |
|
674 |
+ result$convergenceInfo <- convergenceInfo |
|
675 |
+ ## Quality info |
|
676 |
+ qualityInfo <- list(shannon.entropy=qc.entropy(reads), spikyness=qc.spikyness(reads), complexity=attr(result$bins, 'complexity.preseqR')) |
|
677 |
+ result$qualityInfo <- qualityInfo |
|
678 |
+ ## Correlation matrices |
|
679 |
+# result$correlation.matrix <- correlationMatrix2use |
|
680 |
+ ## Add class |
|
681 |
+ class(result) <- class.bivariate.hmm |
|
682 |
+ |
|
683 |
+ return(result) |
|
684 |
+ |
|
685 |
+} |
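The segmentation step above collapses per-bin combined states into runs of identical state with GenomicRanges::reduce(). A minimal standalone sketch of that idea on made-up toy bins (object names and values are illustrative only, not part of the package):

library(GenomicRanges)
# Four adjacent 1kb bins with an invented combined state column
bins <- GRanges(seqnames='chr1',
                ranges=IRanges(start=c(1, 1001, 2001, 3001), width=1000),
                state=factor(c('disomy disomy', 'disomy disomy', 'monosomy disomy', 'monosomy disomy')))
red.gr.list <- GRangesList()
for (state in levels(bins$state)) {
    red.gr <- GenomicRanges::reduce(bins[bins$state==state])   # merge adjacent bins in the same state
    mcols(red.gr)$state <- rep(factor(state, levels=levels(bins$state)), length(red.gr))
    red.gr.list[[length(red.gr.list)+1]] <- red.gr
}
segments <- sort(unlist(red.gr.list))   # yields two segments, one per state run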
|
686 |
+ |
|
687 |
+ |
|
688 |
+#' Filter segments by minimal size |
|
689 |
+#' |
|
690 |
+#' \code{filterSegments} filters out segments below a specified minimal segment size. This can be useful to remove boundary effects introduced by the Hidden Markov approach.
|
691 |
+#' |
|
692 |
+#' @param model An \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} object.
|
693 |
+#' @param min.seg.width The minimum segment width in base-pairs. The default \code{NULL} uses twice the bin size of the given \code{model}.
|
694 |
+#' @return The input \code{model} with adjusted segments. |
|
695 |
+#' @author Aaron Taudt |
|
696 |
+#' @export |
|
697 |
+filterSegments <- function(model, min.seg.width=NULL) { |
|
698 |
+ |
|
699 |
+ if (is.null(min.seg.width)) { |
|
700 |
+ min.seg.width <- 2*width(model$bins)[1] |
|
701 |
+ } |
|
702 |
+ |
|
703 |
+ grl <- GRangesList() |
|
704 |
+ for (chrom in seqlevels(model$segments)) { |
|
705 |
+ gr <- model$segments[seqnames(model$segments)==chrom] |
|
706 |
+ ## At the beginning of each chromosome, assign the state of the following segment to segments smaller than min.seg.width
|
707 |
+ index <- which(width(gr) < min.seg.width) |
|
708 |
+ if (length(index)==0) { |
|
709 |
+ grl[[chrom]] <- gr |
|
710 |
+ next |
|
711 |
+ } |
|
712 |
+ post.index <- index+1 |
|
713 |
+ # Collapse runs of consecutive small segments so they all point to the first segment after the run
|
714 |
+ if (length(index)>1) { |
|
715 |
+ for (i1 in (length(index)-1):1) { |
|
716 |
+ if (index[i1]==index[i1+1]-1) { |
|
717 |
+ post.index[i1] <- post.index[i1+1] |
|
718 |
+ } |
|
719 |
+ } |
|
720 |
+ } |
|
721 |
+ # Keep only the run of small segments at the start of the chromosome
|
722 |
+ mask <- post.index==post.index[1] |
|
723 |
+ post.index <- post.index[mask] |
|
724 |
+ index <- index[mask] |
|
725 |
+ if (index[1]==1) { |
|
726 |
+ gr$state[index] <- gr$state[post.index] |
|
727 |
+ ## Redo segmentation |
|
728 |
+ red.gr.list <- GRangesList() |
|
729 |
+ for (state in levels(gr$state)) { |
|
730 |
+ red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
731 |
+ mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr)) |
|
732 |
+ if (length(red.gr)>0) { |
|
733 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
734 |
+ } |
|
735 |
+ } |
|
736 |
+ red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
737 |
+ gr <- red.gr |
|
738 |
+ seqlengths(gr) <- seqlengths(model$bins) |
|
739 |
+ } |
|
740 |
+ ## For the rest of the chromosome, assign the state of the preceding segment to segments smaller than min.seg.width
|
741 |
+ index <- which(width(gr) < min.seg.width) |
|
742 |
+ prev.index <- index-1 |
|
743 |
+ # Collapse runs of consecutive small segments so they all point to the segment before the run
|
744 |
+ if (length(index)>1) { |
|
745 |
+ for (i1 in 2:length(index)) { |
|
746 |
+ if (index[i1]==index[i1-1]+1) { |
|
747 |
+ prev.index[i1] <- prev.index[i1-1] |
|
748 |
+ } |
|
749 |
+ } |
|
750 |
+ } |
|
751 |
+ gr$state[index] <- gr$state[prev.index] |
|
752 |
+ |
|
753 |
+ grl[[chrom]] <- gr |
|
754 |
+ } |
|
755 |
+ |
|
756 |
+ ## Redo segmentation |
|
757 |
+ gr <- unlist(grl) |
|
758 |
+ red.gr.list <- GRangesList() |
|
759 |
+ for (state in levels(gr$state)) { |
|
760 |
+ red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
761 |
+ mcols(red.gr)$state <- rep(factor(state, levels=levels(gr$state)),length(red.gr)) |
|
762 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
763 |
+ } |
|
764 |
+ red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
765 |
+ model$segments <- red.gr |
|
766 |
+ seqlengths(model$segments) <- seqlengths(model$bins) |
|
767 |
+ |
|
768 |
+ return(model) |
|
769 |
+} |
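A hedged usage sketch of filterSegments(): 'model' is assumed to be an existing aneuHMM or aneuBiHMM object (e.g. a findCNVs() or findSCEs() result), and the 2 Mb cutoff is an arbitrary illustration:

model.filtered <- filterSegments(model, min.seg.width=2e6)   # absorb segments shorter than 2 Mb into their neighbors
model.default  <- filterSegments(model)                       # NULL default: twice the bin size of 'model'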
|
770 |
+ |
|
771 |
+getSCEcoordinates <- function(model) { |
|
772 |
+ |
|
773 |
+ segments <- model$segments |
|
774 |
+ segments <- segments[segments$state != 'zero-inflation zero-inflation' & segments$state != 'nullsomy nullsomy' & !grepl('multisomy',segments$state)] |
|
775 |
+ segments.split <- split(segments, seqnames(segments)) |
|
776 |
+ sce <- GRanges() |
|
777 |
+ for (chrom in names(segments.split)) { |
|
778 |
+ segments <- segments.split[[chrom]] |
|
779 |
+ if (length(segments)>1) { |
|
780 |
+ balanced.state <- segments$state %in% paste(state.labels.SCE, state.labels.SCE) |
|
781 |
+ for (i1 in 2:length(balanced.state)) { |
|
782 |
+ if(balanced.state[i1] != balanced.state[i1-1]) { |
|
783 |
+ sce <- c(sce, segments[i1]) |
|
784 |
+ } |
|
785 |
+ } |
|
786 |
+ } |
|
787 |
+ } |
|
788 |
+ mcols(sce) <- NULL |
|
789 |
+ |
|
790 |
+ return(sce) |
|
791 |
+} |
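For orientation, a hedged usage sketch of this (unexported) helper; 'bi.model' is assumed to be an aneuBiHMM fitted with findSCEs():

sce <- getSCEcoordinates(bi.model)   # GRanges of segments where the state flips between balanced and unbalanced strand states
length(sce)                          # number of putative sister-chromatid-exchange events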
|
792 |
+ |
... | ... |
@@ -6,17 +6,27 @@ NULL |
6 | 6 |
# ======================================================= |
7 | 7 |
# Some global variables that can be used in all functions |
8 | 8 |
# ======================================================= |
9 |
-state.labels <- factor(c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")) |
|
10 |
-state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dpois','dbinom')) |
|
11 |
-coordinate.names <- c("chrom","start","end") |
|
12 |
-binned.data.names <- c(coordinate.names,"reads") |
|
13 | 9 |
class.univariate.hmm <- "aneuHMM" |
14 | 10 |
class.multivariate.hmm <- "aneuMultiHMM" |
11 |
+class.bivariate.hmm <- "aneuBiHMM" |
|
15 | 12 |
class.hmm.list <- "aneufinder.hmm.list" |
16 |
-state.colors <- c("mapped"="gray68","nullsomy"="gray90","null-mixed"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black") |
|
13 |
+ |
|
14 |
+state.labels <- factor(c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")) |
|
15 |
+dependent.states.mask <- state.labels %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
16 |
+state.distributions <- factor(c('delta','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dbinom')) |
|
17 |
+state.colors <- c("mapped"="gray68","nullsomy"="gray90","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black") |
|
17 | 18 |
get.state.labels <- function() { return(state.labels) } |
18 | 19 |
get.state.colors <- function() { return(state.colors[as.character(state.labels)]) } |
19 | 20 |
|
21 |
+state.labels.SCE <- factor(c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy"), levels=c("zero-inflation","nullsomy","monosomy","disomy","trisomy","tetrasomy","multisomy")) |
|
22 |
+multiplicity.SCE <- c(0,0,1,2,3,4,5) |
|
23 |
+names(multiplicity.SCE) <- state.labels.SCE |
|
24 |
+dependent.states.mask.SCE <- state.labels.SCE %in% c("monosomy","disomy","trisomy","tetrasomy","multisomy") |
|
25 |
+state.distributions.SCE <- factor(c('delta','dgeom','dnbinom','dnbinom','dnbinom','dnbinom','dnbinom'), levels=c('delta','dgeom','dnbinom','dbinom')) |
|
26 |
+state.colors.SCE <- c("mapped"="gray68","zero-inflation"="gray90","nullsomy"="gray30","monosomy"="gold3","disomy"="springgreen3","trisomy"="orangered1","tetrasomy"="orangered4","multisomy"="purple3","total"="black") |
|
27 |
+get.state.labels.SCE <- function() { return(state.labels.SCE) } |
|
28 |
+get.state.colors.SCE <- function() { return(state.colors[as.character(state.labels.SCE)]) } |
|
29 |
+ |
|
20 | 30 |
# ============================================================================ |
21 | 31 |
# Functions for a Negative Binomial to transform (mean,variance)<->(size,prob) |
22 | 32 |
# ============================================================================ |
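The conversion functions themselves sit outside this hunk; for reference, the standard moment relations for R's dnbinom(size, prob) parameterization, which these helpers presumably implement, are sketched below (assumption: variance > mean):

nb.size     <- function(mean, variance) { mean^2 / (variance - mean) }   # size from (mean, variance)
nb.prob     <- function(mean, variance) { mean / variance }              # prob from (mean, variance)
nb.mean     <- function(size, prob) { size * (1-prob) / prob }           # mean from (size, prob)
nb.variance <- function(size, prob) { size * (1-prob) / prob^2 }         # variance from (size, prob)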
... | ... |
@@ -65,6 +65,28 @@ plot.aneuHMM <- function(x, type='chromosomes', ...) { |
65 | 65 |
} |
66 | 66 |
} |
67 | 67 |
|
68 |
+#' Plotting function for \code{\link{aneuBiHMM}} objects |
|
69 |
+#' |
|
70 |
+#' Make different types of plots for \code{\link{aneuBiHMM}} objects. |
|
71 |
+#' |
|
72 |
+#' @param x An \code{\link{aneuBiHMM}} object. |
|
73 |
+#' @param type Type of the plot, one of \code{c('chromosomes', 'histogram')}. You can also specify the type with an integer number. |
|
74 |
+#' \describe{ |
|
75 |
+#' \item{\code{chromosomes}}{An overview of all chromosomes with their CNV state.}
|
76 |
+#' \item{\code{histogram}}{A histogram of binned read counts with fitted mixture distribution.} |
|
77 |
+#' } |
|
78 |
+#' @param ... Additional arguments for the different plot types. |
|
79 |
+#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object. |
|
80 |
+#' @method plot aneuBiHMM |
|
81 |
+#' @export |
|
82 |
+plot.aneuBiHMM <- function(x, type='chromosomes', ...) { |
|
83 |
+ if (type == 'chromosomes' | type==1) { |
|
84 |
+ plotChromosomes(x, both.strands=TRUE, ...) |
|
85 |
+ } else if (type == 'histogram' | type==2) { |
|
86 |
+ plotBivariateHistograms(x, ...) |
|
87 |
+ } |
|
88 |
+} |
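A hedged usage sketch of the new plot method ('bi.model' is assumed to be an aneuBiHMM object, e.g. from findSCEs()):

plot(bi.model, type='chromosomes')   # or plot(bi.model, 1): genome-wide overview, both strands
plot(bi.model, type='histogram')     # or plot(bi.model, 2): read-count histogram with fitted distributions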
|
89 |
+ |
|
68 | 90 |
# ============================================================ |
69 | 91 |
# Plot a read histogram |
70 | 92 |
# ============================================================ |
... | ... |
@@ -75,7 +97,7 @@ plot.aneuHMM <- function(x, type='chromosomes', ...) { |
75 | 97 |
#' @param binned.data A \code{\link{binned.data}} object containing binned read counts in meta-column 'reads'. |
76 | 98 |
#' @param chromosome,start,end Plot the histogram only for the specified chromosome, start and end position. |
77 | 99 |
#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object. |
78 |
-plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, end=NULL) { |
|
100 |
+plotBinnedDataHistogram <- function(binned.data, strand='*', chromosome=NULL, start=NULL, end=NULL) { |
|
79 | 101 |
|
80 | 102 |
# ----------------------------------------- |
81 | 103 |
# Get right x limit |
... | ... |
@@ -109,10 +131,17 @@ plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, en |
109 | 131 |
selectmask <- selectmask & selectend |
110 | 132 |
} |
111 | 133 |
} |
134 |
+ if (strand=='+') { |
|
135 |
+ select <- 'preads' |
|
136 |
+ } else if (strand=='-') { |
|
137 |
+ select <- 'mreads' |
|
138 |
+ } else if (strand=='*') { |
|
139 |
+ select <- 'reads' |
|
140 |
+ } |
|
112 | 141 |
if (length(which(selectmask)) != length(binned.data$reads)) { |
113 |
- reads <- binned.data$reads[selectmask] |
|
142 |
+ reads <- mcols(binned.data)[,select][as.logical(selectmask)] |
|
114 | 143 |
} else { |
115 |
- reads <- binned.data$reads |
|
144 |
+ reads <- mcols(binned.data)[,select] |
|
116 | 145 |
} |
117 | 146 |
|
118 | 147 |
# Find the x limits |
... | ... |
@@ -127,6 +156,57 @@ plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, en |
127 | 156 |
|
128 | 157 |
} |
129 | 158 |
|
159 |
+# ================================================================= |
|
160 |
+# Plot a read histogram with univariate fits for a bivariate HMM |
|
161 |
+# ================================================================= |
|
162 |
+plotBivariateHistograms <- function(bihmm) { |
|
163 |
+ |
|
164 |
+ binned.data <- bihmm$bins |
|
165 |
+ ## Stack the two strands |
|
166 |
+ binned.data.minus <- binned.data |
|
167 |
+ strand(binned.data.minus) <- '-' |
|
168 |
+ binned.data.minus$reads <- binned.data.minus$mreads |
|
169 |
+ binned.data.minus$reads.gc <- binned.data.minus$mreads.gc |
|
170 |
+ binned.data.plus <- binned.data |
|
171 |
+ strand(binned.data.plus) <- '+' |
|
172 |
+ binned.data.plus$reads <- binned.data.plus$preads |
|
173 |
+ binned.data.plus$reads.gc <- binned.data.plus$preads.gc |
|
174 |
+ binned.data.stacked <- c(binned.data.minus, binned.data.plus) |
|
175 |
+ mask.attributes <- c(grep('complexity', names(attributes(binned.data)), value=T), 'spikyness', 'shannon.entropy') |
|
176 |
+ attributes(binned.data.stacked)[mask.attributes] <- attributes(binned.data)[mask.attributes] |
|
177 |
+ |
|
178 |
+ ## Make fake uni.hmm and plot |
|
179 |
+ strand <- 'minus' |
|
180 |
+ uni.hmm <- list() |
|
181 |
+ uni.hmm$ID <- bihmm$ID |
|
182 |
+ uni.hmm$bins <- binned.data.stacked |
|
183 |
+ uni.hmm$bins$state <- uni.hmm$bins$pstate |
|
184 |
+ uni.hmm$bins$pstate <- NULL |
|
185 |
+ uni.hmm$bins$mstate <- NULL |
|
186 |
+ uni.hmm$weights <- bihmm$weights.univariate[[strand]] |
|
187 |
+ uni.hmm$distributions <- bihmm$distributions[[strand]] |
|
188 |
+ class(uni.hmm) <- class.univariate.hmm |
|
189 |
+ ggplts <- plotUnivariateHistogram(uni.hmm, strand='*') |
|
190 |
+ |
|
191 |
+# ## Make fake uni.hmm and plot |
|
192 |
+# ggplts <- list() |
|
193 |
+# for (strand in c('plus','minus')) { |
|
194 |
+# uni.hmm <- list() |
|
195 |
+# uni.hmm$ID <- bihmm$ID |
|
196 |
+# uni.hmm$bins <- bihmm$bins |
|
197 |
+# uni.hmm$bins$state <- uni.hmm$bins$pstate |
|
198 |
+# uni.hmm$bins$pstate <- NULL |
|
199 |
+# uni.hmm$bins$mstate <- NULL |
|
200 |
+# uni.hmm$weights <- bihmm$weights.univariate[[strand]] |
|
201 |
+# uni.hmm$distributions <- bihmm$distributions[[strand]] |
|
202 |
+# class(uni.hmm) <- class.univariate.hmm |
|
203 |
+# ggplts[[strand]] <- plotUnivariateHistogram(uni.hmm, strand=strand) |
|
204 |
+# } |
|
205 |
+ |
|
206 |
+ return(ggplts) |
|
207 |
+ |
|
208 |
+} |
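The strand-stacking trick above duplicates the bins, assigns each copy a strand, and copies the strand-specific counts into the generic 'reads' column so the univariate plotting code can be reused. A minimal sketch on a toy GRanges (column names follow the ones used above; the values are invented):

library(GenomicRanges)
bins  <- GRanges('chr1', IRanges(start=c(1, 1001), width=1000), mreads=c(3, 5), preads=c(4, 2))
minus <- bins; strand(minus) <- '-'; minus$reads <- minus$mreads
plus  <- bins; strand(plus)  <- '+'; plus$reads  <- plus$preads
stacked <- c(minus, plus)   # 4 ranges with one 'reads' column, ready for the univariate histogram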
|
209 |
+ |
|
130 | 210 |
# ============================================================ |
131 | 211 |
# Plot a read histogram with univariate fits |
132 | 212 |
# ============================================================ |
... | ... |
@@ -138,7 +218,7 @@ plotBinnedDataHistogram <- function(binned.data, chromosome=NULL, start=NULL, en |
138 | 218 |
#' @param state Plot the histogram only for the specified CNV-state. |
139 | 219 |
#' @param chromosome,start,end Plot the histogram only for the specified chromosome, start and end position. |
140 | 220 |
#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object. |
141 |
-plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NULL, end=NULL) { |
|
221 |
+plotUnivariateHistogram <- function(model, state=NULL, strand='*', chromosome=NULL, start=NULL, end=NULL) { |
|
142 | 222 |
|
143 | 223 |
# ----------------------------------------- |
144 | 224 |
# Get right x limit |
... | ... |
@@ -175,8 +255,19 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU |
175 | 255 |
if (!is.null(state)) { |
176 | 256 |
selectmask <- selectmask & model$bins$state==state |
177 | 257 |
} |
178 |
- reads <- model$bins$reads[selectmask] |
|
179 |
- states <- model$bins$state[selectmask] |
|
258 |
+ if (strand=='+' | strand=='plus') { |
|
259 |
+ select <- 'preads' |
|
260 |
+ } else if (strand=='-' | strand=='minus') { |
|
261 |
+ select <- 'mreads' |
|
262 |
+ } else if (strand=='*' | strand=='both') { |
|
263 |
+ select <- 'reads' |
|
264 |
+ } |
|
265 |
+ if (length(which(selectmask)) != length(model$bins$reads)) { |
|
266 |
+ reads <- mcols(model$bins)[,select][as.logical(selectmask)] |
|
267 |
+ } else { |
|
268 |
+ reads <- mcols(model$bins)[,select] |
|
269 |
+ } |
|
270 |
+ states <- model$bins$state[as.logical(selectmask)] |
|
180 | 271 |
if (length(which(selectmask)) != length(model$bins)) { |
181 | 272 |
if (!is.null(state)) { |
182 | 273 |
weights <- rep(NA, length(levels(model$bins$state))) |
... | ... |
@@ -185,6 +276,8 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU |
185 | 276 |
weights[istate] <- length(which(states==levels(model$bins$state)[istate==names(weights)])) |
186 | 277 |
} |
187 | 278 |
weights <- weights / length(states) |
279 |
+ } else { |
|
280 |
+ weights <- model$weights |
|
188 | 281 |
} |
189 | 282 |
} else { |
190 | 283 |
if (!is.null(model$weights)) { |
... | ... |
@@ -242,7 +335,7 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU |
242 | 335 |
if (is.null(state)) { |
243 | 336 |
ggplt <- ggplt + geom_line(aes_string(x='x', y='density', group='state', col='state'), data=distributions) |
244 | 337 |
} else { |
245 |
- ggplt <- ggplt + geom_line(aes_string(x='x', y='density', group='state', size='state'), data=distributions[distributions$state==state,]) |
|
338 |
+ ggplt <- ggplt + geom_line(aes_string(x='x', y='density', group='state', col='state'), data=distributions[distributions$state==state,]) |
|
246 | 339 |
} |
247 | 340 |
|
248 | 341 |
# Make legend and colors correct |
... | ... |
@@ -268,7 +361,7 @@ plotUnivariateHistogram <- function(model, state=NULL, chromosome=NULL, start=NU |
268 | 361 |
#' @param model A \code{\link{aneuHMM}} object or \code{\link{binned.data}}. |
269 | 362 |
#' @param file A PDF file where the plot will be saved. |
270 | 363 |
#' @return A \code{\link[ggplot2:ggplot]{ggplot}} object or \code{NULL} if a file was specified. |
271 |
-plotChromosomes <- function(model, file=NULL) { |
|
364 |
+plotChromosomes <- function(model, both.strands=FALSE, file=NULL) { |
|
272 | 365 |
|
273 | 366 |
if (class(model)=='GRanges') { |
274 | 367 |
binned.data <- model |
... | ... |
@@ -276,11 +369,11 @@ plotChromosomes <- function(model, file=NULL) { |
276 | 369 |
model$ID <- '' |
277 | 370 |
model$bins <- binned.data |
278 | 371 |
model$qualityInfo <- list(shannon.entropy=qc.entropy(binned.data$reads), spikyness=qc.spikyness(binned.data$reads), complexity=attr(binned.data, 'complexity.preseqR')) |
279 |
- plot.chromosomes.univariate(model, file=file) |
|
372 |
+ plot.chromosomes(model, both.strands=both.strands, file=file) |
|
280 | 373 |
} else if (class(model)==class.univariate.hmm) { |
281 |
- plot.chromosomes.univariate(model, file=file) |
|
282 |
- } else if (class(model)==class.multivariate.hmm) { |
|
283 |
- plot.chromosomes.bivariate(model, file=file) |
|
374 |
+ plot.chromosomes(model, both.strands=both.strands, file=file) |
|
375 |
+ } else if (class(model)==class.bivariate.hmm) { |
|
376 |
+ plot.chromosomes(model, both.strands=both.strands, percentages=FALSE, file=file) |
|
284 | 377 |
} |
285 | 378 |
|
286 | 379 |
} |
... | ... |
@@ -288,7 +381,7 @@ plotChromosomes <- function(model, file=NULL) { |
288 | 381 |
# ------------------------------------------------------------ |
289 | 382 |
# Plot state categorization for all chromosomes |
290 | 383 |
# ------------------------------------------------------------ |
291 |
-plot.chromosomes.univariate <- function(model, file=NULL) { |
|
384 |
+plot.chromosomes <- function(model, both.strands=FALSE, percentages=TRUE, file=NULL) { |
|
292 | 385 |
|
293 | 386 |
## Convert to GRanges |
294 | 387 |
gr <- model$bins |
... | ... |
@@ -297,12 +390,15 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
297 | 390 |
## Get some variables |
298 | 391 |
num.chroms <- length(levels(seqnames(gr))) |
299 | 392 |
maxseqlength <- max(seqlengths(gr)) |
300 |
- if (!is.null(model$distributions)) { |
|
301 |
- custom.xlim <- model$distributions['disomy','mu'] * 4 |
|
302 |
- } else { |
|
393 |
+# if (!is.null(model$distributions)) { |
|
394 |
+# custom.xlim <- model$distributions['disomy','mu'] * 4 |
|
395 |
+# } else { |
|
303 | 396 |
tab <- table(gr$reads) |
304 | 397 |
tab <- tab[names(tab)!='0'] |
305 | 398 |
custom.xlim <- as.numeric(names(tab)[which.max(tab)]) * 4 |
399 |
+# } |
|
400 |
+ if (both.strands) { |
|
401 |
+ custom.xlim <- custom.xlim / 2 |
|
306 | 402 |
} |
307 | 403 |
|
308 | 404 |
## Setup page |
... | ... |
@@ -324,11 +420,16 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
324 | 420 |
quality.string <- paste0('complexity = ',round(model$qualityInfo$complexity),', spikyness = ',round(model$qualityInfo$spikyness,2),', entropy = ',round(model$qualityInfo$shannon.entropy,2)) |
325 | 421 |
grid.text(quality.string, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:ncols), gp=gpar(fontsize=fs_x)) |
326 | 422 |
|
423 |
+ ## Get SCE coordinates |
|
424 |
+ if (both.strands) { |
|
425 |
+ scecoords <- getSCEcoordinates(model) |
|
426 |
+ } |
|
427 |
+ |
|
327 | 428 |
## Go through chromosomes and plot |
328 | 429 |
for (i1 in 1:num.chroms) { |
329 | 430 |
# Get the i,j matrix positions of the regions that contain this subplot |
330 | 431 |
matchidx <- as.data.frame(which(layout == i1+nrows.text*ncols, arr.ind = TRUE)) |
331 |
- if (!is.null(grl[[i1]]$state)) { |
|
432 |
+ if (!is.null(grl[[i1]]$state) & percentages) { |
|
332 | 433 |
# Percentage of chromosome in state |
333 | 434 |
tstate <- table(mcols(grl[[i1]])$state) |
334 | 435 |
pstate.all <- tstate / sum(tstate) |
... | ... |
@@ -345,10 +446,21 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
345 | 446 |
|
346 | 447 |
# Plot the read counts |
347 | 448 |
dfplot <- as.data.frame(grl[[i1]]) |
348 |
- # Set values to great for plotting to limit |
|
449 |
+ # Set values too big for plotting to limit |
|
349 | 450 |
dfplot$reads[dfplot$reads>=custom.xlim] <- custom.xlim |
350 | 451 |
dfplot.points <- dfplot[dfplot$reads>=custom.xlim,] |
351 | 452 |
dfplot.points$reads <- rep(custom.xlim, nrow(dfplot.points)) |
453 |
+ |
|
454 |
+ if (both.strands) { |
|
455 |
+ dfplot$mreads <- - dfplot$mreads # show minus-strand reads as negative values
|
456 |
+ dfplot$preads[dfplot$preads>=custom.xlim] <- custom.xlim |
|
457 |
+ dfplot$mreads[dfplot$mreads<=-custom.xlim] <- -custom.xlim |
|
458 |
+ dfplot.points.plus <- dfplot[dfplot$preads>=custom.xlim,] |
|
459 |
+ dfplot.points.plus$reads <- rep(custom.xlim, nrow(dfplot.points.plus)) |
|
460 |
+ dfplot.points.minus <- dfplot[dfplot$mreads<=-custom.xlim,] |
|
461 |
+ dfplot.points.minus$reads <- rep(-custom.xlim, nrow(dfplot.points.minus)) |
|
462 |
+ } |
|
463 |
+ |
|
352 | 464 |
empty_theme <- theme(axis.line=element_blank(), |
353 | 465 |
axis.text.x=element_blank(), |
354 | 466 |
axis.text.y=element_blank(), |
... | ... |
@@ -362,106 +474,46 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
362 | 474 |
panel.grid.minor=element_blank(), |
363 | 475 |
plot.background=element_blank()) |
364 | 476 |
ggplt <- ggplot(dfplot, aes_string(x='start', y='reads')) # data |
365 |
- ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20') # chromosome backbone as simple rectangle |
|
366 | 477 |
if (!is.null(grl[[i1]]$state)) { |
367 |
- ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads', col='state'), size=0.2) # read count |
|
368 |
- ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads', col='state'), size=2, shape=21) # outliers |
|
478 |
+ if (both.strands) { |
|
479 |
+ ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='preads', col='pstate'), size=0.2) # read count |
|
480 |
+ ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='mreads', col='mstate'), size=0.2) # read count |
|
481 |
+ ggplt <- ggplt + geom_point(data=dfplot.points.plus, mapping=aes_string(x='start', y='reads', col='pstate'), size=5, shape=21) # outliers |
|
482 |
+ ggplt <- ggplt + geom_point(data=dfplot.points.minus, mapping=aes_string(x='start', y='reads', col='mstate'), size=5, shape=21) # outliers |
|
483 |
+ } else { |
|
484 |
+ ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads', col='state'), size=0.2) # read count |
|
485 |
+ ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads', col='state'), size=2, shape=21) # outliers |
|
486 |
+ } |
|
369 | 487 |
ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F) # do not drop levels if not present |
370 | 488 |
} else { |
371 |
- ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads'), size=0.2, col='gray20') # read count |
|
372 |
- ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads'), size=2, shape=21, col='gray20') # outliers |
|
489 |
+ if (both.strands) { |
|
490 |
+ ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='preads'), size=0.2, col='gray20') # read count |
|
491 |
+ ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='mreads'), size=0.2, col='gray20') # read count |
|
492 |
+ ggplt <- ggplt + geom_point(data=dfplot.points.plus, mapping=aes_string(x='start', y='reads'), size=5, shape=21, col='gray20') # outliers |
|
493 |
+ ggplt <- ggplt + geom_point(data=dfplot.points.minus, mapping=aes_string(x='start', y='reads'), size=5, shape=21, col='gray20') # outliers |
|
494 |
+ } else { |
|
495 |
+ ggplt <- ggplt + geom_linerange(aes_string(ymin=0, ymax='reads'), size=0.2, col='gray20') # read count |
|
496 |
+ ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes_string(x='start', y='reads'), size=2, shape=21, col='gray20') # outliers |
|
497 |
+ } |
|
498 |
+ } |
|
499 |
+ if (both.strands) { |
|
500 |
+ ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim, ymax=0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20') # chromosome backbone as simple rectangle |
|
501 |
+ } else { |
|
502 |
+ ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20') # chromosome backbone as simple rectangle |
|
503 |
+ } |
|
504 |
+ if (both.strands) { |
|
505 |
+ dfsce <- as.data.frame(scecoords[seqnames(scecoords)==names(grl)[i1]]) |
|
506 |
+ if (nrow(dfsce)>0) { |
|
507 |
+ ggplt <- ggplt + geom_segment(data=dfsce, aes(x=start, xend=start), y=-custom.xlim, yend=-0.5*custom.xlim, arrow=arrow()) |
|
508 |
+ } |
|
373 | 509 |
} |
374 | 510 |
ggplt <- ggplt + empty_theme # no axes whatsoever |
375 | 511 |
ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2)) # chromosome names |
376 |
- ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim) # set x- and y-limits |
|
377 |
- ggplt <- ggplt + coord_flip() |
|
378 |
- suppressWarnings( |
|
379 |
- print(ggplt, vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col)) |
|
380 |
- ) |
|
381 |
- |
|
382 |
- } |
|
383 |
- if (!is.null(file)) { |
|
384 |
- d & |