... | ... |
@@ -32,7 +32,7 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
32 | 32 |
if (check.logical(use.gc.corrected.reads)!=0) stop("argument 'use.gc.corrected.reads' expects a logical (TRUE or FALSE)") |
33 | 33 |
if (check.strand(strand)!=0) stop("argument 'strand' expects either '+', '-' or '*'") |
34 | 34 |
|
35 |
- war <- NULL |
|
35 |
+ warlist <- list() |
|
36 | 36 |
if (is.null(eps.try)) eps.try <- eps |
37 | 37 |
|
38 | 38 |
## Assign variables |
... | ... |
@@ -48,17 +48,26 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
48 | 48 |
select <- 'reads' |
49 | 49 |
} |
50 | 50 |
if (use.gc.corrected.reads) { |
51 |
- if (paste0(select,'.gc') %in% names(mcols(binned.data))) { |
|
52 |
- select <- paste0(select,'.gc') |
|
51 |
+ if (!(paste0(select,'.gc') %in% names(mcols(binned.data)))) { |
|
52 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction.")) |
|
53 |
+ } else if (any(is.na(mcols(binned.data)[,paste0(select,'.gc')]))) { |
|
54 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because there are NAs. GC-correction may not be reliable. Continuing without GC-correction.")) |
|
53 | 55 |
} else { |
54 |
- warning("Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction.") |
|
56 |
+ select <- paste0(select,'.gc') |
|
55 | 57 |
} |
56 | 58 |
} |
57 | 59 |
reads <- mcols(binned.data)[,select] |
58 | 60 |
|
59 | 61 |
# Check if there are reads in the data, otherwise HMM will blow up |
60 | 62 |
if (!any(reads!=0)) { |
61 |
- stop("All reads in data are zero. No HMM done.") |
|
63 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All reads in data are zero. No HMM done.")) |
|
64 |
+ ## Make return object |
|
65 |
+ result <- list() |
|
66 |
+ class(result) <- class.univariate.hmm |
|
67 |
+ result$ID <- ID |
|
68 |
+ result$bins <- binned.data |
|
69 |
+ result$warnings <- warlist |
|
70 |
+ return(result) |
|
62 | 71 |
} |
63 | 72 |
|
64 | 73 |
# Filter high reads out, makes HMM faster |
... | ... |
@@ -141,7 +150,7 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
141 | 150 |
hmm$eps <- eps.try |
142 | 151 |
if (num.trials > 1) { |
143 | 152 |
if (hmm$loglik.delta > hmm$eps) { |
144 |
- warning("HMM did not converge in trial run ",i_try,"!\n") |
|
153 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n")) |
|
145 | 154 |
} |
146 | 155 |
# Store model in list |
147 | 156 |
hmm$reads <- NULL |
... | ... |
@@ -152,6 +161,10 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
152 | 161 |
break |
153 | 162 |
} |
154 | 163 |
init <- 'random' |
164 |
+ } else if (num.trials == 1) { |
|
165 |
+ if (hmm$loglik.delta > eps) { |
|
166 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n")) |
|
167 |
+ } |
|
155 | 168 |
} |
156 | 169 |
} |
157 | 170 |
|
... | ... |
@@ -197,88 +210,89 @@ univariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max |
197 | 210 |
|
198 | 211 |
### Make return object ### |
199 | 212 |
result <- list() |
213 |
+ class(result) <- class.univariate.hmm |
|
200 | 214 |
result$ID <- ID |
201 |
- ## Bin coordinates and states ### |
|
202 | 215 |
result$bins <- binned.data |
203 |
- result$bins$state <- state.labels[hmm$states] |
|
204 |
- ## Segmentation |
|
205 |
- cat("Making segmentation ...") |
|
206 |
- ptm <- proc.time() |
|
207 |
- gr <- result$bins |
|
208 |
- red.gr.list <- GRangesList() |
|
209 |
- for (state in state.labels) { |
|
210 |
- red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
211 |
- mcols(red.gr)$state <- rep(factor(state, levels=levels(state.labels)),length(red.gr)) |
|
212 |
- red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
213 |
- } |
|
214 |
- red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
215 |
- result$segments <- red.gr |
|
216 |
- seqlengths(result$segments) <- seqlengths(binned.data) |
|
217 |
- time <- proc.time() - ptm |
|
218 |
- cat(paste0(" ",round(time[3],2),"s\n")) |
|
219 |
- ## Parameters |
|
220 |
- # Weights |
|
221 |
- result$weights <- hmm$weights |
|
222 |
- names(result$weights) <- state.labels |
|
223 |
- # Transition matrices |
|
224 |
- transitionProbs <- matrix(hmm$A, ncol=hmm$num.states) |
|
225 |
- rownames(transitionProbs) <- state.labels |
|
226 |
- colnames(transitionProbs) <- state.labels |
|
227 |
- result$transitionProbs <- transitionProbs |
|
228 |
- transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states) |
|
229 |
- rownames(transitionProbs.initial) <- state.labels |
|
230 |
- colnames(transitionProbs.initial) <- state.labels |
|
231 |
- result$transitionProbs.initial <- transitionProbs.initial |
|
232 |
- # Initial probs |
|
233 |
- result$startProbs <- hmm$proba |
|
234 |
- names(result$startProbs) <- paste0("P(",state.labels,")") |
|
235 |
- result$startProbs.initial <- hmm$proba.initial |
|
236 |
- names(result$startProbs.initial) <- paste0("P(",state.labels,")") |
|
237 |
- # Distributions |
|
238 |
- distributions <- data.frame() |
|
239 |
- distributions.initial <- data.frame() |
|
240 |
- for (idistr in 1:length(hmm$distr.type)) { |
|
241 |
- distr <- levels(state.distributions)[hmm$distr.type[idistr]] |
|
242 |
- if (distr == 'dnbinom') { |
|
243 |
- distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
244 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
245 |
- } else if (distr == 'dgeom') { |
|
246 |
- distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], lambda=NA, mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr]))) |
|
247 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], lambda=NA, mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr]))) |
|
248 |
- } else if (distr == 'delta') { |
|
249 |
- distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
250 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
251 |
- } else if (distr == 'dpois') { |
|
252 |
- distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda[idistr], mu=hmm$lambda[idistr], variance=hmm$lambda[idistr])) |
|
253 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda.initial[idistr], mu=hmm$lambda.initial[idistr], variance=hmm$lambda.initial[idistr])) |
|
254 |
- } else if (distr == 'dbinom') { |
|
255 |
- distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
256 |
- distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
216 |
+ ## Check for errors |
|
217 |
+ if (hmm$error == 0) { |
|
218 |
+ ## Bin coordinates and states ### |
|
219 |
+ result$bins$state <- state.labels[hmm$states] |
|
220 |
+ ## Segmentation |
|
221 |
+ cat("Making segmentation ...") |
|
222 |
+ ptm <- proc.time() |
|
223 |
+ gr <- result$bins |
|
224 |
+ red.gr.list <- GRangesList() |
|
225 |
+ for (state in state.labels) { |
|
226 |
+ red.gr <- GenomicRanges::reduce(gr[gr$state==state]) |
|
227 |
+ mcols(red.gr)$state <- rep(factor(state, levels=levels(state.labels)),length(red.gr)) |
|
228 |
+ if (length(red.gr) > 0) { |
|
229 |
+ red.gr.list[[length(red.gr.list)+1]] <- red.gr |
|
257 | 230 |
} |
258 | 231 |
} |
259 |
- rownames(distributions) <- state.labels |
|
260 |
- rownames(distributions.initial) <- state.labels |
|
261 |
- result$distributions <- distributions |
|
262 |
- result$distributions.initial <- distributions |
|
263 |
- ## Convergence info |
|
264 |
- convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec) |
|
265 |
- result$convergenceInfo <- convergenceInfo |
|
266 |
- ## Add class |
|
267 |
- class(result) <- class.univariate.hmm |
|
268 |
- |
|
269 |
- ### Issue warnings ### |
|
270 |
- if (num.trials == 1) { |
|
271 |
- if (hmm$loglik.delta > eps) { |
|
272 |
- war <- warning("HMM did not converge!\n") |
|
232 |
+ red.gr <- GenomicRanges::sort(GenomicRanges::unlist(red.gr.list)) |
|
233 |
+ result$segments <- red.gr |
|
234 |
+ seqlengths(result$segments) <- seqlengths(binned.data) |
|
235 |
+ time <- proc.time() - ptm |
|
236 |
+ cat(paste0(" ",round(time[3],2),"s\n")) |
|
237 |
+ ## Parameters |
|
238 |
+ # Weights |
|
239 |
+ result$weights <- hmm$weights |
|
240 |
+ names(result$weights) <- state.labels |
|
241 |
+ # Transition matrices |
|
242 |
+ transitionProbs <- matrix(hmm$A, ncol=hmm$num.states) |
|
243 |
+ rownames(transitionProbs) <- state.labels |
|
244 |
+ colnames(transitionProbs) <- state.labels |
|
245 |
+ result$transitionProbs <- transitionProbs |
|
246 |
+ transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states) |
|
247 |
+ rownames(transitionProbs.initial) <- state.labels |
|
248 |
+ colnames(transitionProbs.initial) <- state.labels |
|
249 |
+ result$transitionProbs.initial <- transitionProbs.initial |
|
250 |
+ # Initial probs |
|
251 |
+ result$startProbs <- hmm$proba |
|
252 |
+ names(result$startProbs) <- paste0("P(",state.labels,")") |
|
253 |
+ result$startProbs.initial <- hmm$proba.initial |
|
254 |
+ names(result$startProbs.initial) <- paste0("P(",state.labels,")") |
|
255 |
+ # Distributions |
|
256 |
+ distributions <- data.frame() |
|
257 |
+ distributions.initial <- data.frame() |
|
258 |
+ for (idistr in 1:length(hmm$distr.type)) { |
|
259 |
+ distr <- levels(state.distributions)[hmm$distr.type[idistr]] |
|
260 |
+ if (distr == 'dnbinom') { |
|
261 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
262 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
263 |
+ } else if (distr == 'dgeom') { |
|
264 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], lambda=NA, mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr]))) |
|
265 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], lambda=NA, mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr]))) |
|
266 |
+ } else if (distr == 'delta') { |
|
267 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
268 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=NA, mu=0, variance=0)) |
|
269 |
+ } else if (distr == 'dpois') { |
|
270 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda[idistr], mu=hmm$lambda[idistr], variance=hmm$lambda[idistr])) |
|
271 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, lambda=hmm$lambda.initial[idistr], mu=hmm$lambda.initial[idistr], variance=hmm$lambda.initial[idistr])) |
|
272 |
+ } else if (distr == 'dbinom') { |
|
273 |
+ distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], lambda=NA, mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr]))) |
|
274 |
+ distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], lambda=NA, mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr]))) |
|
275 |
+ } |
|
276 |
+ } |
|
277 |
+ rownames(distributions) <- state.labels |
|
278 |
+ rownames(distributions.initial) <- state.labels |
|
279 |
+ result$distributions <- distributions |
|
280 |
+ result$distributions.initial <- distributions |
|
281 |
+ ## Convergence info |
|
282 |
+ convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error) |
|
283 |
+ result$convergenceInfo <- convergenceInfo |
|
284 |
+ ## GC correction |
|
285 |
+ result$gc.correction <- grepl('gc', select) |
|
286 |
+ } else if (hmm$error == 1) { |
|
287 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'read.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
|
288 |
+ } else if (hmm$error == 2) { |
|
289 |
+ warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library")) |
|
273 | 290 |
} |
274 |
- } |
|
275 |
- if (hmm$error == 1) { |
|
276 |
- stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem. Try again with a lower value for 'read.cutoff.quantile'.") |
|
277 |
- } else if (hmm$error == 2) { |
|
278 |
- stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.") |
|
279 |
- } |
|
280 | 291 |
|
281 |
- # Return results |
|
292 |
+ ## Issue warnings |
|
293 |
+ result$warnings <- warlist |
|
294 |
+ |
|
295 |
+ ## Return results |
|
282 | 296 |
return(result) |
283 | 297 |
} |
284 | 298 |
|
... | ... |
@@ -337,7 +351,7 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max. |
337 | 351 |
if (paste0(select,'.gc') %in% names(mcols(binned.data))) { |
338 | 352 |
select <- paste0(select,'.gc') |
339 | 353 |
} else { |
340 |
- warning("Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction.") |
|
354 |
+ warning(paste0("ID = ",ID,": Cannot use GC-corrected reads because they are not in the binned data. Continuing without GC-correction.")) |
|
341 | 355 |
} |
342 | 356 |
} |
343 | 357 |
reads <- matrix(c(mcols(models$plus$bins)[,paste0('p',select)], mcols(models$minus$bins)[,paste0('m',select)]), ncol=num.models, dimnames=list(bin=1:num.bins, strand=names(models))) |
... | ... |
@@ -486,12 +500,12 @@ bivariate.findCNVs <- function(binned.data, ID, eps=0.001, init="standard", max. |
486 | 500 |
### Check convergence ### |
487 | 501 |
war <- NULL |
488 | 502 |
if (hmm$loglik.delta > eps) { |
489 |
- war <- warning("HMM did not converge!\n") |
|
503 |
+ war <- warning(paste0("ID = ",ID,": HMM did not converge!\n")) |
|
490 | 504 |
} |
491 | 505 |
if (hmm$error == 1) { |
492 |
- stop("A nan occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your read counts for very high numbers, they could be the cause for this problem.") |
|
506 |
+ warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'read.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to.")) |
|
493 | 507 |
} else if (hmm$error == 2) { |
494 |
- stop("An error occurred during the Baum-Welch! Parameter estimation terminated prematurely.") |
|
508 |
+ warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library")) |
|
495 | 509 |
} |
496 | 510 |
|
497 | 511 |
### Make return object ### |
... | ... |
@@ -45,15 +45,19 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
45 | 45 |
} |
46 | 46 |
if (length(which(selectmask)) != length(model$bins$reads)) { |
47 | 47 |
reads <- model$bins$reads[selectmask] |
48 |
- states <- model$bins$state[selectmask] |
|
49 |
- weights <- rep(NA, 3) |
|
50 |
- weights[1] <- length(which(states=="zero-inflation")) |
|
51 |
- weights[2] <- length(which(states=="unmodified")) |
|
52 |
- weights[3] <- length(which(states=="modified")) |
|
53 |
- weights <- weights / length(states) |
|
48 |
+ if (!is.null(model$bins$state)) { |
|
49 |
+ states <- model$bins$state[selectmask] |
|
50 |
+ weights <- rep(NA, 3) |
|
51 |
+ weights[1] <- length(which(states=="zero-inflation")) |
|
52 |
+ weights[2] <- length(which(states=="unmodified")) |
|
53 |
+ weights[3] <- length(which(states=="modified")) |
|
54 |
+ weights <- weights / length(states) |
|
55 |
+ } |
|
54 | 56 |
} else { |
55 | 57 |
reads <- model$bins$reads |
56 |
- weights <- model$weights |
|
58 |
+ if (!is.null(model$weights)) { |
|
59 |
+ weights <- model$weights |
|
60 |
+ } |
|
57 | 61 |
} |
58 | 62 |
|
59 | 63 |
# Find the x limits |
... | ... |
@@ -64,6 +68,9 @@ plot.distribution <- function(model, state=NULL, chrom=NULL, start=NULL, end=NUL |
64 | 68 |
|
65 | 69 |
# Plot the histogram |
66 | 70 |
ggplt <- ggplot(data.frame(reads)) + geom_histogram(aes(x=reads, y=..density..), binwidth=1, color='black', fill='white') + coord_cartesian(xlim=c(0,rightxlim)) + theme_bw() + xlab("read count") |
71 |
+ if (is.null(model$weights)) { |
|
72 |
+ return(ggplt) |
|
73 |
+ } |
|
67 | 74 |
|
68 | 75 |
### Add fits to the histogram |
69 | 76 |
c.state.labels <- as.character(levels(model$bins$state)) |
... | ... |
@@ -144,7 +151,11 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
144 | 151 |
## Get some variables |
145 | 152 |
num.chroms <- length(levels(seqnames(gr))) |
146 | 153 |
maxseqlength <- max(seqlengths(gr)) |
147 |
- custom.xlim <- model$distributions['monosomy','mu'] * 10 |
|
154 |
+ if (!is.null(model$distributions)) { |
|
155 |
+ custom.xlim <- model$distributions['monosomy','mu'] * 10 |
|
156 |
+ } else { |
|
157 |
+ custom.xlim <- mean(model$bins$reads, trim=0.1) * 5 |
|
158 |
+ } |
|
148 | 159 |
|
149 | 160 |
## Setup page |
150 | 161 |
library(grid) |
... | ... |
@@ -165,15 +176,20 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
165 | 176 |
# Get the i,j matrix positions of the regions that contain this subplot |
166 | 177 |
matchidx <- as.data.frame(which(layout == i1+ncols, arr.ind = TRUE)) |
167 | 178 |
|
168 |
- # Percentage of chromosome in state |
|
169 |
- tstate <- table(mcols(grl[[i1]])$state) |
|
170 |
- pstate.all <- tstate / sum(tstate) |
|
171 |
- pstate <- round(pstate.all*100)[-1] # without 'nullsomy / unmapped' state |
|
172 |
- pstring <- apply(pstate, 1, function(x) { paste0(": ", x, "%") }) |
|
173 |
- pstring <- paste0(names(pstring), pstring) |
|
174 |
- pstring <- paste(pstring[which.max(pstate)], collapse="\n") |
|
175 |
- pstring2 <- round(pstate.all*100)[1] # only 'nullsomy / unmapped' |
|
176 |
- pstring2 <- paste0(names(pstring2), ": ", pstring2, "%") |
|
179 |
+ if (!is.null(grl[[i1]]$state)) { |
|
180 |
+ # Percentage of chromosome in state |
|
181 |
+ tstate <- table(mcols(grl[[i1]])$state) |
|
182 |
+ pstate.all <- tstate / sum(tstate) |
|
183 |
+ pstate <- round(pstate.all*100)[-1] # without 'nullsomy / unmapped' state |
|
184 |
+ pstring <- apply(pstate, 1, function(x) { paste0(": ", x, "%") }) |
|
185 |
+ pstring <- paste0(names(pstring), pstring) |
|
186 |
+ pstring <- paste(pstring[which.max(pstate)], collapse="\n") |
|
187 |
+ pstring2 <- round(pstate.all*100)[1] # only 'nullsomy / unmapped' |
|
188 |
+ pstring2 <- paste0(names(pstring2), ": ", pstring2, "%") |
|
189 |
+ } else { |
|
190 |
+ pstring <- '' |
|
191 |
+ pstring2 <- '' |
|
192 |
+ } |
|
177 | 193 |
|
178 | 194 |
# Plot the read counts |
179 | 195 |
dfplot <- as.data.frame(grl[[i1]]) |
... | ... |
@@ -194,10 +210,15 @@ plot.chromosomes.univariate <- function(model, file=NULL) { |
194 | 210 |
panel.grid.minor=element_blank(), |
195 | 211 |
plot.background=element_blank()) |
196 | 212 |
ggplt <- ggplot(dfplot, aes(x=start, y=reads)) # data |
197 |
- ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads, col=state), size=0.2) # read count |
|
198 | 213 |
ggplt <- ggplt + geom_rect(ymin=-0.05*custom.xlim-0.1*custom.xlim, ymax=-0.05*custom.xlim, xmin=0, mapping=aes(xmax=max(start)), col='white', fill='gray20') # chromosome backbone as simple rectangle |
199 |
- ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads, col=state), size=5, shape=21) # outliers |
|
200 |
- ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F) # do not drop levels if not present |
|
214 |
+ if (!is.null(grl[[i1]]$state)) { |
|
215 |
+ ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads, col=state), size=0.2) # read count |
|
216 |
+ ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads, col=state), size=5, shape=21) # outliers |
|
217 |
+ ggplt <- ggplt + scale_color_manual(values=state.colors, drop=F) # do not drop levels if not present |
|
218 |
+ } else { |
|
219 |
+ ggplt <- ggplt + geom_linerange(aes(ymin=0, ymax=reads), size=0.2, col='gray20') # read count |
|
220 |
+ ggplt <- ggplt + geom_point(data=dfplot.points, mapping=aes(x=start, y=reads), size=5, shape=21, col='gray20') # outliers |
|
221 |
+ } |
|
201 | 222 |
ggplt <- ggplt + empty_theme # no axes whatsoever |
202 | 223 |
ggplt <- ggplt + ylab(paste0(seqnames(grl[[i1]])[1], "\n", pstring, "\n", pstring2)) # chromosome names |
203 | 224 |
ggplt <- ggplt + xlim(0,maxseqlength) + ylim(-0.6*custom.xlim,custom.xlim) # set x- and y-limits |
... | ... |
@@ -307,7 +328,7 @@ plot.chromosomes.bivariate <- function(model, file=NULL) { |
307 | 328 |
# ------------------------------------------------------------ |
308 | 329 |
# Plot genome overview |
309 | 330 |
# ------------------------------------------------------------ |
310 |
-plot.genome.overview <- function(hmm.list, file, bp.per.cm=5e7, numCPU=1, chromosome=NULL) { |
|
331 |
+plot.genome.overview <- function(hmm.list, file, bp.per.cm=5e7, chromosome=NULL) { |
|
311 | 332 |
|
312 | 333 |
## Function definitions |
313 | 334 |
reformat <- function(x) { |
... | ... |
@@ -325,8 +346,7 @@ plot.genome.overview <- function(hmm.list, file, bp.per.cm=5e7, numCPU=1, chromo |
325 | 346 |
hmm.list <- loadHmmsFromFiles(hmm.list) |
326 | 347 |
|
327 | 348 |
## Load and transform to GRanges |
328 |
- temp <- hmmList2GRangesList(hmm.list, reduce=FALSE, numCPU=numCPU) |
|
329 |
- uni.hmm.grl <- temp$grl |
|
349 |
+ uni.hmm.grl <- lapply(hmm.list, '[[', 'bins') |
|
330 | 350 |
|
331 | 351 |
## Setup page |
332 | 352 |
library(grid) |
... | ... |
@@ -393,7 +413,7 @@ plot.genome.overview <- function(hmm.list, file, bp.per.cm=5e7, numCPU=1, chromo |
393 | 413 |
# ------------------------------------------------------------ |
394 | 414 |
# Plot genome summary |
395 | 415 |
# ------------------------------------------------------------ |
396 |
-plot.genome.summary <- function(hmm.list, file='aneufinder_genome_overview', numCPU=1) { |
|
416 |
+plot.genome.summary <- function(hmm.list, file='aneufinder_genome_overview') { |
|
397 | 417 |
|
398 | 418 |
## Function definitions |
399 | 419 |
reformat <- function(x) { |
... | ... |
@@ -415,9 +435,7 @@ plot.genome.summary <- function(hmm.list, file='aneufinder_genome_overview', num |
415 | 435 |
ymax <- length(hmm.list) |
416 | 436 |
|
417 | 437 |
## Load and transform to GRanges |
418 |
- cat('transforming to GRanges\n') |
|
419 |
- temp <- hmmList2GRangesList(hmm.list, reduce=TRUE, numCPU=numCPU) |
|
420 |
- uni.hmm.grl <- temp$grl |
|
438 |
+ uni.hmm.grl <- lapply(hmm.list, '[[', 'segments') |
|
421 | 439 |
|
422 | 440 |
## Process GRL for plotting |
423 | 441 |
flattened_gr <- flatGrl(uni.hmm.grl) |
... | ... |
@@ -501,22 +519,28 @@ plot.genome.summary <- function(hmm.list, file='aneufinder_genome_overview', num |
501 | 519 |
# ------------------------------------------------------------ |
502 | 520 |
# Plot a heatmap of chromosome state for multiple samples |
503 | 521 |
# ------------------------------------------------------------ |
504 |
-plot.chromosome.heatmap <- function(hmm.list, cluster=TRUE, numCPU=1) { |
|
522 |
+plot.chromosome.heatmap <- function(hmm.list, cluster=TRUE) { |
|
505 | 523 |
|
506 | 524 |
## Load the files |
507 | 525 |
hmm.list <- loadHmmsFromFiles(hmm.list) |
508 | 526 |
|
509 | 527 |
## Transform to GRanges in reduced representation |
510 |
- temp <- hmmList2GRangesList(hmm.list, reduce=T, numCPU=numCPU, consensus=F) |
|
511 |
- grlred <- temp$grl |
|
528 |
+ grlred <- GRangesList() |
|
529 |
+ IDlist <- list() |
|
530 |
+ for (hmm in hmm.list) { |
|
531 |
+ if (!is.null(hmm$segments)) { |
|
532 |
+ grlred[[length(grlred)+1]] <- hmm$segments |
|
533 |
+ IDlist[[length(IDlist)+1]] <- hmm$ID |
|
534 |
+ } |
|
535 |
+ } |
|
512 | 536 |
|
513 | 537 |
## Find the most frequent state (mfs) for each chromosome and sample |
514 | 538 |
cat("finding most frequent state for each sample and chromosome ...") |
515 | 539 |
grl.per.chrom <- lapply(grlred, function(x) { split(x, seqnames(x)) }) |
516 | 540 |
mfs.samples <- list() |
517 |
- for (i1 in 1:length(hmm.list)) { |
|
518 |
- mfs.samples[[hmm.list[[i1]]$ID]] <- lapply(grl.per.chrom[[i1]], function(x) { tab <- aggregate(width(x), by=list(state=x$state), FUN="sum"); tab$state[which.max(tab$x)] }) |
|
519 |
- attr(mfs.samples[[hmm.list[[i1]]$ID]], "varname") <- 'chromosome' |
|
541 |
+ for (i1 in 1:length(grlred)) { |
|
542 |
+ mfs.samples[[IDlist[[i1]]]] <- lapply(grl.per.chrom[[i1]], function(x) { tab <- aggregate(width(x), by=list(state=x$state), FUN="sum"); tab$state[which.max(tab$x)] }) |
|
543 |
+ attr(mfs.samples[[IDlist[[i1]]]], "varname") <- 'chromosome' |
|
520 | 544 |
} |
521 | 545 |
attr(mfs.samples, "varname") <- 'sample' |
522 | 546 |
cat(" done\n") |
... | ... |
@@ -524,7 +548,7 @@ plot.chromosome.heatmap <- function(hmm.list, cluster=TRUE, numCPU=1) { |
524 | 548 |
## Transform to data.frame |
525 | 549 |
# Long format |
526 | 550 |
df <- reshape2::melt(mfs.samples, value.name='state') |
527 |
- df$state <- factor(df$state, levels=levels(hmm.list[[1]]$states)) |
|
551 |
+ df$state <- factor(df$state, levels=levels(hmm.list[[1]]$bins$state)) |
|
528 | 552 |
df$sample <- factor(df$sample, levels=unique(df$sample)) |
529 | 553 |
df$chromosome <- factor(df$chromosome, levels=unique(df$chromosome)) |
530 | 554 |
|
... | ... |
@@ -534,7 +558,7 @@ plot.chromosome.heatmap <- function(hmm.list, cluster=TRUE, numCPU=1) { |
534 | 558 |
df.wide <- reshape2::dcast(df, sample ~ chromosome, value.var='state', factorsAsStrings=F) |
535 | 559 |
# Correct strings to factors |
536 | 560 |
for (col in 2:ncol(df.wide)) { |
537 |
- df.wide[,col] <- factor(df.wide[,col], levels=levels(hmm.list[[1]]$states)) |
|
561 |
+ df.wide[,col] <- factor(df.wide[,col], levels=levels(hmm.list[[1]]$bins$state)) |
|
538 | 562 |
} |
539 | 563 |
# Cluster |
540 | 564 |
hc <- hclust(dist(data.matrix(df.wide[-1]))) |
... | ... |
@@ -542,7 +566,7 @@ plot.chromosome.heatmap <- function(hmm.list, cluster=TRUE, numCPU=1) { |
542 | 566 |
mfs.samples.clustered <- mfs.samples[hc$order] |
543 | 567 |
attr(mfs.samples.clustered, "varname") <- 'sample' |
544 | 568 |
df <- reshape2::melt(mfs.samples.clustered, value.name='state') |
545 |
- df$state <- factor(df$state, levels=levels(hmm.list[[1]]$states)) |
|
569 |
+ df$state <- factor(df$state, levels=levels(hmm.list[[1]]$bins$state)) |
|
546 | 570 |
df$sample <- factor(df$sample, levels=unique(df$sample)) |
547 | 571 |
df$chromosome <- factor(df$chromosome, levels=unique(df$chromosome)) |
548 | 572 |
} |
... | ... |
@@ -556,17 +580,36 @@ plot.chromosome.heatmap <- function(hmm.list, cluster=TRUE, numCPU=1) { |
556 | 580 |
# ------------------------------------------------------------ |
557 | 581 |
# Plot a clustered heatmap of state calls |
558 | 582 |
# ------------------------------------------------------------ |
559 |
-plot.genome.heatmap <- function(hmm.list, file=NULL, cluster=TRUE, numCPU=1) { |
|
583 |
+plot.genome.heatmap <- function(hmm.list, file=NULL, cluster=TRUE) { |
|
560 | 584 |
|
561 | 585 |
## Load the files |
562 | 586 |
hmm.list <- loadHmmsFromFiles(hmm.list) |
563 | 587 |
|
588 |
+ ## Get segments from list |
|
589 |
+ grlred <- GRangesList() |
|
590 |
+ IDlist <- list() |
|
591 |
+ for (hmm in hmm.list) { |
|
592 |
+ if (!is.null(hmm$segments)) { |
|
593 |
+ grlred[[length(grlred)+1]] <- hmm$segments |
|
594 |
+ IDlist[[length(IDlist)+1]] <- hmm$ID |
|
595 |
+ } |
|
596 |
+ } |
|
597 |
+ |
|
598 |
+ ## Clustering |
|
564 | 599 |
if (cluster) { |
565 |
- # Transform to GRanges in reduced representation |
|
566 |
- temp <- hmmList2GRangesList(hmm.list, reduce=TRUE, numCPU=numCPU, consensus=T) |
|
567 |
- grlred <- temp$grl |
|
568 |
- consensus <- temp$consensus |
|
569 |
- constates <- temp$constates |
|
600 |
+ cat("making consensus template ...") |
|
601 |
+ suppressPackageStartupMessages(consensus <- disjoin(unlist(grlred))) |
|
602 |
+ constates <- matrix(NA, ncol=length(grlred), nrow=length(consensus)) |
|
603 |
+ for (i1 in 1:length(grlred)) { |
|
604 |
+ grred <- grlred[[i1]] |
|
605 |
+ splt <- split(grred, mcols(grred)$state) |
|
606 |
+ mind <- as.matrix(findOverlaps(consensus, splt, select='first')) |
|
607 |
+ constates[,i1] <- mind |
|
608 |
+ } |
|
609 |
+ meanstates <- apply(constates, 1, mean, na.rm=T) |
|
610 |
+ mcols(consensus)$meanstate <- meanstates |
|
611 |
+ cat(" done\n") |
|
612 |
+ |
|
570 | 613 |
# Distance measure |
571 | 614 |
cat("clustering ...") |
572 | 615 |
constates[is.na(constates)] <- 0 |
... | ... |
@@ -576,11 +619,8 @@ plot.genome.heatmap <- function(hmm.list, file=NULL, cluster=TRUE, numCPU=1) { |
576 | 619 |
hc <- hclust(dist) |
577 | 620 |
# Reorder samples |
578 | 621 |
grlred <- grlred[hc$order] |
622 |
+ IDlist <- IDlist[hc$order] |
|
579 | 623 |
cat(" done\n") |
580 |
- } else { |
|
581 |
- # Transform to GRanges in reduced representation |
|
582 |
- temp <- hmmList2GRangesList(hmm.list, reduce=TRUE, numCPU=numCPU, consensus=F) |
|
583 |
- grlred <- temp$grl |
|
584 | 624 |
} |
585 | 625 |
|
586 | 626 |
cat("transforming coordinates ...") |
... | ... |
@@ -601,7 +641,7 @@ plot.genome.heatmap <- function(hmm.list, file=NULL, cluster=TRUE, numCPU=1) { |
601 | 641 |
# Data |
602 | 642 |
df <- list() |
603 | 643 |
for (i1 in 1:length(grlred)) { |
604 |
- df[[length(df)+1]] <- data.frame(start=grlred[[i1]]$start.genome, end=grlred[[i1]]$end.genome, seqnames=seqnames(grlred[[i1]]), sample=hmm.list[[i1]]$ID, state=grlred[[i1]]$state) |
|
644 |
+ df[[length(df)+1]] <- data.frame(start=grlred[[i1]]$start.genome, end=grlred[[i1]]$end.genome, seqnames=seqnames(grlred[[i1]]), sample=IDlist[[i1]], state=grlred[[i1]]$state) |
|
605 | 645 |
} |
606 | 646 |
df <- do.call(rbind, df) |
607 | 647 |
# Chromosome lines |
... | ... |
@@ -13,7 +13,7 @@ void R_univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, d |
13 | 13 |
// Define logging level |
14 | 14 |
// FILE* pFile = fopen("chromStar.log", "w"); |
15 | 15 |
// Output2FILE::Stream() = pFile; |
16 |
- FILELog::ReportingLevel() = FILELog::FromString("ERROR"); |
|
16 |
+ FILELog::ReportingLevel() = FILELog::FromString("NONE"); |
|
17 | 17 |
// FILELog::ReportingLevel() = FILELog::FromString("DEBUG2"); |
18 | 18 |
|
19 | 19 |
// // Parallelization settings |
... | ... |
@@ -8,7 +8,7 @@ |
8 | 8 |
|
9 | 9 |
// inline std::string NowTime(); |
10 | 10 |
|
11 |
-enum TLogLevel {logERROR, logWARNING, logINFO, logITERATION, logDEBUG, logDEBUG1, logDEBUG2, logDEBUG3, logDEBUG4}; |
|
11 |
+enum TLogLevel {logNONE, logERROR, logWARNING, logINFO, logITERATION, logDEBUG, logDEBUG1, logDEBUG2, logDEBUG3, logDEBUG4}; |
|
12 | 12 |
|
13 | 13 |
template <typename T> |
14 | 14 |
class Log |
... | ... |
@@ -62,7 +62,7 @@ TLogLevel& Log<T>::ReportingLevel() |
62 | 62 |
template <typename T> |
63 | 63 |
std::string Log<T>::ToString(TLogLevel level) |
64 | 64 |
{ |
65 |
- static const char* const buffer[] = {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}; |
|
65 |
+ static const char* const buffer[] = {"NONE", "ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}; |
|
66 | 66 |
return buffer[level]; |
67 | 67 |
} |
68 | 68 |
|
... | ... |
@@ -87,6 +87,8 @@ TLogLevel Log<T>::FromString(const std::string& level) |
87 | 87 |
return logWARNING; |
88 | 88 |
if (level == "ERROR") |
89 | 89 |
return logERROR; |
90 |
+ if (level == "NONE") |
|
91 |
+ return logNONE; |
|
90 | 92 |
Log<T>().Get(logWARNING) << "Unknown logging level '" << level << "'. Using INFO level as default."; |
91 | 93 |
return logINFO; |
92 | 94 |
} |